From 67e467a11f62ff64ad219dc6aa5459e132c79d14 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 7 Mar 2026 14:30:41 +0530 Subject: netfs: Fix kernel BUG in netfs_limit_iter() for ITER_KVEC iterators When a process crashes and the kernel writes a core dump to a 9P filesystem, __kernel_write() creates an ITER_KVEC iterator. This iterator reaches netfs_limit_iter() via netfs_unbuffered_write(), which only handles ITER_FOLIOQ, ITER_BVEC and ITER_XARRAY iterator types, hitting the BUG() for any other type. Fix this by adding netfs_limit_kvec() following the same pattern as netfs_limit_bvec(), since both kvec and bvec are simple segment arrays with pointer and length fields. Dispatch it from netfs_limit_iter() when the iterator type is ITER_KVEC. Fixes: cae932d3aee5 ("netfs: Add func to calculate pagecount/size-limited span of an iterator") Reported-by: syzbot+9c058f0d63475adc97fd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9c058f0d63475adc97fd Tested-by: syzbot+9c058f0d63475adc97fd@syzkaller.appspotmail.com Signed-off-by: Deepanshu Kartikey Link: https://patch.msgid.link/20260307090041.359870-1-kartikey406@gmail.com Signed-off-by: Christian Brauner --- fs/netfs/iterator.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'fs') diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 72a435e5fc6d..154a14bb2d7f 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -142,6 +142,47 @@ static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, return min(span, max_size); } +/* + * Select the span of a kvec iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. Returns the size of the span + * in bytes. + */ +static size_t netfs_limit_kvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct kvec *kvecs = iter->kvec; + unsigned int nkv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_kvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nkv && skip) { + len = kvecs[ix].iov_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nkv) { + len = min3(n, kvecs[ix].iov_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + /* * Select the span of an xarray iterator we're going to use. Limit it by both * maximum size and maximum number of segments. It is assumed that segments @@ -245,6 +286,8 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, return netfs_limit_bvec(iter, start_offset, max_size, max_segs); if (iov_iter_is_xarray(iter)) return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + if (iov_iter_is_kvec(iter)) + return netfs_limit_kvec(iter, start_offset, max_size, max_segs); BUG(); } EXPORT_SYMBOL(netfs_limit_iter); -- cgit v1.2.3 From e9075e420a1eb3b52c60f3b95893a55e77419ce8 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 7 Mar 2026 10:09:47 +0530 Subject: netfs: Fix NULL pointer dereference in netfs_unbuffered_write() on retry When a write subrequest is marked NETFS_SREQ_NEED_RETRY, the retry path in netfs_unbuffered_write() unconditionally calls stream->prepare_write() without checking if it is NULL. 
Filesystems such as 9P do not set the prepare_write operation, so stream->prepare_write remains NULL. When get_user_pages() fails with -EFAULT and the subrequest is flagged for retry, this results in a NULL pointer dereference at fs/netfs/direct_write.c:189. Fix this by mirroring the pattern already used in write_retry.c: if stream->prepare_write is NULL, skip renegotiation and directly reissue the subrequest via netfs_reissue_write(), which handles iterator reset, IN_PROGRESS flag, stats update and reissue internally. Fixes: a0b4c7a49137 ("netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence") Reported-by: syzbot+7227db0fbac9f348dba0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7227db0fbac9f348dba0 Signed-off-by: Deepanshu Kartikey Link: https://patch.msgid.link/20260307043947.347092-1-kartikey406@gmail.com Tested-by: syzbot+7227db0fbac9f348dba0@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/netfs/direct_write.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index dd1451bf7543..4d9760e36c11 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -186,10 +186,18 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) stream->sreq_max_segs = INT_MAX; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - stream->prepare_write(subreq); - __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_stat(&netfs_n_wh_retry_write_subreq); + if (stream->prepare_write) { + stream->prepare_write(subreq); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); + } else { + struct iov_iter source; + + netfs_reset_iter(subreq); + source = subreq->io_iter; + netfs_reissue_write(stream, subreq, &source); + } } netfs_unbuffered_write_done(wreq); -- cgit v1.2.3 From f8b8820a4a16c4ef673e90ded41e8348514e53f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Mar 2026 14:44:04 +0100 Subject: fs: clear I_DIRTY_TIME in sync_lazytime For file systems implementing ->sync_lazytime, I_DIRTY_TIME fails to get cleared in sync_lazytime, and might cause additional calls to sync_lazytime during inode deactivation. Use the same pattern as in __mark_inode_dirty to clear the flag under the inode lock. 
Fixes: 5cf06ea56ee6 ("fs: add a ->sync_lazytime method") Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260317134409.1691317-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7c75ed7e8979..d8dac1931595 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1711,6 +1711,19 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, } } +static bool __sync_lazytime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode_state_read(inode) & I_DIRTY_TIME)) { + spin_unlock(&inode->i_lock); + return false; + } + inode_state_clear(inode, I_DIRTY_TIME); + spin_unlock(&inode->i_lock); + inode->i_op->sync_lazytime(inode); + return true; +} + bool sync_lazytime(struct inode *inode) { if (!(inode_state_read_once(inode) & I_DIRTY_TIME)) @@ -1718,9 +1731,8 @@ bool sync_lazytime(struct inode *inode) trace_writeback_lazytime(inode); if (inode->i_op->sync_lazytime) - inode->i_op->sync_lazytime(inode); - else - mark_inode_dirty_sync(inode); + return __sync_lazytime(inode); + mark_inode_dirty_sync(inode); return true; } -- cgit v1.2.3 From 4f24a767e3d64a5f58c595b5c29b6063a201f1e3 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Tue, 10 Mar 2026 18:38:37 +0000 Subject: xfs: stop reclaim before pushing AIL during unmount The unmount sequence in xfs_unmount_flush_inodes() pushed the AIL while background reclaim and inodegc are still running. This is broken independently of any use-after-free issues - background reclaim and inodegc should not be running while the AIL is being pushed during unmount, as inodegc can dirty and insert inodes into the AIL during the flush, and background reclaim can race to abort and free dirty inodes. Reorder xfs_unmount_flush_inodes() to stop inodegc and cancel background reclaim before pushing the AIL. Stop inodegc before cancelling m_reclaim_work because the inodegc worker can re-queue m_reclaim_work via xfs_inodegc_set_reclaimable. Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary") Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_mount.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9c295abd0a0a..ef1ea8a1238c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -608,8 +608,9 @@ xfs_unmount_check( * have been retrying in the background. This will prevent never-ending * retries in AIL pushing from hanging the unmount. * - * Finally, we can push the AIL to clean all the remaining dirty objects, then - * reclaim the remaining inodes that are still in memory at this point in time. + * Stop inodegc and background reclaim before pushing the AIL so that they + * are not running while the AIL is being flushed. Then push the AIL to + * clean all the remaining dirty objects and reclaim the remaining inodes. 
*/ static void xfs_unmount_flush_inodes( @@ -621,9 +622,9 @@ xfs_unmount_flush_inodes( xfs_set_unmounting(mp); - xfs_ail_push_all_sync(mp->m_ail); xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_ail_push_all_sync(mp->m_ail); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); xfs_healthmon_unmount(mp); -- cgit v1.2.3 From 79ef34ec0554ec04bdbafafbc9836423734e1bd6 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Tue, 10 Mar 2026 18:38:38 +0000 Subject: xfs: avoid dereferencing log items after push callbacks After xfsaild_push_item() calls iop_push(), the log item may have been freed if the AIL lock was dropped during the push. Background inode reclaim or the dquot shrinker can free the log item while the AIL lock is not held, and the tracepoints in the switch statement dereference the log item after iop_push() returns. Fix this by capturing the log item type, flags, and LSN before calling xfsaild_push_item(), and introducing a new xfs_ail_push_class trace event class that takes these pre-captured values and the ailp pointer instead of the log item pointer. Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary") Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trace.h | 36 ++++++++++++++++++++++++++++++++---- fs/xfs/xfs_trans_ail.c | 26 +++++++++++++++++++------- 2 files changed, 51 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 813e5a9f57eb..0e994b3f768f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -56,6 +56,7 @@ #include struct xfs_agf; +struct xfs_ail; struct xfs_alloc_arg; struct xfs_attr_list_context; struct xfs_buf_log_item; @@ -1650,16 +1651,43 @@ TRACE_EVENT(xfs_log_force, DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) -DEFINE_LOG_ITEM_EVENT(xfs_ail_push); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); -DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); -DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort); DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort); +DECLARE_EVENT_CLASS(xfs_ail_push_class, + TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), + TP_ARGS(ailp, type, flags, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint, type) + __field(unsigned long, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = ailp->ail_log->l_mp->m_super->s_dev; + __entry->type = type; + __entry->flags = flags; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_PUSH_EVENT(name) \ +DEFINE_EVENT(xfs_ail_push_class, name, \ + TP_PROTO(struct xfs_ail *ailp, uint type, unsigned long flags, xfs_lsn_t lsn), \ + TP_ARGS(ailp, type, flags, lsn)) +DEFINE_AIL_PUSH_EVENT(xfs_ail_push); +DEFINE_AIL_PUSH_EVENT(xfs_ail_pinned); +DEFINE_AIL_PUSH_EVENT(xfs_ail_locked); +DEFINE_AIL_PUSH_EVENT(xfs_ail_flushing); + DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t 
old_lsn, xfs_lsn_t new_lsn), TP_ARGS(lip, old_lsn, new_lsn), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 923729af4206..63266d31b514 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -365,6 +365,12 @@ xfsaild_resubmit_item( return XFS_ITEM_SUCCESS; } +/* + * Push a single log item from the AIL. + * + * @lip may have been released and freed by the time this function returns, + * so callers must not dereference the log item afterwards. + */ static inline uint xfsaild_push_item( struct xfs_ail *ailp, @@ -505,7 +511,10 @@ xfsaild_push( lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) { - int lock_result; + int lock_result; + uint type = lip->li_type; + unsigned long flags = lip->li_flags; + xfs_lsn_t item_lsn = lip->li_lsn; if (test_bit(XFS_LI_FLUSHING, &lip->li_flags)) goto next_item; @@ -514,14 +523,17 @@ xfsaild_push( * Note that iop_push may unlock and reacquire the AIL lock. We * rely on the AIL cursor implementation to be able to deal with * the dropped lock. + * + * The log item may have been freed by the push, so it must not + * be accessed or dereferenced below this line. */ lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); - trace_xfs_ail_push(lip); + trace_xfs_ail_push(ailp, type, flags, item_lsn); - ailp->ail_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = item_lsn; break; case XFS_ITEM_FLUSHING: @@ -537,22 +549,22 @@ xfsaild_push( * AIL is being flushed. */ XFS_STATS_INC(mp, xs_push_ail_flushing); - trace_xfs_ail_flushing(lip); + trace_xfs_ail_flushing(ailp, type, flags, item_lsn); flushing++; - ailp->ail_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = item_lsn; break; case XFS_ITEM_PINNED: XFS_STATS_INC(mp, xs_push_ail_pinned); - trace_xfs_ail_pinned(lip); + trace_xfs_ail_pinned(ailp, type, flags, item_lsn); stuck++; ailp->ail_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(mp, xs_push_ail_locked); - trace_xfs_ail_locked(lip); + trace_xfs_ail_locked(ailp, type, flags, item_lsn); stuck++; break; -- cgit v1.2.3 From 394d70b86fae9fe865e7e6d9540b7696f73aa9b6 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Tue, 10 Mar 2026 18:38:39 +0000 Subject: xfs: save ailp before dropping the AIL lock in push callbacks In xfs_inode_item_push() and xfs_qm_dquot_logitem_push(), the AIL lock is dropped to perform buffer IO. Once the cluster buffer no longer protects the log item from reclaim, the log item may be freed by background reclaim or the dquot shrinker. The subsequent spin_lock() call dereferences lip->li_ailp, which is a use-after-free. Fix this by saving the ailp pointer in a local variable while the AIL lock is held and the log item is guaranteed to be valid. Reported-by: syzbot+652af2b3c5569c4ab63c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=652af2b3c5569c4ab63c Fixes: 90c60e164012 ("xfs: xfs_iflush() is no longer necessary") Cc: stable@vger.kernel.org # v5.9 Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Yuto Ohnuki Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_dquot_item.c | 9 +++++++-- fs/xfs/xfs_inode_item.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 491e2a7053a3..65a0e69c3d08 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -125,6 +125,7 @@ xfs_qm_dquot_logitem_push( struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_buf *bp; + struct xfs_ail *ailp = lip->li_ailp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -153,7 +154,7 @@ xfs_qm_dquot_logitem_push( goto out_unlock; } - spin_unlock(&lip->li_ailp->ail_lock); + spin_unlock(&ailp->ail_lock); error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { @@ -172,9 +173,13 @@ xfs_qm_dquot_logitem_push( rval = XFS_ITEM_FLUSHING; } xfs_buf_relse(bp); + /* + * The buffer no longer protects the log item from reclaim, so + * do not reference lip after this point. + */ out_relock_ail: - spin_lock(&lip->li_ailp->ail_lock); + spin_lock(&ailp->ail_lock); out_unlock: mutex_unlock(&dqp->q_qlock); return rval; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 8913036b8024..4ae81eed0442 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -746,6 +746,7 @@ xfs_inode_item_push( struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; struct xfs_buf *bp = lip->li_buf; + struct xfs_ail *ailp = lip->li_ailp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -771,7 +772,7 @@ xfs_inode_item_push( if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - spin_unlock(&lip->li_ailp->ail_lock); + spin_unlock(&ailp->ail_lock); /* * We need to hold a reference for flushing the cluster buffer as it may @@ -795,7 +796,11 @@ xfs_inode_item_push( rval = XFS_ITEM_LOCKED; } - spin_lock(&lip->li_ailp->ail_lock); + /* + * The buffer no longer protects the log item from reclaim, so + * do not reference lip after this point. + */ + spin_lock(&ailp->ail_lock); return rval; } -- cgit v1.2.3 From 7cac60947335f8d88a6390814840590a61134484 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Tue, 10 Mar 2026 18:38:40 +0000 Subject: xfs: refactor xfsaild_push loop into helper Factor the loop body of xfsaild_push() into a separate xfsaild_process_logitem() helper to improve readability. This is a pure code movement with no functional change. Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans_ail.c | 127 +++++++++++++++++++++++++++---------------------- 1 file changed, 69 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 63266d31b514..99a9bf3762b7 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -464,6 +464,74 @@ xfs_ail_calc_push_target( return target_lsn; } +static void +xfsaild_process_logitem( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + int *stuck, + int *flushing) +{ + struct xfs_mount *mp = ailp->ail_log->l_mp; + uint type = lip->li_type; + unsigned long flags = lip->li_flags; + xfs_lsn_t item_lsn = lip->li_lsn; + int lock_result; + + /* + * Note that iop_push may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. + * + * The log item may have been freed by the push, so it must not + * be accessed or dereferenced below this line. 
+ */ + lock_result = xfsaild_push_item(ailp, lip); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(mp, xs_push_ail_success); + trace_xfs_ail_push(ailp, type, flags, item_lsn); + + ailp->ail_last_pushed_lsn = item_lsn; + break; + + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already being + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to stop flushing just because lots + * of items are already being flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is being flushed. + */ + XFS_STATS_INC(mp, xs_push_ail_flushing); + trace_xfs_ail_flushing(ailp, type, flags, item_lsn); + + (*flushing)++; + ailp->ail_last_pushed_lsn = item_lsn; + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(mp, xs_push_ail_pinned); + trace_xfs_ail_pinned(ailp, type, flags, item_lsn); + + (*stuck)++; + ailp->ail_log_flush++; + break; + case XFS_ITEM_LOCKED: + XFS_STATS_INC(mp, xs_push_ail_locked); + trace_xfs_ail_locked(ailp, type, flags, item_lsn); + + (*stuck)++; + break; + default: + ASSERT(0); + break; + } +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -511,68 +579,11 @@ xfsaild_push( lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) { - int lock_result; - uint type = lip->li_type; - unsigned long flags = lip->li_flags; - xfs_lsn_t item_lsn = lip->li_lsn; if (test_bit(XFS_LI_FLUSHING, &lip->li_flags)) goto next_item; - /* - * Note that iop_push may unlock and reacquire the AIL lock. We - * rely on the AIL cursor implementation to be able to deal with - * the dropped lock. - * - * The log item may have been freed by the push, so it must not - * be accessed or dereferenced below this line. - */ - lock_result = xfsaild_push_item(ailp, lip); - switch (lock_result) { - case XFS_ITEM_SUCCESS: - XFS_STATS_INC(mp, xs_push_ail_success); - trace_xfs_ail_push(ailp, type, flags, item_lsn); - - ailp->ail_last_pushed_lsn = item_lsn; - break; - - case XFS_ITEM_FLUSHING: - /* - * The item or its backing buffer is already being - * flushed. The typical reason for that is that an - * inode buffer is locked because we already pushed the - * updates to it as part of inode clustering. - * - * We do not want to stop flushing just because lots - * of items are already being flushed, but we need to - * re-try the flushing relatively soon if most of the - * AIL is being flushed. - */ - XFS_STATS_INC(mp, xs_push_ail_flushing); - trace_xfs_ail_flushing(ailp, type, flags, item_lsn); - - flushing++; - ailp->ail_last_pushed_lsn = item_lsn; - break; - - case XFS_ITEM_PINNED: - XFS_STATS_INC(mp, xs_push_ail_pinned); - trace_xfs_ail_pinned(ailp, type, flags, item_lsn); - - stuck++; - ailp->ail_log_flush++; - break; - case XFS_ITEM_LOCKED: - XFS_STATS_INC(mp, xs_push_ail_locked); - trace_xfs_ail_locked(ailp, type, flags, item_lsn); - - stuck++; - break; - default: - ASSERT(0); - break; - } - + xfsaild_process_logitem(ailp, lip, &stuck, &flushing); count++; /* -- cgit v1.2.3 From 268378b6ad20569af0d1957992de1c8b16c6e900 Mon Sep 17 00:00:00 2001 From: hongao Date: Thu, 12 Mar 2026 20:10:26 +0800 Subject: xfs: scrub: unlock dquot before early return in quota scrub xchk_quota_item can return early after calling xchk_fblock_process_error. 
When that helper returns false, the function returned immediately without dropping dq->q_qlock, which can leave the dquot lock held and risk lock leaks or deadlocks in later quota operations. Fix this by unlocking dq->q_qlock before the early return. Signed-off-by: hongao Fixes: 7d1f0e167a067e ("xfs: check the ondisk space mapping behind a dquot") Cc: # v6.8 Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/quota.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 1d25bd5b892e..222812fe202c 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -171,8 +171,10 @@ xchk_quota_item( error = xchk_quota_item_bmap(sc, dq, offset); xchk_iunlock(sc, XFS_ILOCK_SHARED); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) { + mutex_unlock(&dq->q_qlock); return error; + } /* * Warn if the hard limits are larger than the fs. -- cgit v1.2.3 From 0c98524ab20193d8772cff9c71b00ad004fb1349 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 16:35:29 +0100 Subject: xfs: cleanup buftarg handling in XFS_IOC_VERIFY_MEDIA The newly added XFS_IOC_VERIFY_MEDIA is a bit unusual in how it handles buftarg fields. Update it to be more in line with other XFS code: - use btp->bt_dev instead of btp->bt_bdev->bd_dev to retrieve the device number for tracing - use btp->bt_logical_sectorsize instead of bdev_logical_block_size(btp->bt_bdev) to retrieve the logical sector size - compare the buftarg and not the bdev to see if there is a separate log buftarg Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_verify_media.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_verify_media.c b/fs/xfs/xfs_verify_media.c index 8bbd4ec567f8..5ead3976d511 100644 --- a/fs/xfs/xfs_verify_media.c +++ b/fs/xfs/xfs_verify_media.c @@ -183,10 +183,9 @@ xfs_verify_iosize( min_not_zero(SZ_1M, me->me_max_io_size); BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT); - ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev)); + ASSERT(BBTOB(bbcount) >= btp->bt_logical_sectorsize); - return clamp(iosize, bdev_logical_block_size(btp->bt_bdev), - BBTOB(bbcount)); + return clamp(iosize, btp->bt_logical_sectorsize, BBTOB(bbcount)); } /* Allocate as much memory as we can get for verification buffer. 
*/ @@ -218,8 +217,8 @@ xfs_verify_media_error( unsigned int bio_bbcount, blk_status_t bio_status) { - trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr, - bio_bbcount, bio_status); + trace_xfs_verify_media_error(mp, me, btp->bt_dev, daddr, bio_bbcount, + bio_status); /* * Pass any error, I/O or otherwise, up to the caller if we didn't @@ -280,7 +279,7 @@ xfs_verify_media( btp = mp->m_ddev_targp; break; case XFS_DEV_LOG: - if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev) + if (mp->m_logdev_targp != mp->m_ddev_targp) btp = mp->m_logdev_targp; break; case XFS_DEV_RT: @@ -299,7 +298,7 @@ xfs_verify_media( /* start and end have to be aligned to the lba size */ if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr), - bdev_logical_block_size(btp->bt_bdev))) + btp->bt_logical_sectorsize)) return -EINVAL; /* @@ -331,8 +330,7 @@ xfs_verify_media( if (!folio) return -ENOMEM; - trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount, - folio); + trace_xfs_verify_media(mp, me, btp->bt_dev, daddr, bbcount, folio); bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL); if (!bio) { @@ -400,7 +398,7 @@ out_folio: * an operational error. */ me->me_start_daddr = daddr; - trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev); + trace_xfs_verify_media_end(mp, me, btp->bt_dev); return 0; } -- cgit v1.2.3 From e5966096d0856d071269cb5928d6bc33342d2dfd Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Mon, 16 Mar 2026 18:41:58 +0000 Subject: xfs: annotate struct xfs_attr_list_context with __counted_by_ptr Add the `__counted_by_ptr` attribute to the `buffer` field of `struct xfs_attr_list_context`. This field is used to point to a buffer of size `bufsize`. The `buffer` field is assigned in: 1. `xfs_ioc_attr_list` in `fs/xfs/xfs_handle.c` 2. `xfs_xattr_list` in `fs/xfs/xfs_xattr.c` 3. `xfs_getparents` in `fs/xfs/xfs_handle.c` (implicitly initialized to NULL) In `xfs_ioc_attr_list`, `buffer` was assigned before `bufsize`. Reorder them to ensure `bufsize` is set before `buffer` is assigned, although no access happens between them. In `xfs_xattr_list`, `buffer` was assigned before `bufsize`. Reorder them to ensure `bufsize` is set before `buffer` is assigned. In `xfs_getparents`, `buffer` is NULL (from zero initialization) and remains NULL. `bufsize` is set to a non-zero value, but since `buffer` is NULL, no access occurs. In all cases, the pointer `buffer` is not accessed before `bufsize` is set. This patch was generated by CodeMender and reviewed by Bill Wendling. Tested by running xfstests. Signed-off-by: Bill Wendling Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_attr.h | 3 ++- fs/xfs/xfs_handle.c | 2 +- fs/xfs/xfs_xattr.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 8244305949de..67fd9c75ac3f 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -55,7 +55,8 @@ struct xfs_attr_list_context { struct xfs_trans *tp; struct xfs_inode *dp; /* inode */ struct xfs_attrlist_cursor_kern cursor; /* position in list */ - void *buffer; /* output buffer */ + /* output buffer */ + void *buffer __counted_by_ptr(bufsize); /* * Abort attribute list iteration if non-zero. 
Can be used to pass diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index d1291ca15239..2b8617ae7ec2 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -443,8 +443,8 @@ xfs_ioc_attr_list( context.dp = dp; context.resynch = 1; context.attr_filter = xfs_attr_filter(flags); - context.buffer = buffer; context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.buffer = buffer; context.firstu = context.bufsize; context.put_listent = xfs_ioc_attr_put_listent; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index a735f16d9cd8..544213067d59 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -332,8 +332,8 @@ xfs_vn_listxattr( memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); context.resynch = 1; - context.buffer = size ? data : NULL; context.bufsize = size; + context.buffer = size ? data : NULL; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; -- cgit v1.2.3 From bd71fb3fea9945987053968f028a948997cba8cc Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 17 Mar 2026 13:39:35 -0700 Subject: iomap: fix invalid folio access when i_blkbits differs from I/O granularity Commit aa35dd5cbc06 ("iomap: fix invalid folio access after folio_end_read()") partially addressed invalid folio access for folios without an ifs attached, but it did not handle the case where 1 << inode->i_blkbits matches the folio size but is different from the granularity used for the IO, which means IO can be submitted for less than the full folio for the !ifs case. In this case, the condition: if (*bytes_submitted == folio_len) ctx->cur_folio = NULL; in iomap_read_folio_iter() will not invalidate ctx->cur_folio, and iomap_read_end() will still be called on the folio even though the IO helper owns it and will finish the read on it. Fix this by unconditionally invalidating ctx->cur_folio for the !ifs case. Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Link: https://lore.kernel.org/linux-fsdevel/b3dfe271-4e3d-4922-b618-e73731242bca@wdc.com/ Fixes: b2f35ac4146d ("iomap: add caller-provided callbacks for read and readahead") Cc: stable@vger.kernel.org Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260317203935.830549-1-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 00f0efaf12b2..92a831cf4bf1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -514,6 +514,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; size_t folio_len = folio_size(folio); + struct iomap_folio_state *ifs; size_t poff, plen; loff_t pos_diff; int ret; @@ -525,7 +526,7 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, return iomap_iter_advance(iter, length); } - ifs_alloc(iter->inode, folio, iter->flags); + ifs = ifs_alloc(iter->inode, folio, iter->flags); length = min_t(loff_t, length, folio_len - offset_in_folio(folio, pos)); while (length) { @@ -560,11 +561,15 @@ static int iomap_read_folio_iter(struct iomap_iter *iter, *bytes_submitted += plen; /* - * If the entire folio has been read in by the IO - * helper, then the helper owns the folio and will end - * the read on it. 
+ * Hand off folio ownership to the IO helper when: + * 1) The entire folio has been submitted for IO, or + * 2) There is no ifs attached to the folio + * + * Case (2) occurs when 1 << i_blkbits matches the folio + * size but the underlying filesystem or block device + * uses a smaller granularity for IO. */ - if (*bytes_submitted == folio_len) + if (*bytes_submitted == folio_len || !ifs) ctx->cur_folio = NULL; } -- cgit v1.2.3 From 8fb6857f2f5e35179ff35e7d25358b9add681b7e Mon Sep 17 00:00:00 2001 From: Jori Koolstra Date: Wed, 18 Mar 2026 21:39:52 +0100 Subject: vfs: fix docstring of hash_name() The docstring of hash_name() is falsely reporting that it returns the component length, whereas it returns a pointer to the terminating '/' or NUL character in the pathname being resolved. Signed-off-by: Jori Koolstra Link: https://patch.msgid.link/20260318203953.5770-1-jkoolstra@xs4all.nl Signed-off-by: Christian Brauner --- fs/namei.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 58f715f7657e..9e5500dad14f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2437,8 +2437,14 @@ inside: EXPORT_SYMBOL(hashlen_string); /* - * Calculate the length and hash of the path component, and - * return the length as the result. + * hash_name - Calculate the length and hash of the path component + * @nd: the path resolution state + * @name: the pathname to read the component from + * @lastword: if the component fits in a single word, LAST_WORD_IS_DOT, + * LAST_WORD_IS_DOTDOT, or some other value depending on whether the + * component is '.', '..', or something else. Otherwise, @lastword is 0. + * + * Returns: a pointer to the terminating '/' or NUL character in @name. */ static inline const char *hash_name(struct nameidata *nd, const char *name, -- cgit v1.2.3 From 7e57523490cd2efb52b1ea97f2e0a74c0fb634cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Mar 2026 15:38:58 +0000 Subject: netfs: Fix read abandonment during retry Under certain circumstances, all the remaining subrequests from a read request will get abandoned during retry. The abandonment process expects the 'subreq' variable to be set to the place to start abandonment from, but it doesn't always have a useful value (it will be uninitialised on the first pass through the loop and it may point to a deleted subrequest on later passes). Fix the first jump to "abandon:" to set subreq to the start of the first subrequest expected to need retry (which, in this abandonment case, turned out unexpectedly to no longer have NEED_RETRY set). Also clear the subreq pointer after discarding superfluous retryable subrequests to cause an oops if we do try to access it. 
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://patch.msgid.link/3775287.1773848338@warthog.procyon.org.uk Reviewed-by: Paulo Alcantara (Red Hat) cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 7793ba5e3e8f..cca9ac43c077 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -93,8 +93,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) from->start, from->transferred, from->len); if (test_bit(NETFS_SREQ_FAILED, &from->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) { + subreq = from; goto abandon; + } list_for_each_continue(next, &stream->subrequests) { subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); @@ -178,6 +180,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (subreq == to) break; } + subreq = NULL; continue; } -- cgit v1.2.3 From 76f9377cd2ab7a9220c25d33940d9ca20d368172 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 19 Mar 2026 17:51:45 -0700 Subject: writeback: don't block sync for filesystems with no data integrity guarantees Add a SB_I_NO_DATA_INTEGRITY superblock flag for filesystems that cannot guarantee data persistence on sync (eg fuse). For superblocks with this flag set, sync kicks off writeback of dirty inodes but does not wait for the flusher threads to complete the writeback. This replaces the per-inode AS_NO_DATA_INTEGRITY mapping flag added in commit f9a49aa302a0 ("fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes()"). The flag belongs at the superblock level because data integrity is a filesystem-wide property, not a per-inode one. Having this flag at the superblock level also allows us to skip having to iterate every dirty inode in wait_sb_inodes() only to skip each inode individually. Prior to this commit, mappings with no data integrity guarantees skipped waiting on writeback completion but still waited on the flusher threads to finish initiating the writeback. Waiting on the flusher threads is unnecessary. This commit kicks off writeback but does not wait on the flusher threads. This change properly addresses a recent report [1] for a suspend-to-RAM hang seen on fuse-overlayfs that was caused by waiting on the flusher threads to finish: Workqueue: pm_fs_sync pm_fs_sync_work_fn Call Trace: __schedule+0x457/0x1720 schedule+0x27/0xd0 wb_wait_for_completion+0x97/0xe0 sync_inodes_sb+0xf8/0x2e0 __iterate_supers+0xdc/0x160 ksys_sync+0x43/0xb0 pm_fs_sync_work_fn+0x17/0xa0 process_one_work+0x193/0x350 worker_thread+0x1a1/0x310 kthread+0xfc/0x240 ret_from_fork+0x243/0x280 ret_from_fork_asm+0x1a/0x30 On fuse this is problematic because there are paths that may cause the flusher thread to block (eg if systemd freezes the user session cgroups first, which freezes the fuse daemon, before invoking the kernel suspend. The kernel suspend triggers ->write_node() which on fuse issues a synchronous setattr request, which cannot be processed since the daemon is frozen. Or if the daemon is buggy and cannot properly complete writeback, initiating writeback on a dirty folio already under writeback leads to writeback_get_folio() -> folio_prepare_writeback() -> unconditional wait on writeback to finish, which will cause a hang). 
This commit restores fuse to its prior behavior before tmp folios were removed, where sync was essentially a no-op. [1] https://lore.kernel.org/linux-fsdevel/CAJnrk1a-asuvfrbKXbEwwDSctvemF+6zfhdnuzO65Pt8HsFSRw@mail.gmail.com/T/#m632c4648e9cafc4239299887109ebd880ac6c5c1 Fixes: 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") Reported-by: John Cc: stable@vger.kernel.org Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20260320005145.2483161-2-joannelkoong@gmail.com Reviewed-by: Jan Kara Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 18 ++++++++++++------ fs/fuse/file.c | 4 +--- fs/fuse/inode.c | 1 + include/linux/fs/super_types.h | 1 + include/linux/pagemap.h | 11 ----------- 5 files changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8dac1931595..3c75ee025bda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2787,13 +2787,8 @@ static void wait_sb_inodes(struct super_block *sb) * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. - * - * If the mapping does not have data integrity semantics, - * there's no need to wait for the writeout to complete, as the - * mapping cannot guarantee that data is persistently stored. */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) || - mapping_no_data_integrity(mapping)) + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); @@ -2928,6 +2923,17 @@ void sync_inodes_sb(struct super_block *sb) */ if (bdi == &noop_backing_dev_info) return; + + /* + * If the superblock has SB_I_NO_DATA_INTEGRITY set, there's no need to + * wait for the writeout to complete, as the filesystem cannot guarantee + * data persistence on sync. Just kick off writeback and return. 
+ */ + if (sb->s_iflags & SB_I_NO_DATA_INTEGRITY) { + wakeup_flusher_threads_bdi(bdi, WB_REASON_SYNC); + return; + } + WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b1bb7153cb78..676fd9856bfb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3201,10 +3201,8 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; - if (fc->writeback_cache) { + if (fc->writeback_cache) mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); - mapping_set_no_data_integrity(&inode->i_data); - } INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e57b8af06be9..c795abe47a4f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1709,6 +1709,7 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; sb->s_iflags |= SB_I_NOIDMAP; + sb->s_iflags |= SB_I_NO_DATA_INTEGRITY; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index fa7638b81246..383050e7fdf5 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -338,5 +338,6 @@ struct super_block { #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ +#define SB_I_NO_DATA_INTEGRITY 0x00008000 /* fs cannot guarantee data persistence on sync */ #endif /* _LINUX_FS_SUPER_TYPES_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ec442af3f886..31a848485ad9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,7 +210,6 @@ enum mapping_flags { AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ - AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -346,16 +345,6 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline void mapping_set_no_data_integrity(struct address_space *mapping) -{ - set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - -static inline bool mapping_no_data_integrity(const struct address_space *mapping) -{ - return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); -} - static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From 9bbb19d21ded7d78645506f20d8c44895e3d0fb9 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Tue, 17 Mar 2026 08:52:01 +0900 Subject: ksmbd: do not expire session on binding failure When a multichannel session binding request fails (e.g. wrong password), the error path unconditionally sets sess->state = SMB2_SESSION_EXPIRED. However, during binding, sess points to the target session looked up via ksmbd_session_lookup_slowpath() -- which belongs to another connection's user. This allows a remote attacker to invalidate any active session by simply sending a binding request with a wrong password (DoS). 
Fix this by skipping session expiration when the failed request was a binding attempt, since the session does not belong to the current connection. The reference taken by ksmbd_session_lookup_slowpath() is still correctly released via ksmbd_user_session_put(). Cc: stable@vger.kernel.org Signed-off-by: Hyunwoo Kim Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 9c44e71e3c3b..8fa780e8efd0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1939,8 +1939,14 @@ out_err: if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) try_delay = true; - sess->last_active = jiffies; - sess->state = SMB2_SESSION_EXPIRED; + /* + * For binding requests, session belongs to another + * connection. Do not expire it. + */ + if (!(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + sess->last_active = jiffies; + sess->state = SMB2_SESSION_EXPIRED; + } ksmbd_user_session_put(sess); work->sess = NULL; if (try_delay) { -- cgit v1.2.3 From 48623ec358c1c600fa1e38368746f933e0f1a617 Mon Sep 17 00:00:00 2001 From: Werner Kasselman Date: Mon, 16 Mar 2026 11:38:47 +0000 Subject: ksmbd: fix use-after-free and NULL deref in smb_grant_oplock() smb_grant_oplock() has two issues in the oplock publication sequence: 1) opinfo is linked into ci->m_op_list (via opinfo_add) before add_lease_global_list() is called. If add_lease_global_list() fails (kmalloc returns NULL), the error path frees the opinfo via __free_opinfo() while it is still linked in ci->m_op_list. Concurrent m_op_list readers (opinfo_get_list, or direct iteration in smb_break_all_levII_oplock) dereference the freed node. 2) opinfo->o_fp is assigned after add_lease_global_list() publishes the opinfo on the global lease list. A concurrent find_same_lease_key() can walk the lease list and dereference opinfo->o_fp->f_ci while o_fp is still NULL. Fix by restructuring the publication sequence to eliminate post-publish failure: - Set opinfo->o_fp before any list publication (fixes NULL deref). - Preallocate lease_table via alloc_lease_table() before opinfo_add() so add_lease_global_list() becomes infallible after publication. - Keep the original m_op_list publication order (opinfo_add before lease list) so concurrent opens via same_client_has_lease() and opinfo_get_list() still see the in-flight grant. - Use opinfo_put() instead of __free_opinfo() on err_out so that the RCU-deferred free path is used. This also requires splitting add_lease_global_list() to take a preallocated lease_table and changing its return type from int to void, since it can no longer fail. 
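[Editorial aside, not part of the patch: the fix is an instance of a general "preallocate, then publish" pattern — perform every fallible allocation before the object becomes visible on any shared list, and let the publish step either consume the preallocation or discard it when an existing entry can be reused. A minimal stand-alone sketch with hypothetical names (lookup_or_publish(), table_list, get_table()); it is not ksmbd code.]

#include <stdlib.h>
#include <string.h>

struct table {
	struct table *next;
	char guid[16];
};

static struct table *table_list;	/* shared list; caller holds the list lock */

/*
 * The caller allocates 'prealloc' *before* anything is made visible, so this
 * function cannot fail once publication starts.  If a matching table already
 * exists, the preallocation is simply discarded.
 */
static struct table *lookup_or_publish(const char guid[16], struct table *prealloc)
{
	struct table *t;

	for (t = table_list; t; t = t->next) {
		if (!memcmp(t->guid, guid, 16)) {
			free(prealloc);		/* existing entry wins */
			return t;
		}
	}
	memcpy(prealloc->guid, guid, 16);
	prealloc->next = table_list;		/* publish: nothing below can fail */
	table_list = prealloc;
	return prealloc;
}

static struct table *get_table(const char guid[16])
{
	struct table *prealloc = calloc(1, sizeof(*prealloc)); /* fallible step first */
	struct table *t;

	if (!prealloc)
		return NULL;
	/* take the list lock here */
	t = lookup_or_publish(guid, prealloc);
	/* drop the list lock here */
	return t;
}
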
Fixes: 1dfd062caa16 ("ksmbd: fix use-after-free by using call_rcu() for oplock_info") Cc: stable@vger.kernel.org Signed-off-by: Werner Kasselman Reviewed-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/oplock.c | 72 +++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 393a4ae47cc1..9b2bb8764a80 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -82,11 +82,19 @@ static void lease_del_list(struct oplock_info *opinfo) spin_unlock(&lb->lb_lock); } -static void lb_add(struct lease_table *lb) +static struct lease_table *alloc_lease_table(struct oplock_info *opinfo) { - write_lock(&lease_list_lock); - list_add(&lb->l_entry, &lease_table_list); - write_unlock(&lease_list_lock); + struct lease_table *lb; + + lb = kmalloc_obj(struct lease_table, KSMBD_DEFAULT_GFP); + if (!lb) + return NULL; + + memcpy(lb->client_guid, opinfo->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + INIT_LIST_HEAD(&lb->lease_list); + spin_lock_init(&lb->lb_lock); + return lb; } static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) @@ -1042,34 +1050,27 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2) lease2->version = lease1->version; } -static int add_lease_global_list(struct oplock_info *opinfo) +static void add_lease_global_list(struct oplock_info *opinfo, + struct lease_table *new_lb) { struct lease_table *lb; - read_lock(&lease_list_lock); + write_lock(&lease_list_lock); list_for_each_entry(lb, &lease_table_list, l_entry) { if (!memcmp(lb->client_guid, opinfo->conn->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { opinfo->o_lease->l_lb = lb; lease_add_list(opinfo); - read_unlock(&lease_list_lock); - return 0; + write_unlock(&lease_list_lock); + kfree(new_lb); + return; } } - read_unlock(&lease_list_lock); - lb = kmalloc_obj(struct lease_table, KSMBD_DEFAULT_GFP); - if (!lb) - return -ENOMEM; - - memcpy(lb->client_guid, opinfo->conn->ClientGUID, - SMB2_CLIENT_GUID_SIZE); - INIT_LIST_HEAD(&lb->lease_list); - spin_lock_init(&lb->lb_lock); - opinfo->o_lease->l_lb = lb; + opinfo->o_lease->l_lb = new_lb; lease_add_list(opinfo); - lb_add(lb); - return 0; + list_add(&new_lb->l_entry, &lease_table_list); + write_unlock(&lease_list_lock); } static void set_oplock_level(struct oplock_info *opinfo, int level, @@ -1189,6 +1190,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, int err = 0; struct oplock_info *opinfo = NULL, *prev_opinfo = NULL; struct ksmbd_inode *ci = fp->f_ci; + struct lease_table *new_lb = NULL; bool prev_op_has_lease; __le32 prev_op_state = 0; @@ -1291,21 +1293,37 @@ set_lev: set_oplock_level(opinfo, req_op_level, lctx); out: - opinfo_count_inc(fp); - opinfo_add(opinfo, fp); - + /* + * Set o_fp before any publication so that concurrent readers + * (e.g. find_same_lease_key() on the lease list) that + * dereference opinfo->o_fp don't hit a NULL pointer. + * + * Keep the original publication order so concurrent opens can + * still observe the in-flight grant via ci->m_op_list, but make + * everything after opinfo_add() no-fail by preallocating any new + * lease_table first. 
+ */ + opinfo->o_fp = fp; if (opinfo->is_lease) { - err = add_lease_global_list(opinfo); - if (err) + new_lb = alloc_lease_table(opinfo); + if (!new_lb) { + err = -ENOMEM; goto err_out; + } } + opinfo_count_inc(fp); + opinfo_add(opinfo, fp); + + if (opinfo->is_lease) + add_lease_global_list(opinfo, new_lb); + rcu_assign_pointer(fp->f_opinfo, opinfo); - opinfo->o_fp = fp; return 0; err_out: - __free_opinfo(opinfo); + kfree(new_lb); + opinfo_put(opinfo); return err; } -- cgit v1.2.3 From 309b44ed684496ed3f9c5715d10b899338623512 Mon Sep 17 00:00:00 2001 From: Werner Kasselman Date: Tue, 17 Mar 2026 07:55:37 +0000 Subject: ksmbd: fix memory leaks and NULL deref in smb2_lock() smb2_lock() has three error handling issues after list_del() detaches smb_lock from lock_list at no_check_cl: 1) If vfs_lock_file() returns an unexpected error in the non-UNLOCK path, goto out leaks smb_lock and its flock because the out: handler only iterates lock_list and rollback_list, neither of which contains the detached smb_lock. 2) If vfs_lock_file() returns -ENOENT in the UNLOCK path, goto out leaks smb_lock and flock for the same reason. The error code returned to the dispatcher is also stale. 3) In the rollback path, smb_flock_init() can return NULL on allocation failure. The result is dereferenced unconditionally, causing a kernel NULL pointer dereference. Add a NULL check to prevent the crash and clean up the bookkeeping; the VFS lock itself cannot be rolled back without the allocation and will be released at file or connection teardown. Fix cases 1 and 2 by hoisting the locks_free_lock()/kfree() to before the if(!rc) check in the UNLOCK branch so all exit paths share one free site, and by freeing smb_lock and flock before goto out in the non-UNLOCK branch. Propagate the correct error code in both cases. Fix case 3 by wrapping the VFS unlock in an if(rlock) guard and adding a NULL check for locks_free_lock(rlock) in the shared cleanup. Found via call-graph analysis using sqry. 
Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Cc: stable@vger.kernel.org Suggested-by: ChenXiaoSong Signed-off-by: Werner Kasselman Reviewed-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 8fa780e8efd0..24f8d58493d0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -7592,14 +7592,15 @@ retry: rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL); skip: if (smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) { + locks_free_lock(flock); + kfree(smb_lock); if (!rc) { ksmbd_debug(SMB, "File unlocked\n"); } else if (rc == -ENOENT) { rsp->hdr.Status = STATUS_NOT_LOCKED; + err = rc; goto out; } - locks_free_lock(flock); - kfree(smb_lock); } else { if (rc == FILE_LOCK_DEFERRED) { void **argv; @@ -7668,6 +7669,9 @@ skip: spin_unlock(&work->conn->llist_lock); ksmbd_debug(SMB, "successful in taking lock\n"); } else { + locks_free_lock(flock); + kfree(smb_lock); + err = rc; goto out; } } @@ -7698,13 +7702,17 @@ out: struct file_lock *rlock = NULL; rlock = smb_flock_init(filp); - rlock->c.flc_type = F_UNLCK; - rlock->fl_start = smb_lock->start; - rlock->fl_end = smb_lock->end; + if (rlock) { + rlock->c.flc_type = F_UNLCK; + rlock->fl_start = smb_lock->start; + rlock->fl_end = smb_lock->end; - rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); - if (rc) - pr_err("rollback unlock fail : %d\n", rc); + rc = vfs_lock_file(filp, F_SETLK, rlock, NULL); + if (rc) + pr_err("rollback unlock fail : %d\n", rc); + } else { + pr_err("rollback unlock alloc failed\n"); + } list_del(&smb_lock->llist); spin_lock(&work->conn->llist_lock); @@ -7714,7 +7722,8 @@ out: spin_unlock(&work->conn->llist_lock); locks_free_lock(smb_lock->fl); - locks_free_lock(rlock); + if (rlock) + locks_free_lock(rlock); kfree(smb_lock); } out2: -- cgit v1.2.3 From 0e55f63dd08f09651d39e1b709a91705a8a0ddcb Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 13 Mar 2026 14:45:58 +0900 Subject: ksmbd: replace hardcoded hdr2_len with offsetof() in smb2_calc_max_out_buf_len() After this commit (e2b76ab8b5c9 "ksmbd: add support for read compound"), response buffer management was changed to use dynamic iov array. In the new design, smb2_calc_max_out_buf_len() expects the second argument (hdr2_len) to be the offset of ->Buffer field in the response structure, not a hardcoded magic number. Fix the remaining call sites to use the correct offsetof() value. 
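[Editorial aside, not part of the patch: a tiny stand-alone illustration — using a made-up response struct, not the real SMB2 layout — of why offsetof() on the Buffer field is preferable to a hard-coded header size: the value tracks the actual structure layout even if fields are added or alignment changes.]

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for an SMB2-style response with a trailing buffer. */
struct demo_rsp {
	uint16_t structure_size;
	uint16_t buffer_offset;
	uint32_t buffer_length;
	uint8_t  buffer[];		/* variable-length payload starts here */
};

/* Maximum payload that fits in 'outbuf_len' bytes of total response. */
static long max_payload(size_t outbuf_len)
{
	size_t hdr = offsetof(struct demo_rsp, buffer);	/* not a magic "8" */

	if (outbuf_len < hdr)
		return -1;
	return (long)(outbuf_len - hdr);
}

int main(void)
{
	printf("header bytes: %zu, payload for 512-byte buffer: %ld\n",
	       offsetof(struct demo_rsp, buffer), max_payload(512));
	return 0;
}
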
Cc: stable@vger.kernel.org Fixes: e2b76ab8b5c9 ("ksmbd: add support for read compound") Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 24f8d58493d0..f5f1bf5f642e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4452,8 +4452,9 @@ int smb2_query_dir(struct ksmbd_work *work) d_info.wptr = (char *)rsp->Buffer; d_info.rptr = (char *)rsp->Buffer; d_info.out_buf_len = - smb2_calc_max_out_buf_len(work, 8, - le32_to_cpu(req->OutputBufferLength)); + smb2_calc_max_out_buf_len(work, + offsetof(struct smb2_query_directory_rsp, Buffer), + le32_to_cpu(req->OutputBufferLength)); if (d_info.out_buf_len < 0) { rc = -EINVAL; goto err_out; @@ -4720,8 +4721,9 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, } buf_free_len = - smb2_calc_max_out_buf_len(work, 8, - le32_to_cpu(req->OutputBufferLength)); + smb2_calc_max_out_buf_len(work, + offsetof(struct smb2_query_info_rsp, Buffer), + le32_to_cpu(req->OutputBufferLength)); if (buf_free_len < 0) return -EINVAL; @@ -5047,8 +5049,9 @@ static int get_file_stream_info(struct ksmbd_work *work, file_info = (struct smb2_file_stream_info *)rsp->Buffer; buf_free_len = - smb2_calc_max_out_buf_len(work, 8, - le32_to_cpu(req->OutputBufferLength)); + smb2_calc_max_out_buf_len(work, + offsetof(struct smb2_query_info_rsp, Buffer), + le32_to_cpu(req->OutputBufferLength)); if (buf_free_len < 0) goto out; @@ -8206,8 +8209,9 @@ int smb2_ioctl(struct ksmbd_work *work) buffer = (char *)req + le32_to_cpu(req->InputOffset); cnt_code = le32_to_cpu(req->CtlCode); - ret = smb2_calc_max_out_buf_len(work, 48, - le32_to_cpu(req->MaxOutputResponse)); + ret = smb2_calc_max_out_buf_len(work, + offsetof(struct smb2_ioctl_rsp, Buffer), + le32_to_cpu(req->MaxOutputResponse)); if (ret < 0) { rsp->hdr.Status = STATUS_INVALID_PARAMETER; goto out; -- cgit v1.2.3 From e942498385bf80f4d6d075b47174035545eb6a2e Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Mar 2026 09:51:52 +0800 Subject: xfs: only assert new size for datafork during truncate extents The assertion functions properly because we currently only truncate the attr to a zero size. Any other new size of the attr is not preempted. Make this assertion is specific to the datafork, preparing for subsequent patches to truncate the attribute to a non-zero size. Reviewed-by: Darrick J. Wong Signed-off-by: Long Li Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 50c0404f9064..beaa26ec62da 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1048,7 +1048,8 @@ xfs_itruncate_extents_flags( xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); - ASSERT(new_size <= XFS_ISIZE(ip)); + if (whichfork == XFS_DATA_FORK) + ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); -- cgit v1.2.3 From ce4e789cf3561c9fac73cc24445bfed9ea0c514b Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Mar 2026 09:51:53 +0800 Subject: xfs: factor out xfs_attr3_node_entry_remove Factor out wrapper xfs_attr3_node_entry_remove function, which exported for external use. Reviewed-by: Darrick J. 
Wong Signed-off-by: Long Li Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_da_btree.c | 53 +++++++++++++++++++++++++++++++++++--------- fs/xfs/libxfs/xfs_da_btree.h | 2 ++ 2 files changed, 44 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 09d4c17b3e7b..ad801b7bd2dd 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -1506,21 +1506,20 @@ xfs_da3_fixhashpath( } /* - * Remove an entry from an intermediate node. + * Internal implementation to remove an entry from an intermediate node. */ STATIC void -xfs_da3_node_remove( - struct xfs_da_state *state, - struct xfs_da_state_blk *drop_blk) +__xfs_da3_node_remove( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_da_geometry *geo, + struct xfs_da_state_blk *drop_blk) { struct xfs_da_intnode *node; struct xfs_da3_icnode_hdr nodehdr; struct xfs_da_node_entry *btree; int index; int tmp; - struct xfs_inode *dp = state->args->dp; - - trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node); @@ -1536,17 +1535,17 @@ xfs_da3_node_remove( tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); memmove(&btree[index], &btree[index + 1], tmp); - xfs_trans_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(tp, drop_blk->bp, XFS_DA_LOGRANGE(node, &btree[index], tmp)); index = nodehdr.count - 1; } memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); - xfs_trans_log_buf(state->args->trans, drop_blk->bp, + xfs_trans_log_buf(tp, drop_blk->bp, XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); nodehdr.count -= 1; xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr); - xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size)); + xfs_trans_log_buf(tp, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, geo->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. @@ -1554,6 +1553,38 @@ xfs_da3_node_remove( drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) +{ + trace_xfs_da_node_remove(state->args); + + __xfs_da3_node_remove(state->args->trans, state->args->dp, + state->args->geo, drop_blk); +} + +/* + * Remove an entry from an intermediate attr node at the specified index. + */ +void +xfs_attr3_node_entry_remove( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf *bp, + int index) +{ + struct xfs_da_state_blk blk = { + .index = index, + .bp = bp, + }; + + __xfs_da3_node_remove(tp, dp, dp->i_mount->m_attr_geo, &blk); +} + /* * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 354d5d65043e..afcf2d3c7a21 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -184,6 +184,8 @@ int xfs_da3_split(xfs_da_state_t *state); int xfs_da3_join(xfs_da_state_t *state); void xfs_da3_fixhashpath(struct xfs_da_state *state, struct xfs_da_state_path *path_to_to_fix); +void xfs_attr3_node_entry_remove(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf *bp, int index); /* * Routines used for finding things in the Btree. 
-- cgit v1.2.3 From e65bb55d7f8c2041c8fdb73cd29b0b4cad4ed847 Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Mar 2026 09:51:54 +0800 Subject: xfs: factor out xfs_attr3_leaf_init Factor out wrapper xfs_attr3_leaf_init function, which exported for external use. Reviewed-by: Darrick J. Wong Signed-off-by: Long Li Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_attr_leaf.c | 22 ++++++++++++++++++++++ fs/xfs/libxfs/xfs_attr_leaf.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 47f48ae555c0..2b78041e8672 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1415,6 +1415,28 @@ xfs_attr3_leaf_create( return 0; } +/* + * Reinitialize an existing attr fork block as an empty leaf, and attach + * the buffer to tp. + */ +int +xfs_attr3_leaf_init( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t blkno) +{ + struct xfs_buf *bp = NULL; + struct xfs_da_args args = { + .trans = tp, + .dp = dp, + .owner = dp->i_ino, + .geo = dp->i_mount->m_attr_geo, + }; + + ASSERT(tp != NULL); + + return xfs_attr3_leaf_create(&args, blkno, &bp); +} /* * Split the leaf node, rebalance, then add the new entry. * diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index aca46da2bc50..72639efe6ac3 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -87,6 +87,9 @@ int xfs_attr3_leaf_list_int(struct xfs_buf *bp, /* * Routines used for shrinking the Btree. */ + +int xfs_attr3_leaf_init(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t blkno); int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, -- cgit v1.2.3 From b854e1c4eff3473b6d3a9ae74129ac5c48bc0b61 Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Mar 2026 09:51:55 +0800 Subject: xfs: close crash window in attr dabtree inactivation When inactivating an inode with node-format extended attributes, xfs_attr3_node_inactive() invalidates all child leaf/node blocks via xfs_trans_binval(), but intentionally does not remove the corresponding entries from their parent node blocks. The implicit assumption is that xfs_attr_inactive() will truncate the entire attr fork to zero extents afterwards, so log recovery will never reach the root node and follow those stale pointers. However, if a log shutdown occurs after the leaf/node block cancellations commit but before the attr bmap truncation commits, this assumption breaks. Recovery replays the attr bmap intact (the inode still has attr fork extents), but suppresses replay of all cancelled leaf/node blocks, maybe leaving them as stale data on disk. On the next mount, xlog_recover_process_iunlinks() retries inactivation and attempts to read the root node via the attr bmap. If the root node was not replayed, reading the unreplayed root block triggers a metadata verification failure immediately; if it was replayed, following its child pointers to unreplayed child blocks triggers the same failure: XFS (pmem0): Metadata corruption detected at xfs_da3_node_read_verify+0x53/0x220, xfs_da3_node block 0x78 XFS (pmem0): Unmount and run xfs_repair XFS (pmem0): First 128 bytes of corrupted metadata buffer: 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ XFS (pmem0): metadata I/O error in "xfs_da_read_buf+0x104/0x190" at daddr 0x78 len 8 error 117 Fix this in two places: In xfs_attr3_node_inactive(), after calling xfs_trans_binval() on a child block, immediately remove the entry that references it from the parent node in the same transaction. This eliminates the window where the parent holds a pointer to a cancelled block. Once all children are removed, the now-empty root node is converted to a leaf block within the same transaction. This node-to-leaf conversion is necessary for crash safety. If the system shutdown after the empty node is written to the log but before the second-phase bmap truncation commits, log recovery will attempt to verify the root block on disk. xfs_da3_node_verify() does not permit a node block with count == 0; such a block will fail verification and trigger a metadata corruption shutdown. on the other hand, leaf blocks are allowed to have this transient state. In xfs_attr_inactive(), split the attr fork truncation into two explicit phases. First, truncate all extents beyond the root block (the child extents whose parent references have already been removed above). Second, invalidate the root block and truncate the attr bmap to zero in a single transaction. The two operations in the second phase must be atomic: as long as the attr bmap has any non-zero length, recovery can follow it to the root block, so the root block invalidation must commit together with the bmap-to-zero truncation. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_attr_inactive.c | 95 +++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 92331991f9fd..a5b69c0fbfd0 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -140,7 +140,7 @@ xfs_attr3_node_inactive( xfs_daddr_t parent_blkno, child_blkno; struct xfs_buf *child_bp; struct xfs_da3_icnode_hdr ichdr; - int error, i; + int error; /* * Since this code is recursive (gasp!) we must protect ourselves. @@ -152,7 +152,7 @@ xfs_attr3_node_inactive( return -EFSCORRUPTED; } - xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr); + xfs_da3_node_hdr_from_disk(mp, &ichdr, bp->b_addr); parent_blkno = xfs_buf_daddr(bp); if (!ichdr.count) { xfs_trans_brelse(*trans, bp); @@ -167,7 +167,7 @@ xfs_attr3_node_inactive( * over the leaves removing all of them. If this is higher up * in the tree, recurse downward. */ - for (i = 0; i < ichdr.count; i++) { + while (ichdr.count > 0) { /* * Read the subsidiary block to see what we have to work with. * Don't do this in a transaction. This is a depth-first @@ -218,29 +218,32 @@ xfs_attr3_node_inactive( xfs_trans_binval(*trans, child_bp); child_bp = NULL; + error = xfs_da3_node_read_mapped(*trans, dp, + parent_blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + /* - * If we're not done, re-read the parent to get the next - * child block number. + * Remove entry from parent node, prevents being indexed to. 
*/ - if (i + 1 < ichdr.count) { - struct xfs_da3_icnode_hdr phdr; + xfs_attr3_node_entry_remove(*trans, dp, bp, 0); + + xfs_da3_node_hdr_from_disk(mp, &ichdr, bp->b_addr); + bp = NULL; - error = xfs_da3_node_read_mapped(*trans, dp, - parent_blkno, &bp, XFS_ATTR_FORK); + if (ichdr.count > 0) { + /* + * If we're not done, get the next child block number. + */ + child_fsb = be32_to_cpu(ichdr.btree[0].before); + + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll_inode(trans, dp); if (error) return error; - xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr, - bp->b_addr); - child_fsb = be32_to_cpu(phdr.btree[i + 1].before); - xfs_trans_brelse(*trans, bp); - bp = NULL; } - /* - * Atomically commit the whole invalidate stuff. - */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; } return 0; @@ -257,10 +260,8 @@ xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) { - struct xfs_mount *mp = dp->i_mount; struct xfs_da_blkinfo *info; struct xfs_buf *bp; - xfs_daddr_t blkno; int error; /* @@ -272,7 +273,6 @@ xfs_attr3_root_inactive( error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); if (error) return error; - blkno = xfs_buf_daddr(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. @@ -283,10 +283,26 @@ xfs_attr3_root_inactive( case cpu_to_be16(XFS_DA_NODE_MAGIC): case cpu_to_be16(XFS_DA3_NODE_MAGIC): error = xfs_attr3_node_inactive(trans, dp, bp, 1); + /* + * Empty root node block are not allowed, convert it to leaf. + */ + if (!error) + error = xfs_attr3_leaf_init(*trans, dp, 0); + if (!error) + error = xfs_trans_roll_inode(trans, dp); break; case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): error = xfs_attr3_leaf_inactive(trans, dp, bp); + /* + * Reinit the leaf before truncating extents so that a crash + * mid-truncation leaves an empty leaf rather than one with + * entries that may reference freed remote value blocks. + */ + if (!error) + error = xfs_attr3_leaf_init(*trans, dp, 0); + if (!error) + error = xfs_trans_roll_inode(trans, dp); break; default: xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); @@ -295,21 +311,6 @@ xfs_attr3_root_inactive( xfs_trans_brelse(*trans, bp); break; } - if (error) - return error; - - /* - * Invalidate the incore copy of the root block. - */ - error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno, - XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); - if (error) - return error; - xfs_trans_binval(*trans, bp); /* remove from cache */ - /* - * Commit the invalidate and start the next transaction. - */ - error = xfs_trans_roll_inode(trans, dp); return error; } @@ -328,6 +329,7 @@ xfs_attr_inactive( { struct xfs_trans *trans; struct xfs_mount *mp; + struct xfs_buf *bp; int lock_mode = XFS_ILOCK_SHARED; int error = 0; @@ -363,10 +365,27 @@ xfs_attr_inactive( * removal below. */ if (dp->i_af.if_nextents > 0) { + /* + * Invalidate and truncate all blocks but leave the root block. + */ error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, + XFS_FSB_TO_B(mp, mp->m_attr_geo->fsbcount)); + if (error) + goto out_cancel; + + /* + * Invalidate and truncate the root block and ensure that the + * operation is completed within a single transaction. 
+ */ + error = xfs_da_get_buf(trans, dp, 0, &bp, XFS_ATTR_FORK); + if (error) + goto out_cancel; + + xfs_trans_binval(trans, bp); error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); if (error) goto out_cancel; -- cgit v1.2.3 From d72f2084e30966097c8eae762e31986a33c3c0ae Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 20 Mar 2026 10:11:29 +0800 Subject: xfs: fix ri_total validation in xlog_recover_attri_commit_pass2 The ri_total checks for SET/REPLACE operations are hardcoded to 3, but xfs_attri_item_size() only emits a value iovec when value_len > 0, so ri_total is 2 when value_len == 0. For PPTR_SET/PPTR_REMOVE/PPTR_REPLACE, value_len is validated by xfs_attri_validate() to be exactly sizeof(struct xfs_parent_rec) and is never zero, so their hardcoded checks remain correct. This problem may cause log recovery failures. The following script can be used to reproduce the problem: #!/bin/bash mkfs.xfs -f /dev/sda mount /dev/sda /mnt/test/ touch /mnt/test/file for i in {1..200}; do attr -s "user.attr_$i" -V "value_$i" /mnt/test/file > /dev/null done echo 1 > /sys/fs/xfs/debug/larp echo 1 > /sys/fs/xfs/sda/errortag/larp attr -s "user.zero" -V "" /mnt/test/file echo 0 > /sys/fs/xfs/sda/errortag/larp umount /mnt/test mount /dev/sda /mnt/test/ # mount failed Fix this by deriving the expected count dynamically as "2 + !!value_len" for SET/REPLACE operations. Cc: stable@vger.kernel.org # v6.9 Fixes: ad206ae50eca ("xfs: check opcode and iovec count match in xlog_recover_attri_commit_pass2") Reviewed-by: Darrick J. Wong Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_attr_item.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 354472bf45f1..83d09635b200 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -1047,8 +1047,8 @@ xlog_recover_attri_commit_pass2( break; case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: - /* Log item, attr name, attr value */ - if (item->ri_total != 3) { + /* Log item, attr name, optional attr value */ + if (item->ri_total != 2 + !!attri_formatp->alfi_value_len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; -- cgit v1.2.3 From c6c56ff975f046be25f527231a239e37920aca5e Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 20 Mar 2026 10:11:30 +0800 Subject: xfs: remove redundant validation in xlog_recover_attri_commit_pass2 Remove the redundant post-parse validation switch. By the time that block is reached, xfs_attri_validate() has already guaranteed all name lengths are non-zero via xfs_attri_validate_namelen(), and xfs_attri_validate_name_iovec() has already returned -EFSCORRUPTED for NULL names. For the REMOVE case, attr_value and value_len are structurally guaranteed to be NULL/zero because the parsing loop only populates them when value_len != 0. All checks in that switch are therefore dead code. Reviewed-by: Darrick J. 
Wong Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_attr_item.c | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 83d09635b200..82324f42537b 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -1132,52 +1132,6 @@ xlog_recover_attri_commit_pass2( return -EFSCORRUPTED; } - switch (op) { - case XFS_ATTRI_OP_FLAGS_REMOVE: - /* Regular remove operations operate only on names. */ - if (attr_value != NULL || value_len != 0) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - attri_formatp, len); - return -EFSCORRUPTED; - } - fallthrough; - case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: - case XFS_ATTRI_OP_FLAGS_PPTR_SET: - case XFS_ATTRI_OP_FLAGS_SET: - case XFS_ATTRI_OP_FLAGS_REPLACE: - /* - * Regular xattr set/remove/replace operations require a name - * and do not take a newname. Values are optional for set and - * replace. - * - * Name-value set/remove operations must have a name, do not - * take a newname, and can take a value. - */ - if (attr_name == NULL || name_len == 0) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - attri_formatp, len); - return -EFSCORRUPTED; - } - break; - case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: - /* - * Name-value replace operations require the caller to - * specify the old and new names and values explicitly. - * Values are optional. - */ - if (attr_name == NULL || name_len == 0) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - attri_formatp, len); - return -EFSCORRUPTED; - } - if (attr_new_name == NULL || new_name_len == 0) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - attri_formatp, len); - return -EFSCORRUPTED; - } - break; - } - /* * Memory alloc failure will cause replay to abort. We attach the * name/value buffer to the recovered incore log item and drop our -- cgit v1.2.3 From 34420cb92dbb9e37ff6c6603f4f5e1807db3f1de Mon Sep 17 00:00:00 2001 From: Huiwen He Date: Mon, 23 Mar 2026 17:08:12 +0800 Subject: smb/client: ensure smb2_mapping_table rebuild on cmd changes The current rule for smb2_mapping_table.c uses `$(call cmd,...)`, which fails to track command line modifications in the Makefile (e.g., modifying the command to `perl -d` or `perl -w` for debug will not trigger a rebuild) and does not generate the required .cmd file for Kbuild. Fix this by transitioning to the standard `$(call if_changed,...)` macro. This includes adding the `FORCE` prerequisite and appending the output file to the `targets` variable so Kbuild can track it properly. As a result, Kbuild now automatically handles the cleaning of the generated file, allowing us to safely drop the redundant `clean-files` assignment. 
Fixes: c527e13a7a66 ("cifs: Autogenerate SMB2 error mapping table") Signed-off-by: Huiwen He Reviewed-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/client/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile index 26b6105f04d1..1a6e1e1c9764 100644 --- a/fs/smb/client/Makefile +++ b/fs/smb/client/Makefile @@ -48,8 +48,8 @@ cifs-$(CONFIG_CIFS_COMPRESSION) += compress.o compress/lz77.o # Build the SMB2 error mapping table from smb2status.h # $(obj)/smb2_mapping_table.c: $(src)/../common/smb2status.h \ - $(src)/gen_smb2_mapping - $(call cmd,gen_smb2_mapping) + $(src)/gen_smb2_mapping FORCE + $(call if_changed,gen_smb2_mapping) $(obj)/smb2maperror.o: $(obj)/smb2_mapping_table.c @@ -58,4 +58,5 @@ quiet_cmd_gen_smb2_mapping = GEN $@ obj-$(CONFIG_SMB_KUNIT_TESTS) += smb2maperror_test.o -clean-files += smb2_mapping_table.c +# Let Kbuild handle tracking and cleaning +targets += smb2_mapping_table.c -- cgit v1.2.3 From b52fe51f724385b3ed81e37e510a4a33107e8161 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 17:35:42 +0000 Subject: btrfs: fix super block offset in error message in btrfs_validate_super() Fix the superblock offset mismatch error message in btrfs_validate_super(): we changed it so that it considers all the superblocks, but the message still assumes we're only looking at the first one. The change from %u to %llu is because we're changing from a constant to a u64. Fixes: 069ec957c35e ("btrfs: Refactor btrfs_check_super_valid") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1b53d713ee9..3524976ccc1d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2531,8 +2531,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + btrfs_err(fs_info, "super offset mismatch %llu != %llu", + btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); ret = -EINVAL; } -- cgit v1.2.3 From 5254d4181add9dfaa5e3519edd71cc8f752b2f85 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 17 Feb 2026 14:46:50 +0000 Subject: btrfs: fix zero size inode with non-zero size after log replay When logging that an inode exists, as part of logging a new name or logging new dir entries for a directory, we always set the generation of the logged inode item to 0. This is to signal during log replay (in overwrite_item()), that we should not set the i_size since we only logged that an inode exists, so the i_size of the inode in the subvolume tree must be preserved (as when we log new names or that an inode exists, we don't log extents). This works fine except when we have already logged an inode in full mode or it's the first time we are logging an inode created in a past transaction, that inode has a new i_size of 0 and then we log a new name for the inode (due to a new hardlink or a rename), in which case we log an i_size of 0 for the inode and a generation of 0, which causes the log replay code to not update the inode's i_size to 0 (in overwrite_item()). 
An example scenario: mkdir /mnt/dir xfs_io -f -c "pwrite 0 64K" /mnt/dir/foo sync xfs_io -c "truncate 0" -c "fsync" /mnt/dir/foo ln /mnt/dir/foo /mnt/dir/bar xfs_io -c "fsync" /mnt/dir After log replay the file remains with a size of 64K. This is because when we first log the inode, when we fsync file foo, we log its current i_size of 0, and then when we create a hard link we log again the inode in exists mode (LOG_INODE_EXISTS) but we set a generation of 0 for the inode item we add to the log tree, so during log replay overwrite_item() sees that the generation is 0 and i_size is 0 so we skip updating the inode's i_size from 64K to 0. Fix this by making sure at fill_inode_item() we always log the real generation of the inode if it was logged in the current transaction with the i_size we logged before. Also if an inode created in a previous transaction is logged in exists mode only, make sure we log the i_size stored in the inode item located from the commit root, so that if we log multiple times that the inode exists we get the correct i_size. A test case for fstests will follow soon. Reported-by: Vyacheslav Kovalevsky Link: https://lore.kernel.org/linux-btrfs/af8c15fa-4e41-4bb2-885c-0bc4e97532a6@gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 98 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ff3933bc382..fce1b16a882b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, bool log_inode_only, u64 logged_isize) { + u64 gen = BTRFS_I(inode)->generation; u64 flags; if (log_inode_only) { - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' + /* + * Set the generation to zero so the recover code can tell the + * difference between a logging just to say 'this inode exists' + * and a logging to say 'update this inode with these values'. + * But only if the inode was not already logged before. + * We access ->logged_trans directly since it was already set + * up in the call chain by btrfs_log_inode(), and data_race() + * to avoid false alerts from KCSAN and since it was set already + * and one can set it to 0 since that only happens on eviction + * and we are holding a ref on the inode. 
*/ - btrfs_set_inode_generation(leaf, item, 0); + ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); + if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) + gen = 0; + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); btrfs_set_inode_size(leaf, item, inode->i_size); } + btrfs_set_inode_generation(leaf, item, gen); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); @@ -5448,42 +5459,63 @@ process: return 0; } -static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, - struct btrfs_path *path, u64 *size_ret) +static int get_inode_size_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 *size_ret) { struct btrfs_key key; + struct btrfs_inode_item *item; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - *size_ret = 0; - } else { - struct btrfs_inode_item *item; + /* + * Our caller called inode_logged(), so logged_trans is up to date. + * Use data_race() to silence any warning from KCSAN. Once logged_trans + * is set, it can only be reset to 0 after inode eviction. + */ + if (data_race(inode->logged_trans) == trans->transid) { + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + } else if (inode->generation < trans->transid) { + path->search_commit_root = true; + path->skip_locking = true; + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); + path->search_commit_root = false; + path->skip_locking = false; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - *size_ret = btrfs_inode_size(path->nodes[0], item); - /* - * If the in-memory inode's i_size is smaller then the inode - * size stored in the btree, return the inode's i_size, so - * that we get a correct inode size after replaying the log - * when before a power failure we had a shrinking truncate - * followed by addition of a new name (rename / new hard link). - * Otherwise return the inode size from the btree, to avoid - * data loss when replaying a log due to previously doing a - * write that expands the inode's size and logging a new name - * immediately after. - */ - if (*size_ret > inode->vfs_inode.i_size) - *size_ret = inode->vfs_inode.i_size; + } else { + *size_ret = 0; + return 0; } + /* + * If the inode was logged before or is from a past transaction, then + * its inode item must exist in the log root or in the commit root. + */ + ASSERT(ret <= 0); + if (WARN_ON_ONCE(ret > 0)) + ret = -ENOENT; + + if (ret < 0) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode size stored + * in the btree, return the inode's i_size, so that we get a correct + * inode size after replaying the log when before a power failure we had + * a shrinking truncate followed by addition of a new name (rename / new + * hard link). Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a write that + * expands the inode's size and logging a new name immediately after. 
+ */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; + btrfs_release_path(path); return 0; } @@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = drop_inode_items(trans, log, path, inode, BTRFS_XATTR_ITEM_KEY); } else { - if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { + if (inode_only == LOG_INODE_EXISTS) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - ret = logged_inode_size(log, inode, path, &logged_isize); + ret = get_inode_size_to_log(trans, inode, path, &logged_isize); if (ret) goto out_unlock; } -- cgit v1.2.3 From a4376d9a5d4c9610e69def3fc0b32c86a7ab7a41 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sun, 1 Mar 2026 21:17:04 +0900 Subject: btrfs: fix leak of kobject name for sub-group space_info When create_space_info_sub_group() allocates elements of space_info->sub_group[], kobject_init_and_add() is called for each element via btrfs_sysfs_add_space_info_type(). However, when check_removing_space_info() frees these elements, it does not call btrfs_sysfs_remove_space_info() on them. As a result, kobject_put() is not called and the associated kobj->name objects are leaked. This memory leak is reproduced by running the blktests test case zbd/009 on kernels built with CONFIG_DEBUG_KMEMLEAK. The kmemleak feature reports the following error: unreferenced object 0xffff888112877d40 (size 16): comm "mount", pid 1244, jiffies 4294996972 hex dump (first 16 bytes): 64 61 74 61 2d 72 65 6c 6f 63 00 c4 c6 a7 cb 7f data-reloc...... backtrace (crc 53ffde4d): __kmalloc_node_track_caller_noprof+0x619/0x870 kstrdup+0x42/0xc0 kobject_set_name_vargs+0x44/0x110 kobject_init_and_add+0xcf/0x150 btrfs_sysfs_add_space_info_type+0xfc/0x210 [btrfs] create_space_info_sub_group.constprop.0+0xfb/0x1b0 [btrfs] create_space_info+0x211/0x320 [btrfs] btrfs_init_space_info+0x15a/0x1b0 [btrfs] open_ctree+0x33c7/0x4a50 [btrfs] btrfs_get_tree.cold+0x9f/0x1ee [btrfs] vfs_get_tree+0x87/0x2f0 vfs_cmd_create+0xbd/0x280 __do_sys_fsconfig+0x3df/0x990 do_syscall_64+0x136/0x1540 entry_SYSCALL_64_after_hwframe+0x76/0x7e To avoid the leak, call btrfs_sysfs_remove_space_info() instead of kfree() for the elements. 
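For illustration only, here is a rough user-space analogy (plain C, not the kernel kobject API) of why freeing the containing structure directly leaks the separately allocated name, while going through the object's own teardown path does not:

/*
 * User-space analogy: 'name' is allocated separately when the object is
 * registered, so it must be released by the object's own put/teardown
 * helper; a bare free() of the container leaks it.
 */
#include <stdlib.h>
#include <string.h>

struct demo_obj {
	char *name;		/* stands in for kobj->name */
};

static struct demo_obj *demo_obj_register(const char *name)
{
	struct demo_obj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return NULL;
	obj->name = strdup(name);	/* extra allocation owned by obj */
	if (!obj->name) {
		free(obj);
		return NULL;
	}
	return obj;
}

static void demo_obj_put(struct demo_obj *obj)
{
	if (!obj)
		return;
	free(obj->name);	/* teardown path releases the name... */
	free(obj);		/* ...and then the container itself */
}

int main(void)
{
	struct demo_obj *obj = demo_obj_register("data-reloc");

	/*
	 * Calling free(obj) here would leak obj->name, mirroring the kfree()
	 * of a sub-group space_info that skips btrfs_sysfs_remove_space_info().
	 */
	demo_obj_put(obj);
	return 0;
}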
Fixes: f92ee31e031c ("btrfs: introduce btrfs_space_info sub-group") Link: https://lore.kernel.org/linux-block/b9488881-f18d-4f47-91a5-3c9bf63955a5@wdc.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fa55d868ecd8..f7fcff7dca8f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4584,7 +4584,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { if (space_info->sub_group[i]) { check_removing_space_info(space_info->sub_group[i]); - kfree(space_info->sub_group[i]); + btrfs_sysfs_remove_space_info(space_info->sub_group[i]); space_info->sub_group[i] = NULL; } } -- cgit v1.2.3 From 0dcabcb920a5c143c568f37c26c6f2b4b9206bd1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Mar 2026 18:35:26 +1030 Subject: btrfs: zlib: handle page aligned compressed size correctly [BUG] Since commit 3d74a7556fba ("btrfs: zlib: introduce zlib_compress_bio() helper"), there are some reports about different crashes in zlib compression path. One of the symptoms is list corruption like the following: list_del corruption. next->prev should be fffffbb340204a08, but was ffff8d6517cb7de0. (next=fffffbb3402d62c8) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:65! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 1 UID: 0 PID: 21436 Comm: kworker/u16:7 Not tainted 7.0.0-rc2-jcg+ #1 PREEMPT Hardware name: LENOVO 10VGS02P00/3130, BIOS M1XKT57A 02/10/2022 Workqueue: btrfs-delalloc btrfs_work_helper [btrfs] RIP: 0010:__list_del_entry_valid_or_report+0xec/0xf0 Call Trace: btrfs_alloc_compr_folio+0xae/0xc0 [btrfs] zlib_compress_bio+0x39d/0x6a0 [btrfs] btrfs_compress_bio+0x2e3/0x3d0 [btrfs] compress_file_range+0x2b0/0x660 [btrfs] btrfs_work_helper+0xdb/0x3e0 [btrfs] process_one_work+0x192/0x3d0 worker_thread+0x19a/0x310 kthread+0xdf/0x120 ret_from_fork+0x22e/0x310 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Other symptoms include VM_BUG_ON() during folio_put() but it's rarer. David Sterba firstly reported this during his CI runs but unfortunately I'm unable to hit it. Meanwhile zstd/lzo doesn't seem to have the same problem. [CAUSE] During zlib_compress_bio() every time the output buffer is full, we queue the full folio into the compressed bio, and allocate a new folio as the output folio. After the input has finished, we loop through zlib_deflate() with Z_FINISH to flush all output. And when that is done, we still need to check if the last folio has any content, and if so we still need to queue that part into the compressed bio. The problem is in the final folio handling, if the final folio is full (for x86_64 the folio size is 4K), the length to queue is calculated by u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); But since total_out is 4K aligned, the resulted @cur_len will be 0, then we hit the bio_add_folio(), which has a quirk that if bio_add_folio() got an length 0, it will still queue the folio into the bio, but return false. In that case we go to out: tag, which calls btrfs_free_compr_folio() to release @out_folio, which may put the out folio into the btrfs global pool list. On the other hand, that @out_folio is already added to the compressed bio, and will later be released again by cleanup_compressed_bio(), which results double release. 
And if this time we still need to put the folio into the btrfs global pool list, it will result a list corruption because it's already in the list. [FIX] Instead of offset_inside_folio(), directly use the difference between strm.total_out and bi_size. So that if the last folio is completely full, we can still properly queue the full folio other than queueing zero byte. Fixes: 3d74a7556fba ("btrfs: zlib: introduce zlib_compress_bio() helper") Reported-by: David Sterba Reported-by: Jean-Christophe Guillain Reported-by: syzbot+3c4d8371d65230f852a2@syzkaller.appspotmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=221176 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 0a8fcee16428..27fc2b828002 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -308,7 +308,9 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Queue the remaining part of the folio. */ if (workspace->strm.total_out > bio->bi_iter.bi_size) { - u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); + const u32 cur_len = workspace->strm.total_out - bio->bi_iter.bi_size; + + ASSERT(cur_len <= folio_size(out_folio)); if (!bio_add_folio(bio, out_folio, cur_len, 0)) { ret = -E2BIG; -- cgit v1.2.3 From 1c37d896b12dfd0d4c96e310b0033c6676933917 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Mar 2026 16:17:59 +0000 Subject: btrfs: fix lost error when running device stats on multiple devices fs Whenever we get an error updating the device stats item for a device in btrfs_run_dev_stats() we allow the loop to go to the next device, and if updating the stats item for the next device succeeds, we end up losing the error we had from the previous device. Fix this by breaking out of the loop once we get an error and make sure it's returned to the caller. Since we are in the transaction commit path (and in the critical section actually), returning the error will result in a transaction abort. Fixes: 733f4fbbc108 ("Btrfs: read device stats on mount, write modified ones during commit") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8fbd736aad9f..117e13d245f6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8099,8 +8099,9 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) smp_rmb(); ret = update_dev_stat_item(trans, device); - if (!ret) - atomic_sub(stats_cnt, &device->dev_stats_ccnt); + if (ret) + break; + atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); -- cgit v1.2.3 From f621324dfb3d6719cc9ffe65e8ec6051664ca059 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 Mar 2026 14:00:17 -0700 Subject: iomap: fix lockdep complaint when reads fail Zorro Lang reported the following lockdep splat: "While running fstests xfs/556 on kernel 7.0.0-rc4+ (HEAD=04a9f1766954), a lockdep warning was triggered indicating an inconsistent lock state for sb->s_type->i_lock_key. "The deadlock might occur because iomap_read_end_io (called from a hardware interrupt completion path) invokes fserror_report, which then calls igrab. igrab attempts to acquire the i_lock spinlock. However, the i_lock is frequently acquired in process context with interrupts enabled. 
If an interrupt occurs while a process holds the i_lock, and that interrupt handler calls fserror_report, the system deadlocks. "I hit this warning several times by running xfs/556 (mostly) or generic/648 on xfs. More details refer to below console log." along with this dmesg, for which I've cleaned up the stacktraces: run fstests xfs/556 at 2026-03-18 20:05:30 XFS (sda3): Mounting V5 Filesystem 396e9164-c45a-4e05-be9d-b38c2c5c6477 XFS (sda3): Ending clean mount XFS (sda3): Unmounting Filesystem 396e9164-c45a-4e05-be9d-b38c2c5c6477 XFS (sda3): Mounting V5 Filesystem bf3f89c3-3c45-4650-a9c7-744f39c0191e XFS (sda3): Ending clean mount XFS (sda3): Unmounting Filesystem bf3f89c3-3c45-4650-a9c7-744f39c0191e XFS (dm-0): Mounting V5 Filesystem bf3f89c3-3c45-4650-a9c7-744f39c0191e XFS (dm-0): Ending clean mount device-mapper: table: 253:0: adding target device (start sect 209 len 1) caused an alignment inconsistency device-mapper: table: 253:0: adding target device (start sect 210 len 62914350) caused an alignment inconsistency buffer_io_error: 6 callbacks suppressed Buffer I/O error on dev dm-0, logical block 209, async page read Buffer I/O error on dev dm-0, logical block 209, async page read XFS (dm-0): Unmounting Filesystem bf3f89c3-3c45-4650-a9c7-744f39c0191e XFS (dm-0): Mounting V5 Filesystem bf3f89c3-3c45-4650-a9c7-744f39c0191e XFS (dm-0): Ending clean mount ================================ WARNING: inconsistent lock state 7.0.0-rc4+ #1 Tainted: G S W -------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. od/2368602 [HC1[1]:SC0[0]:HE0:SE1] takes: ff1100069f2b4a98 (&sb->s_type->i_lock_key#31){?.+.}-{3:3}, at: igrab+0x28/0x1a0 {HARDIRQ-ON-W} state was registered at: __lock_acquire+0x40d/0xbd0 lock_acquire.part.0+0xbd/0x260 _raw_spin_lock+0x37/0x80 unlock_new_inode+0x66/0x2a0 xfs_iget+0x67b/0x7b0 [xfs] xfs_mountfs+0xde4/0x1c80 [xfs] xfs_fs_fill_super+0xe86/0x17a0 [xfs] get_tree_bdev_flags+0x312/0x590 vfs_get_tree+0x8d/0x2f0 vfs_cmd_create+0xb2/0x240 __do_sys_fsconfig+0x3d8/0x9a0 do_syscall_64+0x13a/0x1520 entry_SYSCALL_64_after_hwframe+0x76/0x7e irq event stamp: 3118 hardirqs last enabled at (3117): [] _raw_spin_unlock_irq+0x28/0x50 hardirqs last disabled at (3118): [] common_interrupt+0x19/0xe0 softirqs last enabled at (3040): [] handle_softirqs+0x6b8/0x950 softirqs last disabled at (3023): [] __irq_exit_rcu+0xfd/0x250 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&sb->s_type->i_lock_key#31); lock(&sb->s_type->i_lock_key#31); *** DEADLOCK *** 1 lock held by od/2368602: #0: ff1100069f2b4b58 (&sb->s_type->i_mutex_key#19){++++}-{4:4}, at: xfs_ilock+0x324/0x4b0 [xfs] stack backtrace: CPU: 15 UID: 0 PID: 2368602 Comm: od Kdump: loaded Tainted: G S W 7.0.0-rc4+ #1 PREEMPT(full) Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN Hardware name: Dell Inc. 
PowerEdge R660/0R5JJC, BIOS 2.1.5 03/14/2024 Call Trace: dump_stack_lvl+0x6f/0xb0 print_usage_bug.part.0+0x230/0x2c0 mark_lock_irq+0x3ce/0x5b0 mark_lock+0x1cb/0x3d0 mark_usage+0x109/0x120 __lock_acquire+0x40d/0xbd0 lock_acquire.part.0+0xbd/0x260 _raw_spin_lock+0x37/0x80 igrab+0x28/0x1a0 fserror_report+0x127/0x2d0 iomap_finish_folio_read+0x13c/0x280 iomap_read_end_io+0x10e/0x2c0 clone_endio+0x37e/0x780 [dm_mod] blk_update_request+0x448/0xf00 scsi_end_request+0x74/0x750 scsi_io_completion+0xe9/0x7c0 _scsih_io_done+0x6ba/0x1ca0 [mpt3sas] _base_process_reply_queue+0x249/0x15b0 [mpt3sas] _base_interrupt+0x95/0xe0 [mpt3sas] __handle_irq_event_percpu+0x1f0/0x780 handle_irq_event+0xa9/0x1c0 handle_edge_irq+0x2ef/0x8a0 __common_interrupt+0xa0/0x170 common_interrupt+0xb7/0xe0 asm_common_interrupt+0x26/0x40 RIP: 0010:_raw_spin_unlock_irq+0x2e/0x50 Code: 0f 1f 44 00 00 53 48 8b 74 24 08 48 89 fb 48 83 c7 18 e8 b5 73 5e fd 48 89 df e8 ed e2 5e fd e8 08 78 8f fd fb bf 01 00 00 00 8d 56 4d fd 65 8b 05 46 d5 1d 03 85 c0 74 06 5b c3 cc cc cc cc RSP: 0018:ffa0000027d07538 EFLAGS: 00000206 RAX: 0000000000000c2d RBX: ffffffffb6614bc8 RCX: 0000000000000080 RDX: 0000000000000000 RSI: ffffffffb6306a01 RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: ffffffffb75efc67 R11: 0000000000000001 R12: ff1100015ada0000 R13: 0000000000000083 R14: 0000000000000002 R15: ffffffffb6614c10 folio_wait_bit_common+0x407/0x780 filemap_update_page+0x8e7/0xbd0 filemap_get_pages+0x904/0xc50 filemap_read+0x320/0xc20 xfs_file_buffered_read+0x2aa/0x380 [xfs] xfs_file_read_iter+0x263/0x4a0 [xfs] vfs_read+0x6cb/0xb70 ksys_read+0xf9/0x1d0 do_syscall_64+0x13a/0x1520 Zorro's diagnosis makes sense, so the solution is to kick the failed read handling to a workqueue much like we added for writeback ioends in commit 294f54f849d846 ("fserror: fix lockdep complaint when igrabbing inode"). Cc: Zorro Lang Link: https://lore.kernel.org/linux-xfs/20260319194303.efw4wcu7c4idhthz@doltdoltdolt/ Fixes: a9d573ee88af98 ("iomap: report file I/O errors to the VFS") Signed-off-by: "Darrick J. 
Wong" Link: https://patch.msgid.link/20260323210017.GL6223@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index fc045f2e4c45..edd908183058 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -8,7 +8,10 @@ #include "internal.h" #include "trace.h" -static void iomap_read_end_io(struct bio *bio) +static DEFINE_SPINLOCK(failed_read_lock); +static struct bio_list failed_read_list = BIO_EMPTY_LIST; + +static void __iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; @@ -18,6 +21,52 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } +static void +iomap_fail_reads( + struct work_struct *work) +{ + struct bio *bio; + struct bio_list tmp = BIO_EMPTY_LIST; + unsigned long flags; + + spin_lock_irqsave(&failed_read_lock, flags); + bio_list_merge_init(&tmp, &failed_read_list); + spin_unlock_irqrestore(&failed_read_lock, flags); + + while ((bio = bio_list_pop(&tmp)) != NULL) { + __iomap_read_end_io(bio); + cond_resched(); + } +} + +static DECLARE_WORK(failed_read_work, iomap_fail_reads); + +static void iomap_fail_buffered_read(struct bio *bio) +{ + unsigned long flags; + + /* + * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions + * in the fserror code. The caller no longer owns the bio reference + * after the spinlock drops. + */ + spin_lock_irqsave(&failed_read_lock, flags); + if (bio_list_empty(&failed_read_list)) + WARN_ON_ONCE(!schedule_work(&failed_read_work)); + bio_list_add(&failed_read_list, bio); + spin_unlock_irqrestore(&failed_read_lock, flags); +} + +static void iomap_read_end_io(struct bio *bio) +{ + if (bio->bi_status) { + iomap_fail_buffered_read(bio); + return; + } + + __iomap_read_end_io(bio); +} + static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; -- cgit v1.2.3 From 53a7c171e9dd833f0a96b545adcb89bd57387239 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 8 Mar 2026 12:02:21 +0100 Subject: ovl: fix wrong detection of 32bit inode numbers The implicit FILEID_INO32_GEN encoder was changed to be explicit, so we need to fix the detection. When mounting overlayfs with upperdir and lowerdir on different ext4 filesystems, the expected kmsg log is: overlayfs: "xino" feature enabled using 32 upper inode bits. But instead, since the regressing commit, the kmsg log was: overlayfs: "xino" feature enabled using 2 upper inode bits. Fixes: e21fc2038c1b9 ("exportfs: make ->encode_fh() a mandatory method for NFS export") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Amir Goldstein --- fs/overlayfs/util.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 3f1b763a8bb4..2ea769f311c3 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -85,7 +85,10 @@ int ovl_can_decode_fh(struct super_block *sb) if (!exportfs_can_decode_fh(sb->s_export_op)) return 0; - return sb->s_export_op->encode_fh ? 
-1 : FILEID_INO32_GEN; + if (sb->s_export_op->encode_fh == generic_encode_ino32_fh) + return FILEID_INO32_GEN; + + return -1; } struct dentry *ovl_indexdir(struct super_block *sb) -- cgit v1.2.3 From beef2634f81f1c086208191f7228bce1d366493d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 19 Mar 2026 21:00:02 +0900 Subject: ksmbd: fix potencial OOB in get_file_all_info() for compound requests When a compound request consists of QUERY_DIRECTORY + QUERY_INFO (FILE_ALL_INFORMATION) and the first command consumes nearly the entire max_trans_size, get_file_all_info() would blindly call smbConvertToUTF16() with PATH_MAX, causing out-of-bounds write beyond the response buffer. In get_file_all_info(), there was a missing validation check for the client-provided OutputBufferLength before copying the filename into FileName field of the smb2_file_all_info structure. If the filename length exceeds the available buffer space, it could lead to potential buffer overflows or memory corruption during smbConvertToUTF16 conversion. This calculating the actual free buffer size using smb2_calc_max_out_buf_len() and returning -EINVAL if the buffer is insufficient and updating smbConvertToUTF16 to use the actual filename length (clamped by PATH_MAX) to ensure a safe copy operation. Cc: stable@vger.kernel.org Fixes: e2b76ab8b5c9 ("ksmbd: add support for read compound") Reported-by: Asim Viladi Oglu Manizada Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f5f1bf5f642e..6fb7a795ff5d 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4940,7 +4940,8 @@ static int get_file_all_info(struct ksmbd_work *work, int conv_len; char *filename; u64 time; - int ret; + int ret, buf_free_len, filename_len; + struct smb2_query_info_req *req = ksmbd_req_buf_next(work); if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) { ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n", @@ -4952,6 +4953,16 @@ static int get_file_all_info(struct ksmbd_work *work, if (IS_ERR(filename)) return PTR_ERR(filename); + filename_len = strlen(filename); + buf_free_len = smb2_calc_max_out_buf_len(work, + offsetof(struct smb2_query_info_rsp, Buffer) + + offsetof(struct smb2_file_all_info, FileName), + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < (filename_len + 1) * 2) { + kfree(filename); + return -EINVAL; + } + ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (ret) { @@ -4995,7 +5006,8 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->Mode = fp->coption; file_info->AlignmentRequirement = 0; conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename, - PATH_MAX, conn->local_nls, 0); + min(filename_len, PATH_MAX), + conn->local_nls, 0); conv_len *= 2; file_info->FileNameLength = cpu_to_le32(conv_len); rsp->OutputBufferLength = -- cgit v1.2.3 From 70685c291ef82269180758130394ecdc4496b52c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 Mar 2026 14:01:57 -0700 Subject: xfs: don't irele after failing to iget in xfs_attri_recover_work xlog_recovery_iget* never set @ip to a valid pointer if they return an error, so this irele will walk off a dangling pointer. Fix that. Cc: stable@vger.kernel.org # v6.10 Fixes: ae673f534a3097 ("xfs: record inode generation in xattr update log intent items") Signed-off-by: Darrick J. 
Wong Reviewed-by: Long Li Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_attr_item.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 82324f42537b..deab14f31b38 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -653,7 +653,6 @@ xfs_attri_recover_work( break; } if (error) { - xfs_irele(ip); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp, sizeof(*attrp)); return ERR_PTR(-EFSCORRUPTED); -- cgit v1.2.3 From e31c53a8060e134111ed095783fee0aa0c43b080 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 Mar 2026 14:04:33 -0700 Subject: xfs: remove file_path tracepoint data The xfile/xmbuf shmem file descriptions are no longer as detailed as they were when online fsck was first merged, because moving to static strings in commit 60382993a2e180 ("xfs: get rid of the xchk_xfile_*_descr calls") removed a memory allocation and hence a source of failure. However this makes encoding the description in the tracepoints sort of a waste of memory. David Laight also points out that file_path doesn't zero the whole buffer which causes exposure of stale trace bytes, and Steven Rostedt wonders why we're not using a dynamic array for the file path. I don't think this is worth fixing, so let's just rip it out. Cc: rostedt@goodmis.org Cc: david.laight.linux@gmail.com Link: https://lore.kernel.org/linux-xfs/20260323172204.work.979-kees@kernel.org/ Cc: stable@vger.kernel.org # v6.11 Fixes: 19ebc8f84ea12e ("xfs: fix file_path handling in tracepoints") Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/trace.h | 12 ++---------- fs/xfs/xfs_trace.h | 11 ++--------- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 39ea651cbb75..286c5f5e0544 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -972,20 +972,12 @@ TRACE_EVENT(xfile_create, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) - __array(char, pathname, MAXNAMELEN) ), TP_fast_assign( - char *path; - __entry->ino = file_inode(xf->file)->i_ino; - path = file_path(xf->file, __entry->pathname, MAXNAMELEN); - if (IS_ERR(path)) - strncpy(__entry->pathname, "(unknown)", - sizeof(__entry->pathname)); ), - TP_printk("xfino 0x%lx path '%s'", - __entry->ino, - __entry->pathname) + TP_printk("xfino 0x%lx", + __entry->ino) ); TRACE_EVENT(xfile_destroy, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0e994b3f768f..5e8190fe2be9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -5119,23 +5119,16 @@ TRACE_EVENT(xmbuf_create, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, ino) - __array(char, pathname, MAXNAMELEN) ), TP_fast_assign( - char *path; struct file *file = btp->bt_file; __entry->dev = btp->bt_mount->m_super->s_dev; __entry->ino = file_inode(file)->i_ino; - path = file_path(file, __entry->pathname, MAXNAMELEN); - if (IS_ERR(path)) - strncpy(__entry->pathname, "(unknown)", - sizeof(__entry->pathname)); ), - TP_printk("dev %d:%d xmino 0x%lx path '%s'", + TP_printk("dev %d:%d xmino 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pathname) + __entry->ino) ); TRACE_EVENT(xmbuf_free, -- cgit v1.2.3 From 0e764b9d46071668969410ec5429be0e2f38c6d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Mar 2026 08:20:17 +0000 Subject: netfs: Fix the 
handling of stream->front by removing it The netfs_io_stream::front member is meant to point to the subrequest currently being collected on a stream, but it isn't actually used this way by direct write (which mostly ignores it). However, there's a tracepoint which looks at it. Further, stream->front is actually redundant with stream->subrequests.next. Fix the potential problem in the direct code by just removing the member and using stream->subrequests.next instead, thereby also simplifying the code. Fixes: a0b4c7a49137 ("netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence") Reported-by: Paulo Alcantara Signed-off-by: David Howells Link: https://patch.msgid.link/4158599.1774426817@warthog.procyon.org.uk Reviewed-by: Paulo Alcantara (Red Hat) cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 3 +-- fs/netfs/direct_read.c | 3 +-- fs/netfs/direct_write.c | 1 - fs/netfs/read_collect.c | 4 ++-- fs/netfs/read_single.c | 1 - fs/netfs/write_collect.c | 4 ++-- fs/netfs/write_issue.c | 3 +-- include/linux/netfs.h | 1 - include/trace/events/netfs.h | 8 ++++---- 9 files changed, 11 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 88a0d801525f..a8c0d86118c5 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -171,9 +171,8 @@ static void netfs_queue_read(struct netfs_io_request *rreq, spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a498ee8d6674..f72e6da88cca 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -71,9 +71,8 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 4d9760e36c11..f9ab69de3e29 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -111,7 +111,6 @@ static int netfs_unbuffered_write(struct netfs_io_request *wreq) netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred); subreq = stream->construct; stream->construct = NULL; - stream->front = NULL; } /* Check if (re-)preparation failed. */ diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 137f0e28a44c..e5f6665b3341 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -205,7 +205,8 @@ reassess: * in progress. The issuer thread may be adding stuff to the tail * whilst we're doing this. 
*/ - front = READ_ONCE(stream->front); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); while (front) { size_t transferred; @@ -301,7 +302,6 @@ reassess: list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); - stream->front = front; spin_unlock(&rreq->lock); netfs_put_subrequest(remove, notes & ABANDON_SREQ ? diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 8e6264f62a8f..d0e23bc42445 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -107,7 +107,6 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) spin_lock(&rreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); trace_netfs_sreq(subreq, netfs_sreq_trace_added); - stream->front = subreq; /* Store list pointers before active flag */ smp_store_release(&stream->active, true); spin_unlock(&rreq->lock); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 83eb3dc1adf8..b194447f4b11 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -228,7 +228,8 @@ reassess_streams: if (!smp_load_acquire(&stream->active)) continue; - front = stream->front; + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); while (front) { trace_netfs_collect_sreq(wreq, front); //_debug("sreq [%x] %llx %zx/%zx", @@ -279,7 +280,6 @@ reassess_streams: list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); - stream->front = front; spin_unlock(&wreq->lock); netfs_put_subrequest(remove, notes & SAW_FAILURE ? diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 437268f65640..2db688f94125 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -206,9 +206,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq, spin_lock(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { - stream->front = subreq; if (!stream->active) { - stream->collected_to = stream->front->start; + stream->collected_to = subreq->start; /* Write list pointers before active flag */ smp_store_release(&stream->active, true); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 72ee7d210a74..ba17ac5bf356 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -140,7 +140,6 @@ struct netfs_io_stream { void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ struct list_head subrequests; /* Contributory I/O operations */ - struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ unsigned short error; /* Aggregate error for the stream */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 2d366be46a1c..cbe28211106c 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -740,19 +740,19 @@ TRACE_EVENT(netfs_collect_stream, __field(unsigned int, wreq) __field(unsigned char, stream) __field(unsigned long long, collected_to) - __field(unsigned long long, front) + __field(unsigned long long, issued_to) ), TP_fast_assign( __entry->wreq = wreq->debug_id; __entry->stream = stream->stream_nr; __entry->collected_to = stream->collected_to; - __entry->front = stream->front ? 
stream->front->start : UINT_MAX; + __entry->issued_to = atomic64_read(&wreq->issued_to); ), - TP_printk("R=%08x[%x:] cto=%llx frn=%llx", + TP_printk("R=%08x[%x:] cto=%llx ito=%llx", __entry->wreq, __entry->stream, - __entry->collected_to, __entry->front) + __entry->collected_to, __entry->issued_to) ); TRACE_EVENT(netfs_folioq, -- cgit v1.2.3 From 1f6ee9be92f8df85a8c9a5a78c20fd39c0c21a95 Mon Sep 17 00:00:00 2001 From: Fei Lv Date: Mon, 22 Jul 2024 18:14:43 +0800 Subject: ovl: make fsync after metadata copy-up opt-in mount option Commit 7d6899fb69d25 ("ovl: fsync after metadata copy-up") was done to fix durability of overlayfs copy up on an upper filesystem which does not enforce ordering on storing of metadata changes (e.g. ubifs). In an earlier revision of the regressing commit by Lei Lv, the metadata fsync behavior was opt-in via a new "fsync=strict" mount option. We were hoping that the opt-in mount option could be avoided, so the change was only made to depend on metacopy=off, in the hope of not hurting performance of metadata heavy workloads, which are more likely to be using metacopy=on. This hope was proven wrong by a performance regression report from Google COS workload after upgrade to kernel 6.12. This is an adaptation of Lei's original "fsync=strict" mount option to the existing upstream code. The new mount option is mutually exclusive with the "volatile" mount option, so the latter is now an alias to the "fsync=volatile" mount option. Reported-by: Chenglong Tang Closes: https://lore.kernel.org/linux-unionfs/CAOdxtTadAFH01Vui1FvWfcmQ8jH1O45owTzUcpYbNvBxnLeM7Q@mail.gmail.com/ Link: https://lore.kernel.org/linux-unionfs/CAOQ4uxgKC1SgjMWre=fUb00v8rxtd6sQi-S+dxR8oDzAuiGu8g@mail.gmail.com/ Fixes: 7d6899fb69d25 ("ovl: fsync after metadata copy-up") Depends: 50e638beb67e0 ("ovl: Use str_on_off() helper in ovl_show_options()") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Fei Lv Signed-off-by: Amir Goldstein --- Documentation/filesystems/overlayfs.rst | 50 +++++++++++++++++++++++++++++++++ fs/overlayfs/copy_up.c | 6 ++-- fs/overlayfs/overlayfs.h | 21 ++++++++++++++ fs/overlayfs/ovl_entry.h | 7 +---- fs/overlayfs/params.c | 33 ++++++++++++++++++---- fs/overlayfs/super.c | 2 +- 6 files changed, 104 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index af5a69f87da4..eb846518e6ac 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -783,6 +783,56 @@ controlled by the "uuid" mount option, which supports these values: mounted with "uuid=on". +Durability and copy up +---------------------- + +The fsync(2) system call ensures that the data and metadata of a file +are safely written to the backing storage, which is expected to +guarantee the existence of the information post system crash. + +Without an fsync(2) call, there is no guarantee that the observed +data after a system crash will be either the old or the new data, but +in practice, the observed data after crash is often the old or new data +or a mix of both. + +When an overlayfs file is modified for the first time, copy up will +create a copy of the lower file and its parent directories in the upper +layer. Since the Linux filesystem API does not enforce any particular +ordering on storing changes without explicit fsync(2) calls, in case +of a system crash, the upper file could end up with no data at all +(i.e. zeros), which would be an unusual outcome. 
To avoid this +experience, overlayfs calls fsync(2) on the upper file before completing +data copy up with rename(2) or link(2) to make the copy up "atomic". + +By default, overlayfs does not explicitly call fsync(2) on copied up +directories or on metadata-only copy up, so it provides no guarantee to +persist the user's modification unless the user calls fsync(2). +The fsync during copy up only guarantees that if a copy up is observed +after a crash, the observed data is not zeroes or intermediate values +from the copy up staging area. + +On traditional local filesystems with a single journal (e.g. ext4, xfs), +fsync on a file also persists the parent directory changes, because they +are usually modified in the same transaction, so metadata durability during +data copy up effectively comes for free. Overlayfs further limits risk by +disallowing network filesystems as upper layer. + +Overlayfs can be tuned to prefer performance or durability when storing +to the underlying upper layer. This is controlled by the "fsync" mount +option, which supports these values: + +- "auto": (default) + Call fsync(2) on upper file before completion of data copy up. + No explicit fsync(2) on directory or metadata-only copy up. +- "strict": + Call fsync(2) on upper file and directories before completion of any + copy up. +- "volatile": [*] + Prefer performance over durability (see `Volatile mount`_) + +[*] The mount option "volatile" is an alias to "fsync=volatile". + + Volatile mount -------------- diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 758611ee4475..13cb60b52bd6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -1146,15 +1146,15 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return -EOVERFLOW; /* - * With metacopy disabled, we fsync after final metadata copyup, for + * With "fsync=strict", we fsync after final metadata copyup, for * both regular files and directories to get atomic copyup semantics * on filesystems that do not use strict metadata ordering (e.g. ubifs). * - * With metacopy enabled we want to avoid fsync on all meta copyup + * By default, we want to avoid fsync on all meta copyup, because * that will hurt performance of workloads such as chown -R, so we * only fsync on data copyup as legacy behavior. 
*/ - ctx.metadata_fsync = !OVL_FS(dentry->d_sb)->config.metacopy && + ctx.metadata_fsync = ovl_should_sync_metadata(OVL_FS(dentry->d_sb)) && (S_ISREG(ctx.stat.mode) || S_ISDIR(ctx.stat.mode)); ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index cad2055ebf18..63b299bf12f7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -99,6 +99,12 @@ enum { OVL_VERITY_REQUIRE, }; +enum { + OVL_FSYNC_VOLATILE, + OVL_FSYNC_AUTO, + OVL_FSYNC_STRICT, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -656,6 +662,21 @@ static inline bool ovl_xino_warn(struct ovl_fs *ofs) return ofs->config.xino == OVL_XINO_ON; } +static inline bool ovl_should_sync(struct ovl_fs *ofs) +{ + return ofs->config.fsync_mode != OVL_FSYNC_VOLATILE; +} + +static inline bool ovl_should_sync_metadata(struct ovl_fs *ofs) +{ + return ofs->config.fsync_mode == OVL_FSYNC_STRICT; +} + +static inline bool ovl_is_volatile(struct ovl_config *config) +{ + return config->fsync_mode == OVL_FSYNC_VOLATILE; +} + /* * To avoid regressions in existing setups with overlay lower offline changes, * we allow lower changes only if none of the new features are used. diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 1d4828dbcf7a..80cad4ea96a3 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -18,7 +18,7 @@ struct ovl_config { int xino; bool metacopy; bool userxattr; - bool ovl_volatile; + int fsync_mode; }; struct ovl_sb { @@ -120,11 +120,6 @@ static inline struct ovl_fs *OVL_FS(struct super_block *sb) return (struct ovl_fs *)sb->s_fs_info; } -static inline bool ovl_should_sync(struct ovl_fs *ofs) -{ - return !ofs->config.ovl_volatile; -} - static inline unsigned int ovl_numlower(struct ovl_entry *oe) { return oe ? 
oe->__numlower : 0; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 8111b437ae5d..c93fcaa45d4a 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -58,6 +58,7 @@ enum ovl_opt { Opt_xino, Opt_metacopy, Opt_verity, + Opt_fsync, Opt_volatile, Opt_override_creds, }; @@ -140,6 +141,23 @@ static int ovl_verity_mode_def(void) return OVL_VERITY_OFF; } +static const struct constant_table ovl_parameter_fsync[] = { + { "volatile", OVL_FSYNC_VOLATILE }, + { "auto", OVL_FSYNC_AUTO }, + { "strict", OVL_FSYNC_STRICT }, + {} +}; + +static const char *ovl_fsync_mode(struct ovl_config *config) +{ + return ovl_parameter_fsync[config->fsync_mode].name; +} + +static int ovl_fsync_mode_def(void) +{ + return OVL_FSYNC_AUTO; +} + const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), fsparam_file_or_string("lowerdir+", Opt_lowerdir_add), @@ -155,6 +173,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_enum("xino", Opt_xino, ovl_parameter_xino), fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), + fsparam_enum("fsync", Opt_fsync, ovl_parameter_fsync), fsparam_flag("volatile", Opt_volatile), fsparam_flag_no("override_creds", Opt_override_creds), {} @@ -665,8 +684,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_verity: config->verity_mode = result.uint_32; break; + case Opt_fsync: + config->fsync_mode = result.uint_32; + break; case Opt_volatile: - config->ovl_volatile = true; + config->fsync_mode = OVL_FSYNC_VOLATILE; break; case Opt_userxattr: config->userxattr = true; @@ -800,6 +822,7 @@ int ovl_init_fs_context(struct fs_context *fc) ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; + ofs->config.fsync_mode = ovl_fsync_mode_def(); fc->s_fs_info = ofs; fc->fs_private = ctx; @@ -870,9 +893,9 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->index = false; } - if (!config->upperdir && config->ovl_volatile) { + if (!config->upperdir && ovl_is_volatile(config)) { pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); - config->ovl_volatile = false; + config->fsync_mode = ovl_fsync_mode_def(); } if (!config->upperdir && config->uuid == OVL_UUID_ON) { @@ -1070,8 +1093,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy)); - if (ofs->config.ovl_volatile) - seq_puts(m, ",volatile"); + if (ofs->config.fsync_mode != ovl_fsync_mode_def()) + seq_printf(m, ",fsync=%s", ovl_fsync_mode(&ofs->config)); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); if (ofs->config.verity_mode != ovl_verity_mode_def()) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d4c12feec039..0822987cfb51 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -776,7 +776,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, * For volatile mount, create a incompat/volatile/dirty file to keep * track of it. 
*/ - if (ofs->config.ovl_volatile) { + if (ovl_is_volatile(&ofs->config)) { err = ovl_create_volatile_dirty(ofs); if (err < 0) { pr_err("Failed to create volatile/dirty file.\n"); -- cgit v1.2.3 From fffca572f9ca51607f180a37d0c898404c8f9112 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 15:06:31 +0100 Subject: mpage: Provide variant of mpage_writepages() with own optional folio handler Some filesystems need to treat some folios specially (for example for inodes with inline data). Doing the handling in their .writepages method in a race-free manner results in duplicating some of the writeback internals. So provide generalized version of mpage_writepages() that allows filesystem to provide a handler called for each folio which can handle the folio in a special way. Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260326140635.15895-3-jack@suse.cz Signed-off-by: Jan Kara --- fs/mpage.c | 30 ++++++++++++++++++++++++------ include/linux/mpage.h | 11 +++++++++-- 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index 7dae5afc2b9e..b3d9f231a04a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -646,17 +646,24 @@ out: } /** - * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them + * __mpage_writepages - walk the list of dirty pages of the given address space + * & writepage() all of them * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. + * @write_folio: handler to call for each folio before calling + * mpage_write_folio() * * This is a library function, which implements the writepages() - * address_space_operation. + * address_space_operation. It calls @write_folio handler for each folio. If + * the handler returns value > 0, it calls mpage_write_folio() to do the + * folio writeback. */ int -mpage_writepages(struct address_space *mapping, - struct writeback_control *wbc, get_block_t get_block) +__mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block, + int (*write_folio)(struct folio *folio, + struct writeback_control *wbc)) { struct mpage_data mpd = { .get_block = get_block, @@ -666,11 +673,22 @@ mpage_writepages(struct address_space *mapping, int error; blk_start_plug(&plug); - while ((folio = writeback_iter(mapping, wbc, folio, &error))) + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + if (write_folio) { + error = write_folio(folio, wbc); + /* + * == 0 means folio is handled, < 0 means error. In + * both cases hand back control to writeback_iter() + */ + if (error <= 0) + continue; + /* Let mpage_write_folio() handle the folio. 
*/ + } error = mpage_write_folio(wbc, folio, &mpd); + } if (mpd.bio) mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return error; } -EXPORT_SYMBOL(mpage_writepages); +EXPORT_SYMBOL(__mpage_writepages); diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 1bdc39daac0a..358946990bfa 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -17,7 +17,14 @@ struct readahead_control; void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_read_folio(struct folio *folio, get_block_t get_block); -int mpage_writepages(struct address_space *mapping, - struct writeback_control *wbc, get_block_t get_block); +int __mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block, + int (*write_folio)(struct folio *folio, + struct writeback_control *wbc)); +static inline int mpage_writepages(struct address_space *mapping, + struct writeback_control *wbc, get_block_t get_block) +{ + return __mpage_writepages(mapping, wbc, get_block, NULL); +} #endif -- cgit v1.2.3 From 102e57d56f81fa5c5ed78f576101d1bf1b3e6fe2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Mar 2026 15:06:32 +0100 Subject: udf: Fix race between file type conversion and writeback udf_setsize() can race with udf_writepages() as follows: udf_setsize() udf_writepages() if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) err = udf_expand_file_adinicb(inode); err = udf_extend_file(inode, newsize); udf_adinicb_writepages() memcpy_from_file_folio() - crash because inode size is too big. Fix the problem by checking the file type under folio lock in udf_handle_page_wb() handler called from __mpage_writepages() which properly serializes with udf_expand_file_adinicb(). Reported-by: Jianzhou Zhao Link: https://lore.kernel.org/all/f622c01.67ac.19cdbdd777d.Coremail.luckd0g@163.com Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260326140635.15895-4-jack@suse.cz Signed-off-by: Jan Kara --- fs/udf/inode.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7fae8002344a..23e894092dab 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -181,22 +181,23 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_adinicb_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int udf_handle_page_wb(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); - struct folio *folio = NULL; - int error = 0; - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - BUG_ON(!folio_test_locked(folio)); - BUG_ON(folio->index != 0); - memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, - 0, i_size_read(inode)); - folio_unlock(folio); - } + /* + * Inodes in the normal format are handled by the generic code. This + * check is race-free as the folio lock protects us from inode type + * conversion. 
+ */ + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) + return 1; + memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, + 0, i_size_read(inode)); + folio_unlock(folio); mark_inode_dirty(inode); return 0; } @@ -204,12 +205,8 @@ static int udf_adinicb_writepages(struct address_space *mapping, static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - struct udf_inode_info *iinfo = UDF_I(inode); - - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - return udf_adinicb_writepages(mapping, wbc); - return mpage_writepages(mapping, wbc, udf_get_block_wb); + return __mpage_writepages(mapping, wbc, udf_get_block_wb, + udf_handle_page_wb); } static void udf_adinicb_read_folio(struct folio *folio) -- cgit v1.2.3 From 84e21e3fb8fd99ea460eb7274584750d11cf3e9f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 31 Jan 2026 17:11:56 +0800 Subject: ext4: do not check fast symlink during orphan recovery Commit '5f920d5d6083 ("ext4: verify fast symlink length")' causes the generic/475 test to fail during orphan cleanup of zero-length symlinks. generic/475 84s ... _check_generic_filesystem: filesystem on /dev/vde is inconsistent The fsck reports are provided below: Deleted inode 9686 has zero dtime. Deleted inode 158230 has zero dtime. ... Inode bitmap differences: -9686 -158230 Orphan file (inode 12) block 13 is not clean. Failed to initialize orphan file. In ext4_symlink(), a newly created symlink can be added to the orphan list due to ENOSPC. Its data has not been initialized, and its size is zero. Therefore, we need to disregard the length check of the symbolic link when cleaning up orphan inodes. Instead, we should ensure that the nlink count is zero. Fixes: 5f920d5d6083 ("ext4: verify fast symlink length") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260131091156.1733648-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..af6d1759c8de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5401,18 +5401,36 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - if (inode->i_size == 0 || - inode->i_size >= sizeof(ei->i_data) || - strnlen((char *)ei->i_data, inode->i_size + 1) != - inode->i_size) { - ext4_error_inode(inode, function, line, 0, - "invalid fast symlink length %llu", - (unsigned long long)inode->i_size); - ret = -EFSCORRUPTED; - goto bad_inode; + + /* + * Orphan cleanup can see inodes with i_size == 0 + * and i_data uninitialized. Skip size checks in + * that case. This is safe because the first thing + * ext4_evict_inode() does for fast symlinks is + * clearing of i_data and i_size. 
+ */ + if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if (inode->i_nlink != 0) { + ext4_error_inode(inode, function, line, 0, + "invalid orphan symlink nlink %d", + inode->i_nlink); + ret = -EFSCORRUPTED; + goto bad_inode; + } + } else { + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } - inode_set_cached_link(inode, (char *)ei->i_data, - inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } -- cgit v1.2.3 From f4a2b42e78914ff15630e71289adc589c3a8eb45 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 5 Feb 2026 10:22:24 +0100 Subject: ext4: fix stale xarray tags after writeback There are cases where ext4_bio_write_page() gets called for a page which has no buffers to submit. This happens e.g. when the part of the file is actually a hole, when we cannot allocate blocks due to being called from jbd2, or in data=journal mode when checkpointing writes the buffers earlier. In these cases we just return from ext4_bio_write_page() however if the page didn't need redirtying, we will leave stale DIRTY and/or TOWRITE tags in xarray because those get cleared only in __folio_start_writeback(). As a result we can leave these tags set in mappings even after a final sync on filesystem that's getting remounted read-only or that's being frozen. Various assertions can then get upset when writeback is started on such filesystems (Gerald reported assertion in ext4_journal_check_start() firing). Fix the problem by cycling the page through writeback state even if we decide nothing needs to be written for it so that xarray tags get properly updated. This is slightly silly (we could update the xarray tags directly) but I don't think a special helper messing with xarray tags is really worth it in this relatively rare corner case. Reported-by: Gerald Yang Link: https://lore.kernel.org/all/20260128074515.2028982-1-gerald.yang@canonical.com Fixes: dff4ac75eeee ("ext4: move keep_towrite handling to ext4_bio_write_page()") Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260205092223.21287-2-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/page-io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a8c95eee91b7..39fe50b3c662 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -524,9 +524,15 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Nothing to submit? Just unlock the folio... */ - if (!nr_to_submit) + if (!nr_to_submit) { + /* + * We have nothing to submit. Just cycle the folio through + * writeback state to properly update xarray tags. + */ + __folio_start_writeback(folio, keep_towrite); + folio_end_writeback(folio); return 0; + } bh = head = folio_buffers(folio); -- cgit v1.2.3 From ed9356a30e59c7cc3198e7fc46cfedf3767b9b17 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sat, 7 Feb 2026 10:06:07 +0530 Subject: ext4: convert inline data to extents when truncate exceeds inline size Add a check in ext4_setattr() to convert files from inline data storage to extent-based storage when truncate() grows the file size beyond the inline capacity. 
This prevents the filesystem from entering an inconsistent state where the inline data flag is set but the file size exceeds what can be stored inline. Without this fix, the following sequence causes a kernel BUG_ON(): 1. Mount filesystem with inode that has inline flag set and small size 2. truncate(file, 50MB) - grows size but inline flag remains set 3. sendfile() attempts to write data 4. ext4_write_inline_data() hits BUG_ON(write_size > inline_capacity) The crash occurs because ext4_write_inline_data() expects inline storage to accommodate the write, but the actual inline capacity (~60 bytes for i_block + ~96 bytes for xattrs) is far smaller than the file size and write request. The fix checks if the new size from setattr exceeds the inode's actual inline capacity (EXT4_I(inode)->i_inline_size) and converts the file to extent-based storage before proceeding with the size change. This addresses the root cause by ensuring the inline data flag and file size remain consistent during truncate operations. Reported-by: syzbot+7de5fe447862fc37576f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7de5fe447862fc37576f Tested-by: syzbot+7de5fe447862fc37576f@syzkaller.appspotmail.com Signed-off-by: Deepanshu Kartikey Link: https://patch.msgid.link/20260207043607.1175976-1-kartikey406@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index af6d1759c8de..252191632f56 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5867,6 +5867,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (attr->ia_size == inode->i_size) inc_ivers = false; + /* + * If file has inline data but new size exceeds inline capacity, + * convert to extent-based storage first to prevent inconsistent + * state (inline flag set but size exceeds inline capacity). + */ + if (ext4_has_inline_data(inode) && + attr->ia_size > EXT4_I(inode)->i_inline_size) { + error = ext4_convert_inline_data(inode); + if (error) + goto err_out; + } + if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, -- cgit v1.2.3 From b1d682f1990c19fb1d5b97d13266210457092bcd Mon Sep 17 00:00:00 2001 From: Simon Weber Date: Sat, 7 Feb 2026 10:53:03 +0100 Subject: ext4: fix journal credit check when setting fscrypt context Fix an issue arising when ext4 features has_journal, ea_inode, and encrypt are activated simultaneously, leading to ENOSPC when creating an encrypted file. Fix by passing XATTR_CREATE flag to xattr_set_handle function if a handle is specified, i.e., when the function is called in the control flow of creating a new inode. This aligns the number of jbd2 credits set_handle checks for with the number allocated for creating a new inode. ext4_set_context must not be called with a non-null handle (fs_data) if fscrypt context xattr is not guaranteed to not exist yet. The only other usage of this function currently is when handling the ioctl FS_IOC_SET_ENCRYPTION_POLICY, which calls it with fs_data=NULL. 
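For illustration, XATTR_CREATE is the standard "must not already exist" flag: a second attempt to create the same attribute fails with EEXIST. A freshly created inode gives exactly that guarantee for its encryption context xattr, which is why passing the flag is safe only on this path. A minimal userspace sketch of the flag's semantics (illustrative only, not part of this patch; assumes /tmp/xattr-demo is a pre-existing file on a filesystem with user xattr support):

```
/* XATTR_CREATE semantics from userspace: the second create of the same
 * attribute is rejected with EEXIST. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/tmp/xattr-demo";   /* pre-existing file */
        const char *val = "v1";

        if (setxattr(path, "user.demo", val, strlen(val), XATTR_CREATE))
                perror("first setxattr");
        if (setxattr(path, "user.demo", val, strlen(val), XATTR_CREATE))
                fprintf(stderr, "second setxattr: %s (EEXIST expected)\n",
                        strerror(errno));
        return 0;
}
```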
Fixes: c1a5d5f6ab21eb7e ("ext4: improve journal credit handling in set xattr paths") Co-developed-by: Anthony Durrer Signed-off-by: Anthony Durrer Signed-off-by: Simon Weber Reviewed-by: Eric Biggers Link: https://patch.msgid.link/20260207100148.724275-4-simon.weber.39@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/crypto.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index cf0a0970c095..f41f320f4437 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -163,10 +163,17 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, */ if (handle) { + /* + * Since the inode is new it is ok to pass the + * XATTR_CREATE flag. This is necessary to match the + * remaining journal credits check in the set_handle + * function with the credits allocated for the new + * inode. + */ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); + ctx, len, XATTR_CREATE); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, -- cgit v1.2.3 From bd060afa7cc3e0ad30afa9ecc544a78638498555 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 16 Feb 2026 17:48:43 +0100 Subject: ext4: make recently_deleted() properly work with lazy itable initialization recently_deleted() checks whether inode has been used in the near past. However this can give false positive result when inode table is not initialized yet and we are in fact comparing to random garbage (or stale itable block of a filesystem before mkfs). Ultimately this results in uninitialized inodes being skipped during inode allocation and possibly they are never initialized and thus e2fsck complains. Verify if the inode has been initialized before checking for dtime. Signed-off-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260216164848.3074-3-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ialloc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b20a1bf866ab..b1bc1950c9f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -686,6 +686,12 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (unlikely(!gdp)) return 0; + /* Inode was never used in this filesystem? */ + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT) || + ino >= EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp))) + return 0; + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); if (!bh || !buffer_uptodate(bh)) -- cgit v1.2.3 From 1308255bbf8452762f89f44f7447ce137ecdbcff Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 16 Feb 2026 17:48:44 +0100 Subject: ext4: fix fsync(2) for nojournal mode When inode metadata is changed, we sometimes just call ext4_mark_inode_dirty() to track modified metadata. This copies inode metadata into block buffer which is enough when we are journalling metadata. However when we are running in nojournal mode we currently fail to write the dirtied inode buffer during fsync(2) because the inode is not marked as dirty. Use explicit ext4_write_inode() call to make sure the inode table buffer is written to the disk. This is a band aid solution but proper solution requires a much larger rewrite including changes in metadata bh tracking infrastructure. 
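For illustration, the affected pattern is a metadata-only update followed by fsync(2) on an ext4 filesystem mounted without a journal. A hypothetical reproducer sketch (the actual report used a different workload): before this change the dirtied inode table buffer was not guaranteed to reach the disk, so the new mode could be lost across a crash even though fsync() returned success.

```
/* Hypothetical sketch: metadata-only change + fsync on a nojournal ext4.
 * fsync() must persist the new i_mode; before this fix the inode table
 * buffer holding it was not necessarily written out. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_CREAT | O_RDWR, 0600);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (fchmod(fd, 0400))           /* metadata-only change */
                perror("fchmod");
        if (fsync(fd))                  /* must make it durable */
                perror("fsync");
        close(fd);
        return 0;
}
```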
Reported-by: Free Ekanayaka Link: https://lore.kernel.org/all/87il8nhxdm.fsf@x1.mail-host-address-is-not-set/ CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20260216164848.3074-4-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fsync.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e476c6de3074..bd8f230fa507 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -83,11 +83,23 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, int datasync, bool *needs_barrier) { struct inode *inode = file->f_inode; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; int ret; ret = generic_buffers_fsync_noflush(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); + if (ret) + return ret; + + /* Force writeout of inode table buffer to disk */ + ret = ext4_write_inode(inode, &wbc); + if (ret) + return ret; + + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) *needs_barrier = true; -- cgit v1.2.3 From 356227096eb66e41b23caf7045e6304877322edf Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Mon, 23 Feb 2026 12:33:46 +0000 Subject: ext4: replace BUG_ON with proper error handling in ext4_read_inline_folio Replace BUG_ON() with proper error handling when inline data size exceeds PAGE_SIZE. This prevents kernel panic and allows the system to continue running while properly reporting the filesystem corruption. The error is logged via ext4_error_inode(), the buffer head is released to prevent memory leak, and -EFSCORRUPTED is returned to indicate filesystem corruption. Signed-off-by: Yuto Ohnuki Link: https://patch.msgid.link/20260223123345.14838-2-ytohnuki@amazon.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1f6bc05593df..408677fa8196 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -522,7 +522,15 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - BUG_ON(len > PAGE_SIZE); + + if (len > PAGE_SIZE) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "inline size %zu exceeds PAGE_SIZE", len); + ret = -EFSCORRUPTED; + brelse(iloc.bh); + goto out; + } + kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); kaddr = folio_zero_tail(folio, len, kaddr + len); -- cgit v1.2.3 From 1aec30021edd410b986c156f195f3d23959a9d11 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Wed, 25 Feb 2026 16:26:16 +0800 Subject: ext4: publish jinode after initialization ext4_inode_attach_jinode() publishes ei->jinode to concurrent users. It used to set ei->jinode before jbd2_journal_init_jbd_inode(), allowing a reader to observe a non-NULL jinode with i_vfs_inode still unset. The fast commit flush path can then pass this jinode to jbd2_wait_inode_data(), which dereferences i_vfs_inode->i_mapping and may crash. 
Below is the crash I observe: ``` BUG: unable to handle page fault for address: 000000010beb47f4 PGD 110e51067 P4D 110e51067 PUD 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 1 UID: 0 PID: 4850 Comm: fc_fsync_bench_ Not tainted 6.18.0-00764-g795a690c06a5 #1 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.17.0-2-2 04/01/2014 RIP: 0010:xas_find_marked+0x3d/0x2e0 Code: e0 03 48 83 f8 02 0f 84 f0 01 00 00 48 8b 47 08 48 89 c3 48 39 c6 0f 82 fd 01 00 00 48 85 c9 74 3d 48 83 f9 03 77 63 4c 8b 0f <49> 8b 71 08 48 c7 47 18 00 00 00 00 48 89 f1 83 e1 03 48 83 f9 02 RSP: 0018:ffffbbee806e7bf0 EFLAGS: 00010246 RAX: 000000000010beb4 RBX: 000000000010beb4 RCX: 0000000000000003 RDX: 0000000000000001 RSI: 0000002000300000 RDI: ffffbbee806e7c10 RBP: 0000000000000001 R08: 0000002000300000 R09: 000000010beb47ec R10: ffff9ea494590090 R11: 0000000000000000 R12: 0000002000300000 R13: ffffbbee806e7c90 R14: ffff9ea494513788 R15: ffffbbee806e7c88 FS: 00007fc2f9e3e6c0(0000) GS:ffff9ea6b1444000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000010beb47f4 CR3: 0000000119ac5000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: filemap_get_folios_tag+0x87/0x2a0 __filemap_fdatawait_range+0x5f/0xd0 ? srso_alias_return_thunk+0x5/0xfbef5 ? __schedule+0x3e7/0x10c0 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? preempt_count_sub+0x5f/0x80 ? srso_alias_return_thunk+0x5/0xfbef5 ? cap_safe_nice+0x37/0x70 ? srso_alias_return_thunk+0x5/0xfbef5 ? preempt_count_sub+0x5f/0x80 ? srso_alias_return_thunk+0x5/0xfbef5 filemap_fdatawait_range_keep_errors+0x12/0x40 ext4_fc_commit+0x697/0x8b0 ? ext4_file_write_iter+0x64b/0x950 ? srso_alias_return_thunk+0x5/0xfbef5 ? preempt_count_sub+0x5f/0x80 ? srso_alias_return_thunk+0x5/0xfbef5 ? vfs_write+0x356/0x480 ? srso_alias_return_thunk+0x5/0xfbef5 ? preempt_count_sub+0x5f/0x80 ext4_sync_file+0xf7/0x370 do_fsync+0x3b/0x80 ? syscall_trace_enter+0x108/0x1d0 __x64_sys_fdatasync+0x16/0x20 do_syscall_64+0x62/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ... ``` Fix this by initializing the jbd2_inode first. Use smp_wmb() and WRITE_ONCE() to publish ei->jinode after initialization. Readers use READ_ONCE() to fetch the pointer. 
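For illustration, the fix is an instance of the usual publish-after-initialization pattern: fully initialize the object, order the stores, then publish the pointer, so that any reader which observes the pointer also observes the initialized fields. A standalone sketch of the same idea using C11 release/acquire atomics (illustrative only; the patch itself uses smp_wmb()/WRITE_ONCE() on the writer side and READ_ONCE() on the reader side):

```
/* Publish-after-init: initialize, then publish with release ordering so a
 * consumer that observes the pointer also observes the initialized field. */
#include <stdatomic.h>
#include <stddef.h>

struct jnode {
        void *mapping;          /* stands in for i_vfs_inode->i_mapping */
};

static _Atomic(struct jnode *) published;

void publish(struct jnode *n, void *mapping)
{
        n->mapping = mapping;                           /* init first   */
        atomic_store_explicit(&published, n,
                              memory_order_release);    /* then publish */
}

void *consume(void)
{
        struct jnode *n = atomic_load_explicit(&published,
                                               memory_order_acquire);

        return n ? n->mapping : NULL;   /* non-NULL n => mapping visible */
}
```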
Fixes: a361293f5fede ("jbd2: Fix oops in jbd2_journal_file_inode()") Cc: stable@vger.kernel.org Signed-off-by: Li Chen Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260225082617.147957-1-me@linux.beauty Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 4 ++-- fs/ext4/inode.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f575751f1cae..6e949c21842d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -975,13 +975,13 @@ static int ext4_fc_flush_data(journal_t *journal) int ret = 0; list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ret = jbd2_submit_inode_data(journal, ei->jinode); + ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ret = jbd2_wait_inode_data(journal, ei->jinode); + ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode)); if (ret) return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 252191632f56..ac5f3446c731 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -128,6 +128,8 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode); + trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for @@ -135,10 +137,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ - if (!EXT4_I(inode)->jinode) + if (!jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), - EXT4_I(inode)->jinode, + jinode, new_size); } @@ -4451,8 +4453,13 @@ int ext4_inode_attach_jinode(struct inode *inode) spin_unlock(&inode->i_lock); return -ENOMEM; } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); + jbd2_journal_init_jbd_inode(jinode, inode); + /* + * Publish ->jinode only after it is fully initialized so that + * readers never observe a partially initialized jbd2_inode. + */ + smp_wmb(); + WRITE_ONCE(ei->jinode, jinode); jinode = NULL; } spin_unlock(&inode->i_lock); -- cgit v1.2.3 From afe376d2c1fa78c8a1063a357c6971bba3f6da91 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Sun, 1 Mar 2026 21:44:26 +0530 Subject: ext4: kunit: extents-test: lix percpu_counters list corruption commit 82f80e2e3b23 ("ext4: add extent status cache support to kunit tests"), added ext4_es_register_shrinker() in extents_kunit_init() function but failed to add the unregister shrinker routine in extents_kunit_exit(). This could cause the following percpu_counters list corruption bug. ok 1 split unwrit extent to 2 extents and convert 1st half writ slab kmalloc-4k start c0000002007ff000 pointer offset 1448 size 4096 list_add corruption. next->prev should be prev (c000000004bc9e60), but was 0000000000000000. (next=c0000002007ff5a8). ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:29! 
cpu 0x2: Vector: 700 (Program Check) at [c000000241927a30] pc: c000000000f26ed0: __list_add_valid_or_report+0x120/0x164 lr: c000000000f26ecc: __list_add_valid_or_report+0x11c/0x164 sp: c000000241927cd0 msr: 800000000282b033 current = 0xc000000241215200 paca = 0xc0000003fffff300 irqmask: 0x03 irq_happened: 0x09 pid = 258, comm = kunit_try_catch kernel BUG at lib/list_debug.c:29! enter ? for help __percpu_counter_init_many+0x148/0x184 ext4_es_register_shrinker+0x74/0x23c extents_kunit_init+0x100/0x308 kunit_try_run_case+0x78/0x1f8 kunit_generic_run_threadfn_adapter+0x40/0x70 kthread+0x190/0x1a0 start_kernel_thread+0x14/0x18 2:mon> This happens because: extents_kunit_init(test N): ext4_es_register_shrinker(sbi) percpu_counters_init() x 4; // this adds 4 list nodes to global percpu_counters list list_add(&fbc->list, &percpu_counters); shrinker_register(); extents_kunit_exit(test N): kfree(sbi); // frees sbi w/o removing those 4 list nodes. // So, those list node now becomes dangling pointers extents_kunit_init(test N+1): kzalloc_obj(ext4_sb_info) // allocator returns same page, but zeroed. ext4_es_register_shrinker(sbi) percpu_counters_init() list_add(&fbc->list, &percpu_counters); __list_add_valid(new, prev, next); next->prev != prev // list corruption bug detected, since next->prev = NULL Fixes: 82f80e2e3b23 ("ext4: add extent status cache support to kunit tests") Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Ojaswin Mujoo Link: https://patch.msgid.link/5bb9041471dab8ce870c191c19cbe4df57473be8.1772381213.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents-test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index 7c4690eb7dad..a6b3e6b592a5 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -142,8 +142,10 @@ static struct file_system_type ext_fs_type = { static void extents_kunit_exit(struct kunit *test) { - struct ext4_sb_info *sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; + struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb; + struct ext4_sb_info *sbi = sb->s_fs_info; + ext4_es_unregister_shrinker(sbi); kfree(sbi); kfree(k_ctx.k_ei); kfree(k_ctx.k_data); -- cgit v1.2.3 From 46066e3a06647c5b186cc6334409722622d05c44 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 2 Mar 2026 21:46:19 +0800 Subject: ext4: avoid allocate block from corrupted group in ext4_mb_find_by_goal() There's issue as follows: ... EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 206 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! Data will be lost EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 206 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! Data will be lost EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 206 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! Data will be lost EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 206 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! Data will be lost EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 2243 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! 
Data will be lost EXT4-fs (mmcblk0p1): Delayed block allocation failed for inode 2239 at logical offset 0 with max blocks 1 with error 117 EXT4-fs (mmcblk0p1): This should not happen!! Data will be lost EXT4-fs (mmcblk0p1): error count since last fsck: 1 EXT4-fs (mmcblk0p1): initial error at time 1765597433: ext4_mb_generate_buddy:760 EXT4-fs (mmcblk0p1): last error at time 1765597433: ext4_mb_generate_buddy:760 ... According to the log analysis, blocks are always requested from the corrupted block group. This may happen as follows: ext4_mb_find_by_goal ext4_mb_load_buddy ext4_mb_load_buddy_gfp ext4_mb_init_cache ext4_read_block_bitmap_nowait ext4_wait_block_bitmap ext4_validate_block_bitmap if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; // There's no logs. if (err) return err; // Will return error ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) // Unreachable goto out; After commit 9008a58e5dce ("ext4: make the bitmap read routines return real error codes") merged, Commit 163a203ddb36 ("ext4: mark block group as corrupt on block bitmap error") is no real solution for allocating blocks from corrupted block groups. This is because if 'EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)' is true, then 'ext4_mb_load_buddy()' may return an error. This means that the block allocation will fail. Therefore, check block group if corrupted when ext4_mb_load_buddy() returns error. Fixes: 163a203ddb36 ("ext4: mark block group as corrupt on block bitmap error") Fixes: 9008a58e5dce ("ext4: make the bitmap read routines return real error codes") Signed-off-by: Ye Bin Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Zhang Yi Reviewed-by: Andreas Dilger Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260302134619.3145520-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..705a879f13d3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2443,8 +2443,12 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); - if (err) + if (err) { + if (EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info) && + !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return 0; return err; + } ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) -- cgit v1.2.3 From c4a48e9eeefd610ae0d26e9ff085277f751c8e53 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Mon, 2 Mar 2026 20:08:11 +0530 Subject: ext4: minor fix for ext4_split_extent_zeroout() We missed storing the error which triggerd smatch warning: fs/ext4/extents.c:3369 ext4_split_extent_zeroout() warn: duplicate zero check 'err' (previous on line 3363) fs/ext4/extents.c 3361 3362 err = ext4_ext_get_access(handle, inode, path + depth); 3363 if (err) 3364 return err; 3365 3366 ext4_ext_mark_initialized(ex); 3367 3368 ext4_ext_dirty(handle, inode, path + depth); --> 3369 if (err) 3370 return err; 3371 3372 return 0; 3373 } Fix it by correctly storing the err value from ext4_ext_dirty(). 
Link: https://lore.kernel.org/all/aYXvVgPnKltX79KE@stanley.mountain/ Reported-by: Dan Carpenter Fixes: a985e07c26455 ("ext4: refactor zeroout path and handle all cases") Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Reviewed-by: Andreas Dilger Link: https://patch.msgid.link/20260302143811.605174-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae3804f36535..b41a44dc245c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3363,7 +3363,7 @@ static int ext4_split_extent_zeroout(handle_t *handle, struct inode *inode, ext4_ext_mark_initialized(ex); - ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + depth); if (err) return err; -- cgit v1.2.3 From 73bf12adbea10b13647864cd1c62410d19e21086 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 3 Mar 2026 09:22:42 +0800 Subject: ext4: test if inode's all dirty pages are submitted to disk The commit aa373cf55099 ("writeback: stop background/kupdate works from livelocking other works") introduced an issue where unmounting a filesystem in a multi-logical-partition scenario could lead to batch file data loss. This problem was not fixed until the commit d92109891f21 ("fs/writeback: bail out if there is no more inodes for IO and queued once"). It took considerable time to identify the root cause. Additionally, in actual production environments, we frequently encountered file data loss after normal system reboots. Therefore, we are adding a check in the inode release flow to verify whether all dirty pages have been flushed to disk, in order to determine whether the data loss is caused by a logic issue in the filesystem code. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260303012242.3206465-1-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ac5f3446c731..1123d995494b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -186,6 +186,14 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { + /* + * If there's dirty page will lead to data loss, user + * could see stale data. + */ + if (unlikely(!ext4_emergency_state(inode->i_sb) && + mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY))) + ext4_warning_inode(inode, "data will be lost"); + truncate_inode_pages_final(&inode->i_data); goto no_delete; -- cgit v1.2.3 From 2acb5c12ebd860f30e4faf67e6cc8c44ddfe5fe8 Mon Sep 17 00:00:00 2001 From: Tejas Bharambe Date: Tue, 3 Mar 2026 23:14:34 -0800 Subject: ext4: validate p_idx bounds in ext4_ext_correct_indexes ext4_ext_correct_indexes() walks up the extent tree correcting index entries when the first extent in a leaf is modified. Before accessing path[k].p_idx->ei_block, there is no validation that p_idx falls within the valid range of index entries for that level. If the on-disk extent header contains a corrupted or crafted eh_entries value, p_idx can point past the end of the allocated buffer, causing a slab-out-of-bounds read. Fix this by validating path[k].p_idx against EXT_LAST_INDEX() at both access sites: before the while loop and inside it. 
Return -EFSCORRUPTED if the index pointer is out of range, consistent with how other bounds violations are handled in the ext4 extent tree code. Reported-by: syzbot+04c4e65cab786a2e5b7e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=04c4e65cab786a2e5b7e Signed-off-by: Tejas Bharambe Link: https://patch.msgid.link/JH0PR06MB66326016F9B6AD24097D232B897CA@JH0PR06MB6632.apcprd06.prod.outlook.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b41a44dc245c..c99e0802d700 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1736,6 +1736,13 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; + if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { + EXT4_ERROR_INODE(inode, + "path[%d].p_idx %p > EXT_LAST_INDEX %p", + k, path[k].p_idx, + EXT_LAST_INDEX(path[k].p_hdr)); + return -EFSCORRUPTED; + } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) @@ -1748,6 +1755,14 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + k); if (err) goto clean; + if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) { + EXT4_ERROR_INODE(inode, + "path[%d].p_idx %p > EXT_LAST_INDEX %p", + k, path[k].p_idx, + EXT_LAST_INDEX(path[k].p_hdr)); + err = -EFSCORRUPTED; + goto clean; + } path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) -- cgit v1.2.3 From 5422fe71d26d42af6c454ca9527faaad4e677d6c Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Fri, 6 Mar 2026 09:31:58 +0800 Subject: ext4: avoid infinite loops caused by residual data On the mkdir/mknod path, when mapping logical blocks to physical blocks, if inserting a new extent into the extent tree fails (in this example, because the file system disabled the huge file feature when marking the inode as dirty), ext4_ext_map_blocks() only calls ext4_free_blocks() to reclaim the physical block without deleting the corresponding data in the extent tree. This causes subsequent mkdir operations to reference the previously reclaimed physical block number again, even though this physical block is already being used by the xattr block. Therefore, a situation arises where both the directory and xattr are using the same buffer head block in memory simultaneously. The above causes ext4_xattr_block_set() to enter an infinite loop about "inserted" and cannot release the inode lock, ultimately leading to the 143s blocking problem mentioned in [1]. If the metadata is corrupted, then trying to remove some extent space can do even more harm. Also in case EXT4_GET_BLOCKS_DELALLOC_RESERVE was passed, remove space wrongly update quota information. Jan Kara suggests distinguishing between two cases: 1) The error is ENOSPC or EDQUOT - in this case the filesystem is fully consistent and we must maintain its consistency including all the accounting. However these errors can happen only early before we've inserted the extent into the extent tree. So current code works correctly for this case. 2) Some other error - this means metadata is corrupted. We should strive to do as few modifications as possible to limit damage. So I'd just skip freeing of allocated blocks. [1] INFO: task syz.0.17:5995 blocked for more than 143 seconds. 
Call Trace: inode_lock_nested include/linux/fs.h:1073 [inline] __start_dirop fs/namei.c:2923 [inline] start_dirop fs/namei.c:2934 [inline] Reported-by: syzbot+512459401510e2a9a39f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1659aaaaa8d9d11265d7 Tested-by: syzbot+1659aaaaa8d9d11265d7@syzkaller.appspotmail.com Reported-by: syzbot+1659aaaaa8d9d11265d7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=512459401510e2a9a39f Tested-by: syzbot+1659aaaaa8d9d11265d7@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Reviewed-by: Jan Kara Tested-by: syzbot+512459401510e2a9a39f@syzkaller.appspotmail.com Link: https://patch.msgid.link/tencent_43696283A68450B761D76866C6F360E36705@qq.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c99e0802d700..8744d3845577 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4472,9 +4472,13 @@ got_allocated_blocks: path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (IS_ERR(path)) { err = PTR_ERR(path); - if (allocated_clusters) { + /* + * Gracefully handle out of space conditions. If the filesystem + * is inconsistent, we'll just leak allocated blocks to avoid + * causing even more damage. + */ + if (allocated_clusters && (err == -EDQUOT || err == -ENOSPC)) { int fb_flags = 0; - /* * free data blocks we just allocated. * not a good idea to call discard here directly, -- cgit v1.2.3 From bac3190a8e79beff6ed221975e0c9b1b5f2a21da Mon Sep 17 00:00:00 2001 From: Milos Nikic Date: Tue, 10 Mar 2026 21:15:48 -0700 Subject: jbd2: gracefully abort on checkpointing state corruptions This patch targets two internal state machine invariants in checkpoint.c residing inside functions that natively return integer error codes. - In jbd2_cleanup_journal_tail(): A blocknr of 0 indicates a severely corrupted journal superblock. Replaced the J_ASSERT with a WARN_ON_ONCE and a graceful journal abort, returning -EFSCORRUPTED. - In jbd2_log_do_checkpoint(): Replaced the J_ASSERT_BH checking for an unexpected buffer_jwrite state. If the warning triggers, we explicitly drop the just-taken get_bh() reference and call __flush_batch() to safely clean up any previously queued buffers in the j_chkpt_bhs array, preventing a memory leak before returning -EFSCORRUPTED. 
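For illustration, both hunks follow the same hardening shape: downgrade an internal-consistency assertion to a one-shot warning, abort the journal, and return an error instead of taking the whole machine down. A generic standalone sketch of that shape (hypothetical names; the patch itself uses WARN_ON_ONCE() and jbd2_journal_abort() before returning -EFSCORRUPTED):

```
/* Warn once and fail gracefully instead of asserting.  117 is EUCLEAN,
 * the errno the kernel reports for -EFSCORRUPTED. */
#include <stdbool.h>
#include <stdio.h>

#define EFSCORRUPTED 117

static int warn_once_and_fail(const char *what)
{
        static bool warned;

        if (!warned) {
                warned = true;
                fprintf(stderr, "journal state corruption: %s\n", what);
        }
        return -EFSCORRUPTED;
}

int cleanup_journal_tail(unsigned long long blocknr)
{
        /* Previously an assertion: blocknr == 0 would bring the box down. */
        if (blocknr == 0)
                return warn_once_and_fail("log tail block number is 0");

        /* ...normal tail update... */
        return 0;
}
```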
Signed-off-by: Milos Nikic Reviewed-by: Andreas Dilger Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260311041548.159424-1-nikic.milos@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/checkpoint.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index de89c5bef607..1508e2f54462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -267,7 +267,15 @@ restart: */ BUFFER_TRACE(bh, "queue"); get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); + if (WARN_ON_ONCE(buffer_jwrite(bh))) { + put_bh(bh); /* drop the ref we just took */ + spin_unlock(&journal->j_list_lock); + /* Clean up any previously batched buffers */ + if (batch_count) + __flush_batch(journal, &batch_count); + jbd2_journal_abort(journal, -EFSCORRUPTED); + return -EFSCORRUPTED; + } journal->j_chkpt_bhs[batch_count++] = bh; transaction->t_chp_stats.cs_written++; transaction->t_checkpoint_list = jh->b_cpnext; @@ -325,7 +333,10 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; - J_ASSERT(blocknr != 0); + if (WARN_ON_ONCE(blocknr == 0)) { + jbd2_journal_abort(journal, -EFSCORRUPTED); + return -EFSCORRUPTED; + } /* * We need to make sure that any blocks that were recently written out -- cgit v1.2.3 From 49504a512587147dd6da3b4b08832ccc157b97dc Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 14 Mar 2026 15:52:56 +0800 Subject: ext4: introduce EXPORT_SYMBOL_FOR_EXT4_TEST() helper Introduce the EXPORT_SYMBOL_FOR_EXT4_TEST() helper for the ext4 KUnit tests. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260314075258.1317579-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 293f698b7042..c579f68b3c11 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3944,6 +3944,11 @@ static inline bool ext4_inode_can_atomic_write(struct inode *inode) extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); + +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +#define EXPORT_SYMBOL_FOR_EXT4_TEST(sym) \ + EXPORT_SYMBOL_FOR_MODULES(sym, "ext4-test") +#endif #endif /* __KERNEL__ */ #endif /* _EXT4_H */ -- cgit v1.2.3 From 519b76ac0b31d86b45784735d4ef964e8efdc56b Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 14 Mar 2026 15:52:57 +0800 Subject: ext4: fix mballoc-test.c is not compiled when EXT4_KUNIT_TESTS=M Currently the mballoc test cases are only built when EXT4_KUNIT_TESTS=y, because 'mballoc-test.c' is #included directly into 'mballoc.c'. To fix this, decouple the test code from 'mballoc.c' and build it into a separate 'ext4-test' module.
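The decoupling keeps the mballoc helpers static and instead exposes thin wrappers to the test module; each wrapper follows the pattern sketched below (the complete set is in the diff that follows), with EXPORT_SYMBOL_FOR_EXT4_TEST() from the previous patch restricting the export to the "ext4-test" module:

        #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
        /* thin wrapper: mb_clear_bits() stays static, but the ext4-test module can call this */
        void mb_clear_bits_test(void *bm, int cur, int len)
        {
                mb_clear_bits(bm, cur, len);
        }
        EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test);
        #endif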
Reported-by: ChenXiaoSong Closes: https://patchwork.kernel.org/project/cifs-client/patch/20260118091313.1988168-2-chenxiaosong.chenxiaosong@linux.dev/ Fixes: 7c9fa399a369 ("ext4: add first unit test for ext4_mb_new_blocks_simple in mballoc") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260314075258.1317579-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/Makefile | 4 +- fs/ext4/mballoc-test.c | 81 ++++++++++++++++++++------------------- fs/ext4/mballoc.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/mballoc.h | 30 +++++++++++++++ 4 files changed, 172 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 72206a292676..d836c3fe311b 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -14,7 +14,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-inode-test-objs += inode-test.o -obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o +ext4-test-objs += inode-test.o mballoc-test.o +obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o ext4-$(CONFIG_FS_VERITY) += verity.o ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 9fbdf6a09489..6f5bfbb0e8a4 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -8,6 +8,7 @@ #include #include "ext4.h" +#include "mballoc.h" struct mbt_grp_ctx { struct buffer_head bitmap_bh; @@ -336,7 +337,7 @@ ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, if (state) mb_set_bits(bitmap_bh->b_data, blkoff, len); else - mb_clear_bits(bitmap_bh->b_data, blkoff, len); + mb_clear_bits_test(bitmap_bh->b_data, blkoff, len); return 0; } @@ -413,14 +414,14 @@ static void test_new_blocks_simple(struct kunit *test) /* get block at goal */ ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, "failed to alloc block at goal, expected %llu found %llu", ar.goal, found); /* get block after goal in goal group */ ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, "failed to alloc block after goal in goal group, expected %llu found %llu", ar.goal + 1, found); @@ -428,7 +429,7 @@ static void test_new_blocks_simple(struct kunit *test) /* get block after goal group */ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ext4_group_first_block_no(sb, goal_group + 1), found, "failed to alloc block after goal group, expected %llu found %llu", @@ -438,7 +439,7 @@ static void test_new_blocks_simple(struct kunit *test) for (i = goal_group; i < ext4_get_groups_count(sb); i++) mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_EQ_MSG(test, ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, "failed to alloc block before goal group, expected %llu found %llu", @@ -448,7 +449,7 @@ static void 
test_new_blocks_simple(struct kunit *test) for (i = 0; i < ext4_get_groups_count(sb); i++) mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); ar.goal = ext4_group_first_block_no(sb, goal_group); - found = ext4_mb_new_blocks_simple(&ar, &err); + found = ext4_mb_new_blocks_simple_test(&ar, &err); KUNIT_ASSERT_NE_MSG(test, err, 0, "unexpectedly get block when no block is available"); } @@ -492,16 +493,16 @@ validate_free_blocks_simple(struct kunit *test, struct super_block *sb, continue; bitmap = mbt_ctx_bitmap(sb, i); - bit = mb_find_next_zero_bit(bitmap, max, 0); + bit = mb_find_next_zero_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ_MSG(test, bit, max, "free block on unexpected group %d", i); } bitmap = mbt_ctx_bitmap(sb, goal_group); - bit = mb_find_next_zero_bit(bitmap, max, 0); + bit = mb_find_next_zero_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ(test, bit, start); - bit = mb_find_next_bit(bitmap, max, bit + 1); + bit = mb_find_next_bit_test(bitmap, max, bit + 1); KUNIT_ASSERT_EQ(test, bit, start + len); } @@ -524,7 +525,7 @@ test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, block = ext4_group_first_block_no(sb, goal_group) + EXT4_C2B(sbi, start); - ext4_free_blocks_simple(inode, block, len); + ext4_free_blocks_simple_test(inode, block, len); validate_free_blocks_simple(test, sb, goal_group, start, len); mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); } @@ -566,15 +567,15 @@ test_mark_diskspace_used_range(struct kunit *test, bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); memset(bitmap, 0, sb->s_blocksize); - ret = ext4_mb_mark_diskspace_used(ac, NULL); + ret = ext4_mb_mark_diskspace_used_test(ac, NULL); KUNIT_ASSERT_EQ(test, ret, 0); max = EXT4_CLUSTERS_PER_GROUP(sb); - i = mb_find_next_bit(bitmap, max, 0); + i = mb_find_next_bit_test(bitmap, max, 0); KUNIT_ASSERT_EQ(test, i, start); - i = mb_find_next_zero_bit(bitmap, max, i + 1); + i = mb_find_next_zero_bit_test(bitmap, max, i + 1); KUNIT_ASSERT_EQ(test, i, start + len); - i = mb_find_next_bit(bitmap, max, i + 1); + i = mb_find_next_bit_test(bitmap, max, i + 1); KUNIT_ASSERT_EQ(test, max, i); } @@ -617,54 +618,54 @@ static void mbt_generate_buddy(struct super_block *sb, void *buddy, max = EXT4_CLUSTERS_PER_GROUP(sb); bb_h = buddy + sbi->s_mb_offsets[1]; - off = mb_find_next_zero_bit(bb, max, 0); + off = mb_find_next_zero_bit_test(bb, max, 0); grp->bb_first_free = off; while (off < max) { grp->bb_counters[0]++; grp->bb_free++; - if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) { grp->bb_free++; grp->bb_counters[0]--; - mb_clear_bit(off >> 1, bb_h); + mb_clear_bit_test(off >> 1, bb_h); grp->bb_counters[1]++; grp->bb_largest_free_order = 1; off++; } - off = mb_find_next_zero_bit(bb, max, off + 1); + off = mb_find_next_zero_bit_test(bb, max, off + 1); } for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { bb = buddy + sbi->s_mb_offsets[order]; bb_h = buddy + sbi->s_mb_offsets[order + 1]; max = max >> 1; - off = mb_find_next_zero_bit(bb, max, 0); + off = mb_find_next_zero_bit_test(bb, max, 0); while (off < max) { - if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + if (!(off & 1) && !mb_test_bit_test(off + 1, bb)) { mb_set_bits(bb, off, 2); grp->bb_counters[order] -= 2; - mb_clear_bit(off >> 1, bb_h); + mb_clear_bit_test(off >> 1, bb_h); grp->bb_counters[order + 1]++; grp->bb_largest_free_order = order + 1; off++; } - off = mb_find_next_zero_bit(bb, max, off + 1); + off = mb_find_next_zero_bit_test(bb, max, off + 1); } } max = 
EXT4_CLUSTERS_PER_GROUP(sb); - off = mb_find_next_zero_bit(bitmap, max, 0); + off = mb_find_next_zero_bit_test(bitmap, max, 0); while (off < max) { grp->bb_fragments++; - off = mb_find_next_bit(bitmap, max, off + 1); + off = mb_find_next_bit_test(bitmap, max, off + 1); if (off + 1 >= max) break; - off = mb_find_next_zero_bit(bitmap, max, off + 1); + off = mb_find_next_zero_bit_test(bitmap, max, off + 1); } } @@ -706,7 +707,7 @@ do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, /* needed by validation in ext4_mb_generate_buddy */ ext4_grp->bb_free = mbt_grp->bb_free; memset(ext4_buddy, 0xff, sb->s_blocksize); - ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_mb_generate_buddy_test(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, ext4_grp); KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), @@ -760,7 +761,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, ex.fe_group = TEST_GOAL_GROUP; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(e4b, &ex); + mb_mark_used_test(e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); mb_set_bits(bitmap, start, len); @@ -769,7 +770,7 @@ test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, memset(buddy, 0xff, sb->s_blocksize); for (i = 0; i < MB_NUM_ORDERS(sb); i++) grp->bb_counters[i] = 0; - ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), 0); @@ -798,7 +799,7 @@ static void test_mb_mark_used(struct kunit *test) bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); @@ -809,7 +810,7 @@ static void test_mb_mark_used(struct kunit *test) test_mb_mark_used_range(test, &e4b, ranges[i].start, ranges[i].len, bitmap, buddy, grp); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } static void @@ -825,16 +826,16 @@ test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, return; ext4_lock_group(sb, e4b->bd_group); - mb_free_blocks(NULL, e4b, start, len); + mb_free_blocks_test(NULL, e4b, start, len); ext4_unlock_group(sb, e4b->bd_group); - mb_clear_bits(bitmap, start, len); + mb_clear_bits_test(bitmap, start, len); /* bypass bb_free validatoin in ext4_mb_generate_buddy */ grp->bb_free += len; memset(buddy, 0xff, sb->s_blocksize); for (i = 0; i < MB_NUM_ORDERS(sb); i++) grp->bb_counters[i] = 0; - ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + ext4_mb_generate_buddy_test(sb, buddy, bitmap, 0, grp); KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), 0); @@ -865,7 +866,7 @@ static void test_mb_free_blocks(struct kunit *test) bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); ex.fe_start = 0; @@ -873,7 +874,7 @@ static void test_mb_free_blocks(struct kunit *test) ex.fe_group = TEST_GOAL_GROUP; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(&e4b, &ex); + mb_mark_used_test(&e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); grp->bb_free = 0; @@ -886,7 +887,7 @@ static void test_mb_free_blocks(struct kunit *test) test_mb_free_blocks_range(test, &e4b, ranges[i].start, ranges[i].len, bitmap, buddy, 
grp); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } #define COUNT_FOR_ESTIMATE 100000 @@ -904,7 +905,7 @@ static void test_mb_mark_used_cost(struct kunit *test) if (sb->s_blocksize > PAGE_SIZE) kunit_skip(test, "blocksize exceeds pagesize"); - ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + ret = ext4_mb_load_buddy_test(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); ex.fe_group = TEST_GOAL_GROUP; @@ -918,7 +919,7 @@ static void test_mb_mark_used_cost(struct kunit *test) ex.fe_start = ranges[i].start; ex.fe_len = ranges[i].len; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_mark_used(&e4b, &ex); + mb_mark_used_test(&e4b, &ex); ext4_unlock_group(sb, TEST_GOAL_GROUP); } end = jiffies; @@ -929,14 +930,14 @@ static void test_mb_mark_used_cost(struct kunit *test) continue; ext4_lock_group(sb, TEST_GOAL_GROUP); - mb_free_blocks(NULL, &e4b, ranges[i].start, + mb_free_blocks_test(NULL, &e4b, ranges[i].start, ranges[i].len); ext4_unlock_group(sb, TEST_GOAL_GROUP); } } kunit_info(test, "costed jiffies %lu\n", all); - ext4_mb_unload_buddy(&e4b); + ext4_mb_unload_buddy_test(&e4b); } static const struct mbt_ext4_block_layout mbt_test_layouts[] = { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 705a879f13d3..93d37f6cf9c3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4088,7 +4088,7 @@ void ext4_exit_mballoc(void) #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 -static int +int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) @@ -7192,6 +7192,102 @@ out_unload: return error; } -#ifdef CONFIG_EXT4_KUNIT_TESTS -#include "mballoc-test.c" +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +void mb_clear_bits_test(void *bm, int cur, int len) +{ + mb_clear_bits(bm, cur, len); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test); + +ext4_fsblk_t +ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar, + int *errp) +{ + return ext4_mb_new_blocks_simple(ar, errp); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_new_blocks_simple_test); + +int mb_find_next_zero_bit_test(void *addr, int max, int start) +{ + return mb_find_next_zero_bit(addr, max, start); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_zero_bit_test); + +int mb_find_next_bit_test(void *addr, int max, int start) +{ + return mb_find_next_bit(addr, max, start); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_bit_test); + +void mb_clear_bit_test(int bit, void *addr) +{ + mb_clear_bit(bit, addr); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bit_test); + +int mb_test_bit_test(int bit, void *addr) +{ + return mb_test_bit(bit, addr); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_test_bit_test); + +int ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac, + handle_t *handle) +{ + return ext4_mb_mark_diskspace_used(ac, handle); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_diskspace_used_test); + +int mb_mark_used_test(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + return mb_mark_used(e4b, ex); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_mark_used_test); + +void ext4_mb_generate_buddy_test(struct super_block *sb, void *buddy, + void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) +{ + ext4_mb_generate_buddy(sb, buddy, bitmap, group, grp); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_generate_buddy_test); + +int ext4_mb_load_buddy_test(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy(sb, group, e4b); +} 
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_load_buddy_test); + +void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b) +{ + ext4_mb_unload_buddy(e4b); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_unload_buddy_test); + +void mb_free_blocks_test(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + mb_free_blocks(inode, e4b, first, count); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_free_blocks_test); + +void ext4_free_blocks_simple_test(struct inode *inode, ext4_fsblk_t block, + unsigned long count) +{ + return ext4_free_blocks_simple(inode, block, count); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_blocks_simple_test); + +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_wait_block_bitmap); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_init); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_desc); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_count_free_clusters); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_info); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_group_clusters_set); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_release); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_read_block_bitmap_nowait); +EXPORT_SYMBOL_FOR_EXT4_TEST(mb_set_bits); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_fc_init_inode); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_context); #endif diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 15a049f05d04..39333ce72cbd 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -270,4 +270,34 @@ ext4_mballoc_query_range( ext4_mballoc_query_range_fn formatter, void *priv); +extern int ext4_mb_mark_context(handle_t *handle, + struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed); +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +extern void mb_clear_bits_test(void *bm, int cur, int len); +extern ext4_fsblk_t +ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar, + int *errp); +extern int mb_find_next_zero_bit_test(void *addr, int max, int start); +extern int mb_find_next_bit_test(void *addr, int max, int start); +extern void mb_clear_bit_test(int bit, void *addr); +extern int mb_test_bit_test(int bit, void *addr); +extern int +ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac, + handle_t *handle); +extern int mb_mark_used_test(struct ext4_buddy *e4b, + struct ext4_free_extent *ex); +extern void ext4_mb_generate_buddy_test(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp); +extern int ext4_mb_load_buddy_test(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b); +extern void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b); +extern void mb_free_blocks_test(struct inode *inode, + struct ext4_buddy *e4b, int first, int count); +extern void ext4_free_blocks_simple_test(struct inode *inode, + ext4_fsblk_t block, unsigned long count); +#endif #endif -- cgit v1.2.3 From 9e1b14320b154094bb2c1bee6d8c6cb851fc3215 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 14 Mar 2026 15:52:58 +0800 Subject: ext4: fix extents-test.c is not compiled when EXT4_KUNIT_TESTS=M Now, only EXT4_KUNIT_TESTS=Y testcase will be compiled in 'extents.c'. To solve this issue, the ext4 test code needs to be decoupled. The 'extents-test' module is compiled into 'ext4-test' module. 
Fixes: cb1e0c1d1fad ("ext4: kunit tests for extent splitting and conversion") Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260314075258.1317579-4-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/Makefile | 3 ++- fs/ext4/ext4_extents.h | 12 ++++++++++++ fs/ext4/extents-test.c | 8 ++++---- fs/ext4/extents.c | 39 +++++++++++++++++++++++++++++++++------ 4 files changed, 51 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index d836c3fe311b..3baee4e7c1cf 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -14,7 +14,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-test-objs += inode-test.o mballoc-test.o +ext4-test-objs += inode-test.o mballoc-test.o \ + extents-test.o obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o ext4-$(CONFIG_FS_VERITY) += verity.o ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index c484125d963f..ebaf7cc42430 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -264,5 +264,17 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +extern int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path); +extern int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex); +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +extern int ext4_ext_space_root_idx_test(struct inode *inode, int check); +extern struct ext4_ext_path *ext4_split_convert_extents_test( + handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags, unsigned int *allocated); +#endif #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c index a6b3e6b592a5..5496b2c8e2cd 100644 --- a/fs/ext4/extents-test.c +++ b/fs/ext4/extents-test.c @@ -282,8 +282,8 @@ static int extents_kunit_init(struct kunit *test) eh->eh_depth = 0; eh->eh_entries = cpu_to_le16(1); eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = - cpu_to_le16(ext4_ext_space_root_idx(&k_ctx.k_ei->vfs_inode, 0)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root_idx_test( + &k_ctx.k_ei->vfs_inode, 0)); eh->eh_generation = 0; /* @@ -386,8 +386,8 @@ static void test_split_convert(struct kunit *test) switch (param->type) { case TEST_SPLIT_CONVERT: - path = ext4_split_convert_extents(NULL, inode, &map, path, - param->split_flags, NULL); + path = ext4_split_convert_extents_test(NULL, inode, &map, + path, param->split_flags, NULL); break; case TEST_CREATE_BLOCKS: ext4_map_create_blocks_helper(test, inode, &map, param->split_flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8744d3845577..da78eb23aaa6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -184,9 +184,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -3159,7 +3159,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) } /* FIXME!! 
we need to try to merge to left or right after zero-out */ -static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; @@ -6257,6 +6257,33 @@ out: return 0; } -#ifdef CONFIG_EXT4_KUNIT_TESTS -#include "extents-test.c" +#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS) +int ext4_ext_space_root_idx_test(struct inode *inode, int check) +{ + return ext4_ext_space_root_idx(inode, check); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_space_root_idx_test); + +struct ext4_ext_path *ext4_split_convert_extents_test(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int *allocated) +{ + return ext4_split_convert_extents(handle, inode, map, path, + flags, allocated); +} +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_split_convert_extents_test); + +EXPORT_SYMBOL_FOR_EXT4_TEST(__ext4_ext_dirty); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_zeroout); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_register_shrinker); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_unregister_shrinker); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_create_blocks); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_init_tree); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_lookup_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_es_insert_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_ext_insert_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_find_extent); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_issue_zeroout); +EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_map_query_blocks); #endif -- cgit v1.2.3 From 3822743dc20386d9897e999dbb990befa3a5b3f8 Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Tue, 17 Mar 2026 11:23:10 -0300 Subject: ext4: reject mount if bigalloc with s_first_data_block != 0 bigalloc with s_first_data_block != 0 is not supported, reject mounting it. Signed-off-by: Helen Koike Suggested-by: Theodore Ts'o Reported-by: syzbot+b73703b873a33d8eb8f6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b73703b873a33d8eb8f6 Link: https://patch.msgid.link/20260317142325.135074-1-koike@igalia.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 43f680c750ae..152c58fe8e01 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3633,6 +3633,13 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) "extents feature\n"); return 0; } + if (ext4_has_feature_bigalloc(sb) && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_msg(sb, KERN_WARNING, + "bad geometry: bigalloc file system with non-zero " + "first_data_block\n"); + return 0; + } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || -- cgit v1.2.3 From 496bb99b7e66f48b178126626f47e9ba79e2d0fa Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 19 Mar 2026 17:45:45 +0800 Subject: ext4: fix the might_sleep() warnings in kvfree() Use the kvfree() in the RCU read critical section can trigger the following warnings: EXT4-fs (vdb): unmounting filesystem cd983e5b-3c83-4f5a-a136-17b00eb9d018. WARNING: suspicious RCU usage ./include/linux/rcupdate.h:409 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 Call Trace: dump_stack_lvl+0xbb/0xd0 dump_stack+0x14/0x20 lockdep_rcu_suspicious+0x15a/0x1b0 __might_resched+0x375/0x4d0 ? 
put_object.part.0+0x2c/0x50 __might_sleep+0x108/0x160 vfree+0x58/0x910 ? ext4_group_desc_free+0x27/0x270 kvfree+0x23/0x40 ext4_group_desc_free+0x111/0x270 ext4_put_super+0x3c8/0xd40 generic_shutdown_super+0x14c/0x4a0 ? __pfx_shrinker_free+0x10/0x10 kill_block_super+0x40/0x90 ext4_kill_sb+0x6d/0xb0 deactivate_locked_super+0xb4/0x180 deactivate_super+0x7e/0xa0 cleanup_mnt+0x296/0x3e0 __cleanup_mnt+0x16/0x20 task_work_run+0x157/0x250 ? __pfx_task_work_run+0x10/0x10 ? exit_to_user_mode_loop+0x6a/0x550 exit_to_user_mode_loop+0x102/0x550 do_syscall_64+0x44a/0x500 entry_SYSCALL_64_after_hwframe+0x77/0x7f BUG: sleeping function called from invalid context at mm/vmalloc.c:3441 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 556, name: umount preempt_count: 1, expected: 0 CPU: 3 UID: 0 PID: 556 Comm: umount Call Trace: dump_stack_lvl+0xbb/0xd0 dump_stack+0x14/0x20 __might_resched+0x275/0x4d0 ? put_object.part.0+0x2c/0x50 __might_sleep+0x108/0x160 vfree+0x58/0x910 ? ext4_group_desc_free+0x27/0x270 kvfree+0x23/0x40 ext4_group_desc_free+0x111/0x270 ext4_put_super+0x3c8/0xd40 generic_shutdown_super+0x14c/0x4a0 ? __pfx_shrinker_free+0x10/0x10 kill_block_super+0x40/0x90 ext4_kill_sb+0x6d/0xb0 deactivate_locked_super+0xb4/0x180 deactivate_super+0x7e/0xa0 cleanup_mnt+0x296/0x3e0 __cleanup_mnt+0x16/0x20 task_work_run+0x157/0x250 ? __pfx_task_work_run+0x10/0x10 ? exit_to_user_mode_loop+0x6a/0x550 exit_to_user_mode_loop+0x102/0x550 do_syscall_64+0x44a/0x500 entry_SYSCALL_64_after_hwframe+0x77/0x7f The above scenarios occur in initialization failures and teardown paths, there are no parallel operations on the resources released by kvfree(), this commit therefore remove rcu_read_lock/unlock() and use rcu_access_pointer() instead of rcu_dereference() operations. Fixes: 7c990728b99e ("ext4: fix potential race between s_flex_groups online resizing and access") Fixes: df3da4ea5a0f ("ext4: fix potential race between s_group_info online resizing and access") Signed-off-by: Zqiang Reviewed-by: Baokun Li Link: https://patch.msgid.link/20260319094545.19291-1-qiang.zhang@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 10 +++------- fs/ext4/super.c | 8 ++------ 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 93d37f6cf9c3..bb6faebf9b6d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3584,9 +3584,7 @@ err_freebuddy: rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: - rcu_read_lock(); - kvfree(rcu_dereference(sbi->s_group_info)); - rcu_read_unlock(); + kvfree(rcu_access_pointer(sbi->s_group_info)); return -ENOMEM; } @@ -3901,7 +3899,8 @@ void ext4_mb_release(struct super_block *sb) WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); } - if (sbi->s_group_info) { + group_info = rcu_access_pointer(sbi->s_group_info); + if (group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); @@ -3919,12 +3918,9 @@ void ext4_mb_release(struct super_block *sb) num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - rcu_read_lock(); - group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); - rcu_read_unlock(); } ext4_mb_avg_fragment_size_destroy(sbi); ext4_mb_largest_free_orders_destroy(sbi); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 152c58fe8e01..baa067eb8cf4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1254,12 +1254,10 @@ static 
void ext4_group_desc_free(struct ext4_sb_info *sbi) struct buffer_head **group_desc; int i; - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); + group_desc = rcu_access_pointer(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); - rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) @@ -1267,14 +1265,12 @@ static void ext4_flex_groups_free(struct ext4_sb_info *sbi) struct flex_groups **flex_groups; int i; - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); + flex_groups = rcu_access_pointer(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } - rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) -- cgit v1.2.3 From d15e4b0a418537aafa56b2cb80d44add83e83697 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 19 Mar 2026 20:03:35 +0800 Subject: ext4: fix use-after-free in update_super_work when racing with umount Commit b98535d09179 ("ext4: fix bug_on in start_this_handle during umount filesystem") moved ext4_unregister_sysfs() before flushing s_sb_upd_work to prevent new error work from being queued via /proc/fs/ext4/xx/mb_groups reads during unmount. However, this introduced a use-after-free because update_super_work calls ext4_notify_error_sysfs() -> sysfs_notify() which accesses the kobject's kernfs_node after it has been freed by kobject_del() in ext4_unregister_sysfs(): update_super_work ext4_put_super ----------------- -------------- ext4_unregister_sysfs(sb) kobject_del(&sbi->s_kobj) __kobject_del() sysfs_remove_dir() kobj->sd = NULL sysfs_put(sd) kernfs_put() // RCU free ext4_notify_error_sysfs(sbi) sysfs_notify(&sbi->s_kobj) kn = kobj->sd // stale pointer kernfs_get(kn) // UAF on freed kernfs_node ext4_journal_destroy() flush_work(&sbi->s_sb_upd_work) Instead of reordering the teardown sequence, fix this by making ext4_notify_error_sysfs() detect that sysfs has already been torn down by checking s_kobj.state_in_sysfs, and skipping the sysfs_notify() call in that case. A dedicated mutex (s_error_notify_mutex) serializes ext4_notify_error_sysfs() against kobject_del() in ext4_unregister_sysfs() to prevent TOCTOU races where the kobject could be deleted between the state_in_sysfs check and the sysfs_notify() call. 
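In outline, the notify path now checks that the kobject is still wired into sysfs while holding the new mutex, and ext4_unregister_sysfs() takes the same mutex around kobject_del(), so the two can no longer interleave; the authoritative hunks follow below:

        void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
        {
                mutex_lock(&sbi->s_error_notify_mutex);
                if (sbi->s_kobj.state_in_sysfs)
                        sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
                mutex_unlock(&sbi->s_error_notify_mutex);
        }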
Fixes: b98535d09179 ("ext4: fix bug_on in start_this_handle during umount filesystem") Cc: Jiayuan Chen Suggested-by: Jan Kara Signed-off-by: Jiayuan Chen Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260319120336.157873-1-jiayuan.chen@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 1 + fs/ext4/sysfs.c | 10 +++++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c579f68b3c11..7617e2d454ea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1570,6 +1570,7 @@ struct ext4_sb_info { struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; + struct mutex s_error_notify_mutex; /* protects sysfs_notify vs kobject_del */ struct super_block *s_sb; struct buffer_head *s_mmp_bh; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index baa067eb8cf4..cb69a9b38b7c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5406,6 +5406,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); + mutex_init(&sbi->s_error_notify_mutex); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b87d7bdab06a..923b375e017f 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -597,7 +597,10 @@ static const struct kobj_type ext4_feat_ktype = { void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) { - sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); + mutex_lock(&sbi->s_error_notify_mutex); + if (sbi->s_kobj.state_in_sysfs) + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); + mutex_unlock(&sbi->s_error_notify_mutex); } static struct kobject *ext4_root; @@ -610,8 +613,10 @@ int ext4_register_sysfs(struct super_block *sb) int err; init_completion(&sbi->s_kobj_unregister); + mutex_lock(&sbi->s_error_notify_mutex); err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); + mutex_unlock(&sbi->s_error_notify_mutex); if (err) { kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); @@ -644,7 +649,10 @@ void ext4_unregister_sysfs(struct super_block *sb) if (sbi->s_proc) remove_proc_subtree(sb->s_id, ext4_proc_root); + + mutex_lock(&sbi->s_error_notify_mutex); kobject_del(&sbi->s_kobj); + mutex_unlock(&sbi->s_error_notify_mutex); } int __init ext4_init_sysfs(void) -- cgit v1.2.3 From 0c90eed1b95335eba4f546e6742a8e4503d79349 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 20 Mar 2026 10:04:29 +0100 Subject: ext4: fix deadlock on inode reallocation Currently there is a race in ext4 when reallocating freed inode resulting in a deadlock: Task1 Task2 ext4_evict_inode() handle = ext4_journal_start(); ... if (IS_SYNC(inode)) handle->h_sync = 1; ext4_free_inode() ext4_new_inode() handle = ext4_journal_start() finds the bit in inode bitmap already clear insert_inode_locked() waits for inode to be removed from the hash. ext4_journal_stop(handle) jbd2_journal_stop(handle) jbd2_log_wait_commit(journal, tid); - deadlocks waiting for transaction handle Task2 holds Fix the problem by removing inode from the hash already in ext4_clear_inode() by which time all IO for the inode is done so reuse is already fine but we are still before possibly blocking on transaction commit. 
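Concretely, the unhash moves into ext4_clear_inode(), after all I/O on the inode has completed but before the evicting task can block on a transaction commit; in outline (the full hunk, including a detailed comment on why this is safe, follows below):

        void ext4_clear_inode(struct inode *inode)
        {
                invalidate_inode_buffers(inode);
                clear_inode(inode);
                ext4_discard_preallocations(inode);
                /* unhash early: a racing ext4_new_inode() no longer waits on
                 * this eviction while holding an open transaction handle */
                remove_inode_hash(inode);
                ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
                /* remainder of the teardown is unchanged */
        }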
Reported-by: "Lai, Yi" Link: https://lore.kernel.org/all/abNvb2PcrKj1FBeC@ly-workstation Fixes: 88ec797c4680 ("fs: make insert_inode_locked() wait for inode destruction") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260320090428.24899-2-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb69a9b38b7c..a34efb44e73d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1523,6 +1523,27 @@ void ext4_clear_inode(struct inode *inode) invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); + /* + * We must remove the inode from the hash before ext4_free_inode() + * clears the bit in inode bitmap as otherwise another process reusing + * the inode will block in insert_inode_hash() waiting for inode + * eviction to complete while holding transaction handle open, but + * ext4_evict_inode() still running for that inode could block waiting + * for transaction commit if the inode is marked as IS_SYNC => deadlock. + * + * Removing the inode from the hash here is safe. There are two cases + * to consider: + * 1) The inode still has references to it (i_nlink > 0). In that case + * we are keeping the inode and once we remove the inode from the hash, + * iget() can create the new inode structure for the same inode number + * and we are fine with that as all IO on behalf of the inode is + * finished. + * 2) We are deleting the inode (i_nlink == 0). In that case inode + * number cannot be reused until ext4_free_inode() clears the bit in + * the inode bitmap, at which point all IO is done and reuse is fine + * again. + */ + remove_inode_hash(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { -- cgit v1.2.3 From ec0a7500d8eace5b4f305fa0c594dd148f0e8d29 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 23 Mar 2026 14:08:36 +0800 Subject: ext4: fix iloc.bh leak in ext4_fc_replay_inode() error paths During code review, Joseph found that ext4_fc_replay_inode() calls ext4_get_fc_inode_loc() to get the inode location, which holds a reference to iloc.bh that must be released via brelse(). However, several error paths jump to the 'out' label without releasing iloc.bh: - ext4_handle_dirty_metadata() failure - sync_dirty_buffer() failure - ext4_mark_inode_used() failure - ext4_iget() failure Fix this by introducing an 'out_brelse' label placed just before the existing 'out' label to ensure iloc.bh is always released. Additionally, make ext4_fc_replay_inode() propagate errors properly instead of always returning 0. Reported-by: Joseph Qi Fixes: 8016e29f4362 ("ext4: fast commit recovery path") Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20260323060836.3452660-1-libaokun@linux.alibaba.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6e949c21842d..2f0057e04934 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1613,19 +1613,21 @@ static int ext4_fc_replay_inode(struct super_block *sb, /* Immediately update the inode on disk. 
*/ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) - goto out; + goto out_brelse; ret = sync_dirty_buffer(iloc.bh); if (ret) - goto out; + goto out_brelse; ret = ext4_mark_inode_used(sb, ino); if (ret) - goto out; + goto out_brelse; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); - return -EFSCORRUPTED; + inode = NULL; + ret = -EFSCORRUPTED; + goto out_brelse; } /* @@ -1642,13 +1644,14 @@ static int ext4_fc_replay_inode(struct super_block *sb, ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); +out_brelse: brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); - return 0; + return ret; } /* -- cgit v1.2.3 From 3ceda17325fc2600f66fd85b526592bc8a9dfb9d Mon Sep 17 00:00:00 2001 From: hongao Date: Tue, 24 Mar 2026 09:58:15 +0800 Subject: ext4: skip split extent recovery on corruption ext4_split_extent_at() retries after ext4_ext_insert_extent() fails by refinding the original extent and restoring its length. That recovery is only safe for transient resource failures such as -ENOSPC, -EDQUOT, and -ENOMEM. When ext4_ext_insert_extent() fails because the extent tree is already corrupted, ext4_find_extent() can return a leaf path without p_ext. ext4_split_extent_at() then dereferences path[depth].p_ext while trying to fix up the original extent length, causing a NULL pointer dereference while handling a pre-existing filesystem corruption. Do not enter the recovery path for corruption errors, and validate p_ext after refinding the extent before touching it. This keeps the recovery path limited to cases it can actually repair and turns the syzbot-triggered crash into a proper corruption report. Fixes: 716b9c23b862 ("ext4: refactor split and convert extents") Reported-by: syzbot+1ffa5d865557e51cb604@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1ffa5d865557e51cb604 Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Signed-off-by: hongao Link: https://patch.msgid.link/EF77870F23FF9C90+20260324015815.35248-1-hongao@uniontech.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index da78eb23aaa6..8cce1479be6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3254,6 +3254,9 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, insert_err = PTR_ERR(path); err = 0; + if (insert_err != -ENOSPC && insert_err != -EDQUOT && + insert_err != -ENOMEM) + goto out_path; /* * Get a new path to try to zeroout or fix the extent length. 
@@ -3270,13 +3273,20 @@ static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, goto out_path; } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, + "bad extent address lblock: %lu, depth: %d pblock %llu", + (unsigned long)ee_block, depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - depth = ext_depth(inode); - ex = path[depth].p_ext; - fix_extent_len: ex->ee_len = orig_ex.ee_len; err = ext4_ext_dirty(handle, inode, path + path->p_depth); -- cgit v1.2.3 From bb81702370fad22c06ca12b6e1648754dbc37e0f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 26 Mar 2026 00:58:34 -0400 Subject: ext4: handle wraparound when searching for blocks for indirect mapped blocks Commit 4865c768b563 ("ext4: always allocate blocks only from groups inode can use") restricts what blocks will be allocated for indirect block based files to block numbers that fit within 32-bit block numbers. However, when using a review bot running on the latest Gemini LLM to check this commit when backporting into an LTS based kernel, it raised this concern: If ac->ac_g_ex.fe_group is >= ngroups (for instance, if the goal group was populated via stream allocation from s_mb_last_groups), then start will be >= ngroups. Does this allow allocating blocks beyond the 32-bit limit for indirect block mapped files? The commit message mentions that ext4_mb_scan_groups_linear() takes care to not select unsupported groups. However, its loop uses group = *start, and the very first iteration will call ext4_mb_scan_group() with this unsupported group because next_linear_group() is only called at the end of the iteration. After reviewing the code paths involved and considering the LLM review, I determined that this can happen when there is a file system where some files/directories are extent-mapped and others are indirect-block mapped. To address this, add a safety clamp in ext4_mb_scan_groups(). Fixes: 4865c768b563 ("ext4: always allocate blocks only from groups inode can use") Cc: Jan Kara Reviewed-by: Baokun Li Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://patch.msgid.link/20260326045834.1175822-1-tytso@mit.edu Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bb6faebf9b6d..cb2bd87c355c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1199,6 +1199,8 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) /* searching for the right group start from the goal value specified */ start = ac->ac_g_ex.fe_group; + if (start >= ngroups) + start = 0; ac->ac_prefetch_grp = start; ac->ac_prefetch_nr = 0; -- cgit v1.2.3 From 9ee29d20aab228adfb02ca93f87fb53c56c2f3af Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 27 Mar 2026 02:13:15 -0400 Subject: ext4: always drain queued discard work in ext4_mb_release() While reviewing recent ext4 patch[1], Sashiko raised the following concern[2]: > If the filesystem is initially mounted with the discard option, > deleting files will populate sbi->s_discard_list and queue > s_discard_work. If it is then remounted with nodiscard, the > EXT4_MOUNT_DISCARD flag is cleared, but the pending s_discard_work is > neither cancelled nor flushed. 
[1] https://lore.kernel.org/r/20260319094545.19291-1-qiang.zhang@linux.dev/ [2] https://sashiko.dev/#/patchset/20260319094545.19291-1-qiang.zhang%40linux.dev The concern was valid, but it had nothing to do with the patch[1]. One of the problems with Sashiko in its current (early) form is that it will detect pre-existing issues and report them as problems with the patch that it is reviewing. In practice, it would be hard to hit deliberately (unless you are a malicious syzkaller fuzzer), since it would involve mounting the file system with -o discard, and then deleting a large number of files, remounting the file system with -o nodiscard, and then immediately unmounting the file system before the queued discard work has a chance to drain on its own. Fix it because it's a real bug, and to keep Sashiko from raising this concern when analyzing future patches to mballoc.c. Signed-off-by: Theodore Ts'o Fixes: 55cdd0af2bc5 ("ext4: get discard out of jbd2 commit kthread contex") Cc: stable@kernel.org --- fs/ext4/mballoc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb2bd87c355c..bb58eafb87bc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3893,13 +3893,11 @@ void ext4_mb_release(struct super_block *sb) struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; - if (test_opt(sb, DISCARD)) { - /* - * wait the discard work to drain all of ext4_free_data - */ - flush_work(&sbi->s_discard_work); - WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); - } + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); group_info = rcu_access_pointer(sbi->s_group_info); if (group_info) { -- cgit v1.2.3