From cc3d2f55c43affde7195867afb7b0ee45dd8019b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 9 Feb 2025 05:33:43 +0100 Subject: xfs: reflow xfs_dec_freecounter Let the successful allocation be the main path through the function with exception handling in branches to make the code easier to follow. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" --- fs/xfs/xfs_mount.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b69356582b86..ba6e60dc3a45 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1257,7 +1257,6 @@ xfs_dec_freecounter( uint64_t delta, bool rsvd) { - int64_t lcounter; uint64_t set_aside = 0; s32 batch; bool has_resv_pool; @@ -1296,28 +1295,26 @@ xfs_dec_freecounter( set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(counter, -((int64_t)delta), batch); if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(counter, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > mp->m_resblks_avail) { + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + mp->m_resblks_avail -= delta; spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! */ + return 0; fdblocks_enospc: spin_unlock(&mp->m_sb_lock); -- cgit v1.2.3 From 712bae96631852c1a1822ee4f57a08ccd843358b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 9 Feb 2025 05:43:50 +0100 Subject: xfs: generalize the freespace and reserved blocks handling xfs_{add,dec}_freecounter already handles the block and RT extent percpu counters, but it currently hardcodes the passed in counter. Add a freecounter abstraction that uses an enum to designate the counter and add wrappers that hide the actual percpu_counters. This will allow expanding the reserved block handling to the RT extent counter in the next step, and also prepares for adding yet another such counter that can share the code. Both these additions will be needed for the zoned allocator. Also switch the flooring of the frextents counter to 0 in statfs for the rthinherit case to a manual min_t call to match the handling of the fdblocks counter for normal file systems. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" --- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_metafile.c | 2 +- fs/xfs/libxfs/xfs_sb.c | 8 ++--- fs/xfs/libxfs/xfs_types.h | 17 ++++++++++ fs/xfs/scrub/fscounters.c | 11 ++++--- fs/xfs/scrub/fscounters_repair.c | 4 +-- fs/xfs/scrub/newbt.c | 2 +- fs/xfs/xfs_fsops.c | 6 ++-- fs/xfs/xfs_icache.c | 4 +-- fs/xfs/xfs_ioctl.c | 6 ++-- fs/xfs/xfs_iomap.c | 9 +++--- fs/xfs/xfs_mount.c | 37 ++++++++++++++++----- fs/xfs/xfs_mount.h | 70 +++++++++++++++++++++++++++++----------- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_super.c | 42 +++++++++++++----------- fs/xfs/xfs_trace.h | 2 +- 16 files changed, 151 insertions(+), 73 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f3a840a425f5..57513ba19d6a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1927,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 2f5f554a36d4..7625e694eb8d 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -95,7 +95,7 @@ xfs_metafile_resv_can_cover( * There aren't enough blocks left in the inode's reservation, but it * isn't critical unless there also isn't enough free space. */ - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, + return xfs_compare_freecounter(ip->i_mount, XC_FREE_BLOCKS, rhs - ip->i_delayed_blks, 2048) >= 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3dc5f5dba162..3fdd20df961c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1265,8 +1265,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* @@ -1275,9 +1274,10 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. */ - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp)) { mp->m_sb.sb_frextents = - percpu_counter_sum_positive(&mp->m_frextents); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ca2401c1facd..76f3c31573ec 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -233,6 +233,23 @@ enum xfs_group_type { { XG_TYPE_AG, "ag" }, \ { XG_TYPE_RTG, "rtg" } +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" } + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ca23cf4db6c5..207a238de429 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -513,8 +513,8 @@ xchk_fscounters( /* Snapshot the percpu counters. 
*/ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! */ if (icount < 0 || ifree < 0) @@ -589,15 +589,16 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index cda13447a373..8fb0db78489e 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,7 +64,7 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require @@ -74,7 +74,7 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ - percpu_counter_set(&mp->m_frextents, + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, fsc->frextents - fsc->frextents_delayed); if (!xfs_has_rtgroups(mp)) mp->m_sb.sb_frextents = fsc->frextents; diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index ac38f5843090..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack( free = sc->sa.pag->pagf_freeblks; sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 455298503d01..58249f37a7ad 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -409,7 +409,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -419,8 +419,8 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. 
*/ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); + free = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS); delta = request - mp->m_resblks; mp->m_resblks = request; if (delta > 0 && free > 0) { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7b6c026d01a1..c9ded501e89b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2076,7 +2076,7 @@ xfs_inodegc_want_queue_rt_file( if (!XFS_IS_REALTIME_INODE(ip)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ed85322507dd..0418aad2db91 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 46acf727cbe7..c669b93bb2d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -431,13 +431,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -536,10 +537,10 @@ xfs_iomap_prealloc_size( if (unlikely(XFS_IS_REALTIME_INODE(ip))) freesp = xfs_rtbxlen_to_blen(mp, - xfs_iomap_freesp(&mp->m_frextents, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index ba6e60dc3a45..f444b41d4587 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1220,13 +1220,31 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. 
This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + bool has_resv_pool = (ctr == XC_FREE_BLOCKS); uint64_t res_used; /* @@ -1234,7 +1252,7 @@ xfs_add_freecounter( * Most of the time the pool is full. */ if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + percpu_counter_add(&mp->m_free[ctr].count, delta); return; } @@ -1245,24 +1263,27 @@ xfs_add_freecounter( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + percpu_counter_add(&mp->m_free[ctr].count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. */ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { + struct percpu_counter *counter = &mp->m_free[ctr].count; uint64_t set_aside = 0; s32 batch; bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); + ASSERT(ctr < XC_FREE_NR); + has_resv_pool = (ctr == XC_FREE_BLOCKS); if (rsvd) ASSERT(has_resv_pool); @@ -1292,7 +1313,7 @@ xfs_dec_freecounter( * slightly premature -ENOSPC. */ if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); + set_aside = xfs_freecounter_unavailable(mp, ctr); percpu_counter_add_batch(counter, -((int64_t)delta), batch); if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) < 0) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbed172d6770..7f3265d669bc 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -105,6 +105,11 @@ struct xfs_groups { uint64_t blkmask; }; +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; +}; + /* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables @@ -222,8 +227,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -646,45 +651,74 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, + enum xfs_free_counter ctr); + /* - * Estimate the amount of free space that is not available to userspace and is - * not explicitly reserved from the incore fdblocks. 
This includes: - * - * - The minimum number of blocks needed to support splitting a bmap btree - * - The blocks currently in use by the freespace btrees because they record - * the actual blocks that will fill per-AG metadata space reservations + * Sum up the freecount, but never return negative values. */ -static inline uint64_t -xfs_fdblocks_unavailable( - struct xfs_mount *mp) +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum_positive(&mp->m_free[ctr].count); +} + +/* + * Same as above, but does return negative values. Mostly useful for + * special cases like repair and tracing. + */ +static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum(&mp->m_free[ctr].count); +} + +/* + * This just provides and estimate without the cpu-local updates, use + * xfs_sum_freecounter for the exact value. + */ +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) +{ + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); +} + +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) { - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + percpu_counter_set(&mp->m_free[ctr].count, val); } -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57bef567e011..93caf4406402 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1519,7 +1519,7 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..b08d28a895cb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -834,10 +834,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - 
xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. @@ -856,7 +858,7 @@ xfs_statfs_rt( struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, - percpu_counter_sum_positive(&mp->m_frextents)); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); st->f_blocks = mp->m_sb.sb_rblocks; } @@ -1065,7 +1067,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1075,30 +1078,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i > 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1112,24 +1114,26 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bfc2f1249022..f97129f01b48 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -5622,7 +5622,7 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, __entry->dev = mp->m_super->s_dev; __entry->ino = ip->i_ino; - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); __entry->reserved = ip->i_delayed_blks; __entry->asked = ip->i_meta_resv_asked; __entry->used = ip->i_nblocks; -- cgit v1.2.3 From c8c4e8bc692ae0cd062eaabf99ff9d0d143a6370 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 9 Feb 2025 06:19:06 +0100 Subject: xfs: support reserved blocks for the rt extent counter The zoned space allocator will need reserved RT extents for garbage collection and zeroing of partial blocks. 
Move the resblks related fields into the freecounter array so that they can be used for all counters. Co-developed-by: Hans Holmberg Signed-off-by: Hans Holmberg Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" --- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/xfs_fsops.c | 25 +++++++++-------- fs/xfs/xfs_fsops.h | 3 +- fs/xfs/xfs_ioctl.c | 6 ++-- fs/xfs/xfs_mount.c | 70 +++++++++++++++++++++++++++-------------------- fs/xfs/xfs_mount.h | 15 +++++++--- fs/xfs/xfs_super.c | 32 ++++++++++++++-------- 7 files changed, 91 insertions(+), 62 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 207a238de429..9dd893ece188 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -350,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. */ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 58249f37a7ad..f055aebe4c7a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -366,6 +366,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -373,6 +374,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -391,16 +394,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -419,10 +422,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. 
*/ - free = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS) - - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -436,9 +439,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0418aad2db91..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f444b41d4587..01f387784039 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -461,11 +461,21 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { uint64_t resblks; + if (ctr == XC_FREE_RTEXTENTS) + return 0; + /* * We default to 5% or 8192 fsbs of space reserved, whichever is * smaller. This is intended to cover concurrent allocation @@ -678,6 +688,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -1046,17 +1057,21 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. + * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. 
*/ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); @@ -1173,7 +1188,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1244,26 +1259,26 @@ xfs_add_freecounter( enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (ctr == XC_FREE_BLOCKS); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. */ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(&mp->m_free[ctr].count, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_free[ctr].count, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } @@ -1277,15 +1292,10 @@ xfs_dec_freecounter( uint64_t delta, bool rsvd) { - struct percpu_counter *counter = &mp->m_free[ctr].count; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; ASSERT(ctr < XC_FREE_NR); - has_resv_pool = (ctr == XC_FREE_BLOCKS); - if (rsvd) - ASSERT(has_resv_pool); /* * Taking blocks away, need to be more accurate the closer we @@ -1295,7 +1305,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1312,25 +1322,25 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_freecounter_unavailable(mp, ctr); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), XFS_FDBLOCKS_BATCH) < 0) { /* * Lock up the sb for dipping into reserves before releasing the * space that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); + percpu_counter_add(&counter->count, delta); if (!rsvd) goto fdblocks_enospc; - if (delta > mp->m_resblks_avail) { - xfs_warn_once(mp, + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, "Reserve blocks depleted! 
Consider increasing reserve pool size."); goto fdblocks_enospc; } - mp->m_resblks_avail -= delta; + counter->res_avail -= delta; spin_unlock(&mp->m_sb_lock); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7f3265d669bc..579eaf09157d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -108,6 +108,15 @@ struct xfs_groups { struct xfs_freecounter { /* free blocks for general use: */ struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; }; /* @@ -250,9 +259,6 @@ typedef struct xfs_mount { atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; @@ -638,7 +644,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } extern void xfs_uuid_table_free(void); -extern uint64_t xfs_default_resblks(xfs_mount_t *mp); +uint64_t xfs_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b08d28a895cb..366837e71eeb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -924,24 +924,32 @@ xfs_fs_statfs( } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; - - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); + uint64_t resblks; + enum xfs_free_counter i; - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* -- cgit v1.2.3 From a0760cca8e1007be7bfa154004e566b2a4fbcd71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 9 Feb 2025 06:08:20 +0100 Subject: xfs: trace in-memory freecounter reservations Add two tracepoints when the freecounter dips into the reserved pool and when it is entirely out of space. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" --- fs/xfs/xfs_mount.c | 2 ++ fs/xfs/xfs_trace.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 01f387784039..a77a2de11278 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1341,6 +1341,7 @@ xfs_dec_freecounter( goto fdblocks_enospc; } counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); } @@ -1348,6 +1349,7 @@ xfs_dec_freecounter( return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f97129f01b48..dc4d3e913b17 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -5669,6 +5669,45 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, ); #endif /* CONFIG_XFS_RT */ +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + __entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1d319ac6fe1bd6364c5fc6e285ac47b117aed117 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Feb 2025 05:47:15 +0100 Subject: xfs: disable sb_frextents for zoned file systems Zoned file systems not only don't use the global frextents counter, but for them the in-memory percpu counter also includes reservations taken before even allocating delalloc extent records, so it will never match the per-zone used information. Disable all updates and verification of the sb counter for zoned file systems as it isn't useful for them. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" --- fs/xfs/libxfs/xfs_sb.c | 2 +- fs/xfs/scrub/fscounters.c | 11 +++++++++-- fs/xfs/scrub/fscounters_repair.c | 10 ++++++---- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_super.c | 4 +++- 5 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3b886886ea69..65256e109e64 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1333,7 +1333,7 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. 
*/ - if (xfs_has_rtgroups(mp)) { + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 9dd893ece188..e629663e460a 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -413,7 +413,13 @@ xchk_fscount_count_frextents( fsc->frextents = 0; fsc->frextents_delayed = 0; - if (!xfs_has_realtime(mp)) + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; while ((rtg = xfs_rtgroup_next(mp, rtg))) { @@ -597,7 +603,8 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 8fb0db78489e..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -74,10 +74,12 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, - fsc->frextents - fsc->frextents_delayed); - if (!xfs_has_rtgroups(mp)) - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a77a2de11278..4a8350c3f771 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -553,7 +553,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 86d61f3d83cd..a840e1c68ff2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1131,7 +1131,9 @@ xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void -- cgit v1.2.3 From 4e4d52075577707f8393e3fc74c1ef79ca1d3ce6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Feb 2025 05:49:17 +0100 Subject: xfs: add the zoned space allocator For zoned RT devices space is always allocated at the write pointer, that is right after the last written block and only recorded on I/O completion. Because the actual allocation algorithm is very simple and just involves picking a good zone - preferably the one used for the last write to the inode. 
As the number of zones that can written at the same time is usually limited by the hardware, selecting a zone is done as late as possible from the iomap dio and buffered writeback bio submissions helpers just before submitting the bio. Given that the writers already took a reservation before acquiring the iolock, space will always be readily available if an open zone slot is available. A new structure is used to track these open zones, and pointed to by the xfs_rtgroup. Because zoned file systems don't have a rsum cache the space for that pointer can be reused. Allocations are only recorded at I/O completion time. The scheme used for that is very similar to the reflink COW end I/O path. Co-developed-by: Hans Holmberg Signed-off-by: Hans Holmberg Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" --- fs/xfs/Makefile | 3 +- fs/xfs/libxfs/xfs_rtgroup.h | 22 +- fs/xfs/libxfs/xfs_types.h | 1 + fs/xfs/xfs_log.c | 4 + fs/xfs/xfs_mount.c | 11 + fs/xfs/xfs_mount.h | 2 + fs/xfs/xfs_rtalloc.c | 6 +- fs/xfs/xfs_trace.c | 2 + fs/xfs/xfs_trace.h | 101 +++++ fs/xfs/xfs_zone_alloc.c | 956 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_zone_alloc.h | 34 ++ fs/xfs/xfs_zone_priv.h | 89 +++++ 12 files changed, 1224 insertions(+), 7 deletions(-) create mode 100644 fs/xfs/xfs_zone_alloc.c create mode 100644 fs/xfs/xfs_zone_alloc.h create mode 100644 fs/xfs/xfs_zone_priv.h (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ea8e66c1e969..28bd2627e9ef 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -137,7 +137,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index e35d1d798327..5d8777f819f4 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -37,15 +37,27 @@ struct xfs_rtgroup { xfs_rtxnum_t rtg_extents; /* - * Cache of rt summary level per bitmap block with the invariant that - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, - * or 0 if rsum[i][bbno] == 0 for all i. - * + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. */ - uint8_t *rtg_rsum_cache; + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; }; +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 76f3c31573ec..dc1db15f0be5 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -243,6 +243,7 @@ enum xfs_free_counter { * Number of free RT extents on the RT device. 
*/ XC_FREE_RTEXTENTS, + XC_FREE_NR, }; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f8851ff835de..6493bdb57351 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -3540,6 +3541,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4a8350c3f771..24c43f22d088 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -40,6 +40,7 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -1042,6 +1043,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -1084,6 +1091,8 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1165,6 +1174,8 @@ xfs_unmountfs( xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f08bba9ddb2b..0772b74fc8fd 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,6 +219,7 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; /* * Bitsets of per-fs metadata that have been checked and/or are sick. 
@@ -267,6 +268,7 @@ typedef struct xfs_mount { struct xfs_groups m_groups[XG_TYPE_MAX]; struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 741ccdde2dbe..429838316822 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@ #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" #include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg( for (i = 0; i < XFS_RTGI_MAX; i++) xfs_rtginode_irele(&rtg->rtg_inodes[i]); - kvfree(rtg->rtg_rsum_cache); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); } static int @@ -1573,6 +1575,8 @@ xfs_rtmount_rtg( } } + if (xfs_has_zoned(mp)) + return 0; return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); } diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8f530e69c18a..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -49,6 +49,8 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5ec45e5edeb5..27ba3013f21b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -102,6 +102,7 @@ struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -265,6 +266,105 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, 
xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -3983,6 +4083,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..21734a2d0336 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,956 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. */ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. + */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. 
+ */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. */ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. + */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. 
+ */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_is_gc = is_gc; + + /* + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. + */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. 
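+ * If opening the zone fails the count is decremented again below, so the
+ * transient overcount is harmless and only makes concurrent openers
+ * slightly more conservative.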
+ */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. + */ + wake_up_all(&zi->zi_zone_wait); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + struct xfs_open_zone *oz) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we couldn't match by inode or life time we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly. + */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, oz)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, oz)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +/* + * Try to pack inodes that are written back after they were closed tight instead + * of trying to open new zones for them or spread them to the least recently + * used zone. This optimizes the data layout for workloads that untar or copy + * a lot of small files. Right now this does not separate multiple such + * streams. + */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones just open a new one from the + * freelist. Else try to find one that matches the expected data lifetime. If + * we don't find one that is good pick any zone that is available. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + spin_lock(&zi->zi_open_zones_lock); + if (pack_tight) + oz = xfs_select_open_zone_mru(zi); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. 
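+ * If we are out of open zone slots or free zones, fall back to reusing the
+ * least recently used open zone below.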
+ */ + oz = xfs_try_open_zone(mp); + if (oz) + goto out_unlock; + + oz = xfs_select_open_zone_lru(zi); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. 
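+ * The cached zone reference is dropped and a new zone is picked below
+ * whenever the current one runs out of space in the middle of an ioend.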
+ */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, pack_tight); + if (!*oz) + goto out_error; + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + if (!(mp->m_super->s_flags & SB_ACTIVE)) + return; /* can happen during log recovery */ + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rgb is a potentially valid block. It might still be + * unused, but that information is only found in the rmap. + */ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. Instead query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * not recorded in the rmap yet, and it also rewrites blocks if the most + * recently written ones got deleted again before unmount, but this is + * the best we can do without hardware support. 
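+ * For example, if the highest block recorded in the rmap of a conventional
+ * zone is rgbno 99, the write pointer is set to 100 and everything from
+ * block 100 up to the end of the zone is treated as writable, even if some
+ * of those blocks had been written and freed again before the crash.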
+ */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the of number of + * backing zones available + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone, + * one in reserve as we don't reclaim open zones. One data zone + * and its spare is included in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of available space + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow to hold about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs a lower open limit is advisable as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here. 
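+ * Note that even an explicitly configured limit is clamped to both the
+ * hardware reported open zone limit and xfs_max_open_zones() below, so an
+ * oversized mount option cannot overcommit the device or the zone count.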
+ */ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + return zi; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + xfs_free_open_zones(zi); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks size (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_free_zone_info(mp->m_zone_info); +} diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..78cd7bfc6ac8 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int 
xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +#else +static inline int xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..23d2fd6088ae --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simple the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock/ + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. 
+ */ + struct task_struct *zi_gc_thread; + struct xfs_open_zone *zi_open_gc_zone; + + /* + * List of zones that need a reset: + */ + spinlock_t zi_reset_list_lock; + struct xfs_group *zi_reset_list; +}; + +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc); + +#endif /* _XFS_ZONE_PRIV_H */ -- cgit v1.2.3 From 0bb2193056b5969e4148fc0909e89a5362da873e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Feb 2025 09:16:06 +0100 Subject: xfs: add support for zoned space reservations For zoned file systems garbage collection (GC) has to take the iolock and mmaplock after moving data to a new place to synchronize with readers. This means waiting for garbage collection with the iolock can deadlock. To avoid this, the worst case required blocks have to be reserved before taking the iolock, which is done using a new RTAVAILABLE counter that tracks blocks that are free to write into and don't require garbage collection. The new helpers try to take these available blocks, and if there aren't enough available it wakes and waits for GC. This is done using a list of on-stack reservations to ensure fairness. Co-developed-by: Hans Holmberg Signed-off-by: Hans Holmberg Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" --- fs/xfs/Makefile | 3 +- fs/xfs/libxfs/xfs_bmap.c | 15 ++- fs/xfs/libxfs/xfs_types.h | 12 ++- fs/xfs/xfs_mount.c | 36 ++++--- fs/xfs/xfs_trace.h | 23 ++++ fs/xfs/xfs_zone_alloc.c | 2 + fs/xfs/xfs_zone_alloc.h | 27 +++++ fs/xfs/xfs_zone_priv.h | 2 + fs/xfs/xfs_zone_space_resv.c | 244 +++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 343 insertions(+), 21 deletions(-) create mode 100644 fs/xfs/xfs_zone_space_resv.c (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 28bd2627e9ef..bdedf4bdb1db 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ # xfs_rtbitmap is shared with libxfs xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ - xfs_zone_alloc.o + xfs_zone_alloc.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 522c126e52fb..63255820b58a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -40,6 +40,7 @@ #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -4788,12 +4789,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (bflags & XFS_BMAPI_REMAP) + if (bflags & XFS_BMAPI_REMAP) { ; - else if (isrt) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); - else + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index dc1db15f0be5..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -244,12 +244,22 @@ enum xfs_free_counter { */ XC_FREE_RTEXTENTS, + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. 
XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. + */ + XC_FREE_RTAVAILABLE, XC_FREE_NR, }; #define XFS_FREECOUNTER_STR \ { XC_FREE_BLOCKS, "blocks" }, \ - { XC_FREE_RTEXTENTS, "rtextents" } + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } /* * Type verifier functions diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 24c43f22d088..066805e72054 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -465,6 +465,7 @@ xfs_mount_reset_sbqflags( static const char *const xfs_free_pool_name[] = { [XC_FREE_BLOCKS] = "free blocks", [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available rt extents", }; uint64_t @@ -472,22 +473,27 @@ xfs_default_resblks( struct xfs_mount *mp, enum xfs_free_counter ctr) { - uint64_t resblks; - - if (ctr == XC_FREE_RTEXTENTS) + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. + * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); return 0; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. - */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. 
*/ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 27ba3013f21b..6d6099ef50af 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -363,6 +363,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ TP_ARGS(oz, rgbno, len)) DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); #endif /* CONFIG_XFS_RT */ TRACE_EVENT(xfs_inodegc_worker, @@ -5767,6 +5789,7 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 21734a2d0336..3d3f7589bf63 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -922,6 +922,7 @@ xfs_mount_zones( xfs_info(mp, "%u zones of %u blocks size (%u max open)", mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, mp->m_max_open_zones); + trace_xfs_zones_mount(mp); if (bdev_is_zoned(bt->bt_bdev)) { error = blkdev_report_zones(bt->bt_bdev, @@ -939,6 +940,7 @@ xfs_mount_zones( } } + xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, iz.available + iz.reclaimable); return 0; diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h index 78cd7bfc6ac8..28c9cffb72d5 100644 --- a/fs/xfs/xfs_zone_alloc.h +++ b/fs/xfs/xfs_zone_alloc.h @@ -5,6 +5,30 @@ struct iomap_ioend; struct xfs_open_zone; +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. 
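+ * Callers passing this flag also bypass the reservation wait list, see
+ * xfs_zoned_reserve_available().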
+ */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, struct xfs_open_zone **oz); int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, @@ -18,6 +42,9 @@ void xfs_zoned_wake_all(struct xfs_mount *mp); bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); + #ifdef CONFIG_XFS_RT int xfs_mount_zones(struct xfs_mount *mp); void xfs_unmount_zones(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 23d2fd6088ae..5283d77482d4 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -86,4 +86,6 @@ struct xfs_zone_info { struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc); +void xfs_zoned_resv_wake_all(struct xfs_mount *mp); + #endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c new file mode 100644 index 000000000000..eff9be026425 --- /dev/null +++ b/fs/xfs/xfs_zone_space_resv.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" + +/* + * Note: the zoned allocator does not support a rtextsize > 1, so this code and + * the allocator itself uses file system blocks interchangeable with realtime + * extents without doing the otherwise required conversions. + */ + +/* + * Per-task space reservation. + * + * Tasks that need to wait for GC to free up space allocate one of these + * on-stack and adds it to the per-mount zi_reclaim_reservations lists. + * The GC thread will then wake the tasks in order when space becomes available. + */ +struct xfs_zone_reservation { + struct list_head entry; + struct task_struct *task; + xfs_filblks_t count_fsb; +}; + +/* + * Calculate the number of reserved blocks. + * + * XC_FREE_RTEXTENTS counts the user available capacity, to which the file + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly + * available for writes without waiting for GC. + * + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS + * is further restricted by at least one zone as well as the optional + * persistently reserved blocks. This allows the allocator to run more + * smoothly by not always triggering GC. 
+ */ +uint64_t +xfs_zoned_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + switch (ctr) { + case XC_FREE_RTEXTENTS: + return (uint64_t)XFS_RESERVED_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks + + mp->m_sb.sb_rtreserved; + case XC_FREE_RTAVAILABLE: + return (uint64_t)XFS_GC_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks; + default: + ASSERT(0); + return 0; + } +} + +void +xfs_zoned_resv_wake_all( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + spin_lock(&zi->zi_reservation_lock); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) + wake_up_process(reservation->task); + spin_unlock(&zi->zi_reservation_lock); +} + +void +xfs_zoned_add_available( + struct xfs_mount *mp, + xfs_filblks_t count_fsb) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + if (list_empty_careful(&zi->zi_reclaim_reservations)) { + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + return; + } + + spin_lock(&zi->zi_reservation_lock); + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { + if (reservation->count_fsb > count_fsb) + break; + wake_up_process(reservation->task); + count_fsb -= reservation->count_fsb; + + } + spin_unlock(&zi->zi_reservation_lock); +} + +static int +xfs_zoned_space_wait_error( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return -EIO; + if (fatal_signal_pending(current)) + return -EINTR; + return 0; +} + +static int +xfs_zoned_reserve_available( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation reservation = { + .task = current, + .count_fsb = count_fsb, + }; + int error; + + /* + * If there are no waiters, try to directly grab the available blocks + * from the percpu counter. + * + * If the caller wants to dip into the reserved pool also bypass the + * wait list. This relies on the fact that we have a very graciously + * sized reserved pool that always has enough space. If the reserved + * allocations fail we're in trouble. + */ + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || + (flags & XFS_ZR_RESERVED))) { + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + return error; + } + + if (flags & XFS_ZR_NOWAIT) + return -EAGAIN; + + spin_lock(&zi->zi_reservation_lock); + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); + while ((error = xfs_zoned_space_wait_error(mp)) == 0) { + set_current_state(TASK_KILLABLE); + + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + break; + + spin_unlock(&zi->zi_reservation_lock); + schedule(); + spin_lock(&zi->zi_reservation_lock); + } + list_del(&reservation.entry); + spin_unlock(&zi->zi_reservation_lock); + + __set_current_state(TASK_RUNNING); + return error; +} + +/* + * Implement greedy space allocation for short writes by trying to grab all + * that is left after locking out other threads from trying to do the same. + * + * This isn't exactly optimal and can hopefully be replaced by a proper + * percpu_counter primitive one day. 
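+ * Note that zi_reservation_lock only serializes greedy reservations against
+ * each other; regular reservations can still race with the counter sample
+ * below, in which case the final xfs_dec_freecounter() call simply fails.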
+ */ +static int +xfs_zoned_reserve_extents_greedy( + struct xfs_inode *ip, + xfs_filblks_t *count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + s64 len = *count_fsb; + int error = -ENOSPC; + + spin_lock(&zi->zi_reservation_lock); + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + if (len > 0) { + *count_fsb = len; + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, + flags & XFS_ZR_RESERVED); + } + spin_unlock(&zi->zi_reservation_lock); + return error; +} + +int +xfs_zoned_space_reserve( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + ASSERT(ac->reserved_blocks == 0); + ASSERT(ac->open_zone == NULL); + + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + if (error) + return error; + + error = xfs_zoned_reserve_available(ip, count_fsb, flags); + if (error) { + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); + return error; + } + ac->reserved_blocks = count_fsb; + return 0; +} + +void +xfs_zoned_space_unreserve( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (ac->reserved_blocks > 0) { + struct xfs_mount *mp = ip->i_mount; + + xfs_zoned_add_available(mp, ac->reserved_blocks); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); + } + if (ac->open_zone) + xfs_open_zone_put(ac->open_zone); +} -- cgit v1.2.3 From 080d01c41d44f0993f2c235a6bfdb681f0a66be6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 15 Feb 2025 19:37:19 +0100 Subject: xfs: implement zoned garbage collection RT groups on a zoned file system need to be completely empty before their space can be reused. This means that partially empty groups need to be emptied entirely to free up space if no entirely free groups are available. Add a garbage collection thread that moves all data out of the least used zone when not enough free zones are available, and which resets all zones that have been emptied. To find empty zone a simple set of 10 buckets based on the amount of space used in the zone is used. To empty zones, the rmap is walked to find the owners and the data is read and then written to the new place. To automatically defragment files the rmap records are sorted by inode and logical offset. This means defragmentation of parallel writes into a single zone happens automatically when performing garbage collection. Because holding the iolock over the entire GC cycle would inject very noticeable latency for other accesses to the inodes, the iolock is not taken while performing I/O. Instead the I/O completion handler checks that the mapping hasn't changed over the one recorded at the start of the GC cycle and doesn't update the mapping if it change. Co-developed-by: Hans Holmberg Signed-off-by: Hans Holmberg Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_group.h | 21 +- fs/xfs/libxfs/xfs_rtgroup.h | 6 + fs/xfs/xfs_extent_busy.c | 2 +- fs/xfs/xfs_mount.c | 4 + fs/xfs/xfs_mount.h | 3 + fs/xfs/xfs_super.c | 10 + fs/xfs/xfs_trace.h | 25 + fs/xfs/xfs_zone_alloc.c | 155 ++++++ fs/xfs/xfs_zone_alloc.h | 8 + fs/xfs/xfs_zone_gc.c | 1165 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_zone_priv.h | 21 + fs/xfs/xfs_zone_space_resv.c | 9 + 13 files changed, 1425 insertions(+), 5 deletions(-) create mode 100644 fs/xfs/xfs_zone_gc.c (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index bdedf4bdb1db..e38838409271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -139,6 +139,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ # xfs_rtbitmap is shared with libxfs xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ xfs_zone_alloc.o \ + xfs_zone_gc.o \ xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index a70096113384..cff3f815947b 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -19,10 +19,23 @@ struct xfs_group { #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Track freed but not yet committed extents. - */ - struct xfs_extent_busy_tree *xg_busy_extents; + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; /* * Bitsets of per-ag metadata that have been checked and/or are sick. diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 5d8777f819f4..b325aff28264 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -58,6 +58,12 @@ struct xfs_rtgroup { */ #define XFS_RTG_FREE XA_MARK_0 +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. 
+ */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ea43c9a6e54c..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all( while ((pag = xfs_perag_next(mp, pag))) xfs_extent_busy_wait_group(pag_group(pag)); - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 066805e72054..d8aba6a28ba7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1090,6 +1090,8 @@ xfs_mountfs( error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } return 0; @@ -1178,6 +1180,8 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); if (xfs_has_zoned(mp)) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0772b74fc8fd..4b406f57548a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -556,6 +556,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_RESUMING_QUOTAON 18 /* Kernel has logged a warning about zoned RT device being used on this fs. */ #define XFS_OPSTATE_WARNED_ZONED 19 +/* (Zoned) GC is in progress */ +#define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -600,6 +602,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) +__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a840e1c68ff2..39b2bad67fcd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -822,6 +823,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -994,6 +996,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -1015,6 +1018,7 @@ xfs_fs_unfreeze( * filesystem. */ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1948,6 +1952,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1992,6 +1999,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. 
*/ xfs_fs_unreserve_ag_blocks(mp); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6d6099ef50af..40f5aa1edf6b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -295,8 +295,11 @@ DECLARE_EVENT_CLASS(xfs_zone_class, DEFINE_EVENT(xfs_zone_class, name, \ TP_PROTO(struct xfs_rtgroup *rtg), \ TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); DEFINE_ZONE_EVENT(xfs_zone_full); DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); TRACE_EVENT(xfs_zone_free_blocks, TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, @@ -364,6 +367,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + TRACE_EVENT(xfs_zones_mount, TP_PROTO(struct xfs_mount *mp), TP_ARGS(mp), diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 3d3f7589bf63..b7b2820ec0ef 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -35,6 +35,104 @@ xfs_open_zone_put( } } +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, where the zone_info structure + * hasn't been allocated yet. Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file systems becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty, remove it from the bottom bucket and + * trigger a reset. 
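+ * A zone that was still full a moment ago was never added to any bucket,
+ * hence the was_full checks below.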
+ */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full, mark it up as reclaimable + * and wake up GC which might be waiting for zones to reclaim. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + static void xfs_open_zone_mark_full( struct xfs_open_zone *oz) @@ -42,6 +140,7 @@ xfs_open_zone_mark_full( struct xfs_rtgroup *rtg = oz->oz_rtg; struct xfs_mount *mp = rtg_mount(rtg); struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; trace_xfs_zone_full(rtg); @@ -59,6 +158,8 @@ xfs_open_zone_mark_full( xfs_open_zone_put(oz); wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); } static void @@ -244,6 +345,13 @@ xfs_zone_free_blocks( trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. + */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); xfs_add_frextents(mp, len); xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); return 0; @@ -395,6 +503,9 @@ xfs_try_open_zone( */ wake_up_all(&zi->zi_zone_wait); + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + trace_xfs_zone_opened(oz->oz_rtg); return oz; } @@ -702,6 +813,7 @@ xfs_init_zone( struct xfs_zone_info *zi = mp->m_zone_info; uint64_t used = rtg_rmap(rtg)->i_used_blocks; xfs_rgblock_t write_pointer, highest_rgbno; + int error; if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) return -EFSCORRUPTED; @@ -728,6 +840,18 @@ xfs_init_zone( xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); } + /* + * If there are no used blocks, but the zone is not in empty state yet + * we lost power before the zoned reset. In that case finish the work + * here. 
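+ * Note that this only applies to fully written zones; a partially written
+ * zone with no used blocks is instead picked up as an open zone below.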
+ */ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + if (write_pointer == 0) { /* zone is empty */ atomic_inc(&zi->zi_nr_free_zones); @@ -746,6 +870,7 @@ xfs_init_zone( iz->reclaimable += write_pointer - used; } else if (used < rtg_blocks(rtg)) { /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); iz->reclaimable += (rtg_blocks(rtg) - used); } @@ -856,11 +981,20 @@ xfs_calc_open_zones( return 0; } +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + static struct xfs_zone_info * xfs_alloc_zone_info( struct xfs_mount *mp) { struct xfs_zone_info *zi; + int i; zi = kzalloc(sizeof(*zi), GFP_KERNEL); if (!zi) @@ -871,14 +1005,30 @@ xfs_alloc_zone_info( spin_lock_init(&zi->zi_open_zones_lock); spin_lock_init(&zi->zi_reservation_lock); init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } return zi; + +out_free_bitmaps: + while (--i > 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; } static void xfs_free_zone_info( struct xfs_zone_info *zi) { + int i; + xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); kfree(zi); } @@ -943,6 +1093,10 @@ xfs_mount_zones( xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, iz.available + iz.reclaimable); + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; return 0; out_free_zone_info: @@ -954,5 +1108,6 @@ void xfs_unmount_zones( struct xfs_mount *mp) { + xfs_zone_gc_unmount(mp); xfs_free_zone_info(mp->m_zone_info); } diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h index 28c9cffb72d5..1269390bfcda 100644 --- a/fs/xfs/xfs_zone_alloc.h +++ b/fs/xfs/xfs_zone_alloc.h @@ -48,6 +48,8 @@ uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, #ifdef CONFIG_XFS_RT int xfs_mount_zones(struct xfs_mount *mp); void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); #else static inline int xfs_mount_zones(struct xfs_mount *mp) { @@ -56,6 +58,12 @@ static inline int xfs_mount_zones(struct xfs_mount *mp) static inline void xfs_unmount_zones(struct xfs_mount *mp) { } +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} #endif /* CONFIG_XFS_RT */ #endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..0e1c39f2aaba --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zoned. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone? 
*/ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpad used, and index to indicated which one is used. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool. + */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate. 
+ */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. 
+ */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. + */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. 
+ */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. 
+ */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> 
SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->scratch = chunk->scratch; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, + folio_offset); + + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + chunk->scratch->freed += chunk->len; + if (chunk->scratch->freed == chunk->scratch->offset) { + chunk->scratch->offset = 0; + chunk->scratch->freed = 0; + } + + /* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. 
+ */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static bool +xfs_zone_gc_prepare_reset( + struct bio *bio, + struct xfs_rtgroup *rtg) +{ + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + if (!bdev_max_discard_sectors(bio->bi_bdev)) + return false; + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; + bio->bi_iter.bi_size = + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); + } + + return true; +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + int error = 0; + struct bio bio; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, + REQ_OP_ZONE_RESET); + if (xfs_zone_gc_prepare_reset(&bio, rtg)) + error = submit_bio_wait(&bio); + bio_uninit(&bio); + + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (xfs_zone_gc_prepare_reset(bio, rtg)) + submit_bio(bio); + else + bio_endio(bio); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). 
+ */ +static bool +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (!xfs_zone_gc_select_victim(data) || + !xfs_zone_gc_space_available(data)) { + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !reset_list) + return false; + } + + __set_current_state(TASK_RUNNING); + try_to_freeze(); + + if (reset_list) + xfs_zone_gc_reset_zones(data, reset_list); + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + return true; +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that reflinks + * are currently not supported on zoned file systems and can't be created or + * mounted. + */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + if (xfs_zone_gc_handle_work(data)) + continue; + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + struct xfs_open_zone *oz; + int error; + + /* + * If there are no free zones available for GC, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown near ENOSPC while GC was ongoing. + * + * We also need to do this for the first gc zone allocation if we + * unmounted while at the open limit. 
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones == mp->m_max_open_zones) + oz = xfs_zone_gc_steal_open(zi); + else + oz = xfs_open_zone(mp, true); + if (!oz) { + xfs_warn(mp, "unable to allocate a zone for gc"); + error = -EIO; + goto out; + } + + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + zi->zi_open_gc_zone = oz; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) { + error = -ENOMEM; + goto out_put_gc_zone; + } + + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + goto out_free_gc_data; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(mp->m_zone_info->zi_gc_thread); + return 0; + +out_free_gc_data: + kfree(data); +out_put_gc_zone: + xfs_open_zone_put(zi->zi_open_gc_zone); +out: + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); + if (zi->zi_open_gc_zone) + xfs_open_zone_put(zi->zi_open_gc_zone); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index 5283d77482d4..f6c76d751a49 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -40,6 +40,13 @@ struct xfs_open_zone { struct xfs_rtgroup *oz_rtg; }; +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity get their own bucket and GC can + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + struct xfs_zone_info { /* * List of pending space reservations: @@ -82,10 +89,24 @@ struct xfs_zone_info { */ spinlock_t zi_reset_list_lock; struct xfs_group *zi_reset_list; + + /* + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help + * garbage collection to quickly find the best candidate for reclaim. + */ + spinlock_t zi_used_buckets_lock; + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; + }; struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc); +int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); +bool xfs_zoned_need_gc(struct xfs_mount *mp); +int xfs_zone_gc_mount(struct xfs_mount *mp); +void xfs_zone_gc_unmount(struct xfs_mount *mp); + void xfs_zoned_resv_wake_all(struct xfs_mount *mp); #endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index eff9be026425..4bf1b18aa7a7 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -159,6 +159,15 @@ xfs_zoned_reserve_available( if (error != -ENOSPC) break; + /* + * If there is no reclaimable group left and we aren't still + * processing a pending GC request give up as we're fully out + * of space. + */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + !xfs_is_zonegc_running(mp)) + break; + spin_unlock(&zi->zi_reservation_lock); schedule(); spin_lock(&zi->zi_reservation_lock); -- cgit v1.2.3 From 243f40d0c776bbe01fd4cce7ea6d628b38294e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jan 2025 06:30:27 +0100 Subject: xfs: contain more sysfs code in xfs_sysfs.c Extend the error sysfs initialization helper to include the neighbouring attributes as well. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" --- fs/xfs/xfs_mount.c | 29 ++++++----------------------- fs/xfs/xfs_sysfs.c | 35 ++++++++++++++++++++++++++++------- fs/xfs/xfs_sysfs.h | 5 ++--- 3 files changed, 36 insertions(+), 33 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d8aba6a28ba7..e65a659901d5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -765,27 +765,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); - if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); + error = xfs_mount_sysfs_init(mp); if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -1148,13 +1136,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1231,10 +1216,8 @@ xfs_unmountfs( xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..c3bd7dff229d 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -702,39 +702,58 @@ out_error: } int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs// */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* .../xfs//stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs//error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs//error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs//error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; @@ -749,6 +768,8 @@ 
xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ -- cgit v1.2.3
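
A side note on the reclaimable-zone buckets introduced in xfs_zone_priv.h above: each zone gets filed into one of XFS_ZONE_USED_BUCKETS bitmaps according to how much of it is still live, so xfs_zone_gc_select_victim() can scan the emptiest bucket first and rarely has to look at nearly-full zones. Below is a minimal userspace C sketch of that bucket-sort idea only; all names, sizes and the exact bucket computation are hypothetical and simplified (the kernel additionally tracks per-bucket entry counts and picks the least-used zone within a bucket), so this is an illustration, not the XFS implementation.

#include <stdint.h>
#include <stdio.h>

#define NR_BUCKETS	10u	/* mirrors XFS_ZONE_USED_BUCKETS */
#define NR_ZONES	32u	/* hypothetical zone count, just for the sketch */
#define ZONE_BLOCKS	1024u	/* hypothetical usable blocks per zone */

/* one bit per zone in each bucket; bucket 0 collects the emptiest zones */
static uint32_t bucket_bitmap[NR_BUCKETS];

static unsigned int used_to_bucket(unsigned int used_blocks)
{
	/* plausible mapping: used/capacity scaled to NR_BUCKETS, clamped */
	unsigned int b = (unsigned int)((uint64_t)used_blocks * NR_BUCKETS /
					ZONE_BLOCKS);

	return b < NR_BUCKETS ? b : NR_BUCKETS - 1;
}

static void account_reclaimable(unsigned int zone, unsigned int used_blocks)
{
	bucket_bitmap[used_to_bucket(used_blocks)] |= 1u << zone;
}

/* pick a victim from the lowest non-empty bucket, i.e. an emptyish zone */
static int pick_victim(void)
{
	for (unsigned int b = 0; b < NR_BUCKETS; b++) {
		if (!bucket_bitmap[b])
			continue;
		for (unsigned int zone = 0; zone < NR_ZONES; zone++)
			if (bucket_bitmap[b] & (1u << zone))
				return (int)zone;
	}
	return -1;	/* nothing reclaimable */
}

int main(void)
{
	account_reclaimable(3, 900);	/* mostly live, lands in a high bucket */
	account_reclaimable(7, 40);	/* nearly empty, lands in bucket 0 */
	printf("victim zone: %d\n", pick_victim());	/* prints 7 */
	return 0;
}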
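
Similarly, the ALIGN_DOWN() step in xfs_zone_gc_split_write() exists because a zone-append bio may only be split at the device's max_zone_append_sectors limit, yet the split must still cover a whole number of filesystem blocks so the remapping stays block aligned. A tiny standalone sketch of just that arithmetic, assuming 512-byte sectors and hypothetical input values:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT	9u	/* 512-byte sectors */

/* round x down to a multiple of a; a must be a power of two */
#define ALIGN_DOWN_POW2(x, a)	((x) & ~((uint64_t)(a) - 1))

/*
 * Trim a sector count so the resulting byte length is a whole number of
 * filesystem blocks, as the ALIGN_DOWN() step in xfs_zone_gc_split_write()
 * does after bio_split_rw_at() has capped the split at the zone-append limit.
 */
static uint64_t align_split_sectors(uint64_t split_sectors,
		uint32_t fs_blocksize)
{
	uint64_t bytes = split_sectors << SECTOR_SHIFT;

	return ALIGN_DOWN_POW2(bytes, fs_blocksize) >> SECTOR_SHIFT;
}

int main(void)
{
	/* hypothetical: device allows 1001 more sectors, 4096-byte fs blocks */
	uint64_t sectors = align_split_sectors(1001, 4096);

	/* 1001 sectors = 512512 bytes -> 512000 bytes -> 1000 sectors */
	printf("aligned split: %llu sectors\n", (unsigned long long)sectors);
	return 0;
}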
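
Finally, the reading/writing/resetting lists in xfs_zone_gc_handle_work() are deliberately drained in submission order, and each walk stops at the first chunk whose end_io handler has not yet marked it done; that is what preserves the defragmentation order established by xfs_zone_gc_query(). A small userspace sketch of this "process only the completed prefix" pattern, with hypothetical types standing in for the gc_bio lists:

#include <stdbool.h>
#include <stdio.h>

/* a GC chunk whose completion flag is set asynchronously */
struct chunk {
	int	id;
	bool	done;	/* set by the (here: simulated) bio end_io handler */
};

/*
 * Finish completed chunks strictly in submission order: stop at the first
 * chunk that is not done yet, even if later chunks have already completed.
 */
static int process_in_order(struct chunk *q, int nr, int start)
{
	int i;

	for (i = start; i < nr; i++) {
		if (!q[i].done)
			break;
		printf("finishing chunk %d\n", q[i].id);
	}
	return i;	/* where the next pass resumes */
}

int main(void)
{
	struct chunk q[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	int next = 0;

	q[1].done = true;			/* completes out of order ... */
	next = process_in_order(q, 3, next);	/* ... nothing finishes yet */

	q[0].done = true;
	next = process_in_order(q, 3, next);	/* now chunks 0 and 1 finish */

	printf("next index: %d\n", next);	/* prints 2 */
	return 0;
}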