summaryrefslogtreecommitdiff
path: root/fs/xfs/scrub
diff options
context:
space:
mode:
authorCarlos Maiolino <cem@kernel.org>2025-03-04 13:25:46 +0300
committerCarlos Maiolino <cem@kernel.org>2025-03-04 13:25:46 +0300
commit4c6283ec9284bb72906dba83bc7a809747e6331e (patch)
tree6a2ed104fc86a90bb787ff0dbee020461e59ec14 /fs/xfs/scrub
parent0a1fd78080c8c9a5582e82100bd91b87ae5ac57c (diff)
parent9c477912b2f58da71751f244aceecf5f8cc549ed (diff)
downloadlinux-4c6283ec9284bb72906dba83bc7a809747e6331e.tar.xz
Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices
xfs: add support for zoned devices Add support for the new zoned space allocator and thus for zoned devices: https://zonedstorage.io/docs/introduction/zoned-storage to XFS. This has been developed for and tested on both SMR hard drives, which are the oldest and most common class of zoned devices: https://zonedstorage.io/docs/introduction/smr and ZNS SSDs: https://zonedstorage.io/docs/introduction/zns It has not been tested with zoned UFS devices, as their current capacity points and performance characteristics aren't too interesting for XFS use cases (but never say never). Sequential write only zones are only supported for data using a new allocator for the RT device, which maps each zone to a rtgroup which is written sequentially. All metadata and (for now) the log require using randomly writable space. This means a realtime device is required to support zoned storage, but for the common case of SMR hard drives that contain random writable zones and sequential write required zones on the same block device, the concept of an internal RT device is added which means using XFS on a SMR HDD is as simple as: $ mkfs.xfs /dev/sda $ mount /dev/sda /mnt When using NVMe ZNS SSDs that do not support conventional zones, the traditional multi-device RT configuration is required. E.g. for an SSD with a conventional namespace 1 and a zoned namespace 2: $ mkfs.xfs /dev/nvme0n1 -o rtdev=/dev/nvme0n2 $ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt The zoned allocator can also be used on conventional block devices, or on conventional zones (e.g. when using an SMR HDD as the external RT device). For example using zoned XFS on normal SSDs shows very nice performance advantages and write amplification reduction for intelligent workloads like RocksDB. Some work is still in progress or planned, but should not affect the integration with the rest of XFS or the on-disk format: - support for quotas - support for reflinks Note that the I/O path already supports reflink, but garbage collection isn't refcount aware yet and would unshare shared blocks, thus rendering the feature useless.
Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r--fs/xfs/scrub/agheader.c2
-rw-r--r--fs/xfs/scrub/bmap.c4
-rw-r--r--fs/xfs/scrub/fscounters.c22
-rw-r--r--fs/xfs/scrub/fscounters_repair.c12
-rw-r--r--fs/xfs/scrub/inode.c7
-rw-r--r--fs/xfs/scrub/inode_repair.c4
-rw-r--r--fs/xfs/scrub/newbt.c2
-rw-r--r--fs/xfs/scrub/reap.c9
-rw-r--r--fs/xfs/scrub/repair.c37
-rw-r--r--fs/xfs/scrub/rtbitmap.c11
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c34
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c29
-rw-r--r--fs/xfs/scrub/scrub.c2
13 files changed, 87 insertions, 88 deletions
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 9f8c312dfd3c..303374df44bd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
+ if (xfs_has_zoned(mp))
+ return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 66da7d4d56ba..4f1e2574660d 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1038,8 +1038,8 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink filesystems. */
- if (!xfs_has_reflink(mp)) {
+ /* No CoW forks filesystem doesn't support out of place writes */
+ if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ca23cf4db6c5..e629663e460a 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -350,7 +350,7 @@ retry:
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
- if (!xfs_has_realtime(mp))
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
+ fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +595,17 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index cda13447a373..f0d2b04644e4 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
- percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are are
* subtracted from m_frextents, but not included in sb_frextents.
*/
- percpu_counter_set(&mp->m_frextents,
- fsc->frextents - fsc->frextents_delayed);
- if (!xfs_has_rtgroups(mp))
- mp->m_sb.sb_frextents = fsc->frextents;
+ if (!xfs_has_zoned(mp)) {
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ fsc->frextents - fsc->frextents_delayed);
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
+ }
return 0;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index db6edd5a5fe5..bb3f475b6353 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
+ /*
+ * The used block counter for rtrmap is checked and repaired elsewhere.
+ */
+ if (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
+ return;
+
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 13ff1c933cb8..4299063ffe87 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
- if (dip->di_version < 3)
+ if (dip->di_version < 3 ||
+ (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index ac38f5843090..1588ce971cb8 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
- free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b32fb233cf84..8703897c0a9c 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;
- if (xreap_dirty(&rs))
- return xrep_defer_finish(sc);
+ if (xreap_dirty(&rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
- return 0;
+ return xrep_reset_metafile_resv(sc);
}
/*
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 3b5288d3ef4e..f8f9ed30f56b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
- int error;
+ int error = 0;
+
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@@ -1386,11 +1393,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
- struct xfs_inode *ip = sc->ip;
+ struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;
- delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
+ delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
+ mp->m_metafile_resv_target;
if (delta == 0)
return 0;
@@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;
- give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
+ give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
- xfs_mod_delalloc(ip, 0, -give_back);
- xfs_add_fdblocks(ip->i_mount, give_back);
- ip->i_delayed_blks -= give_back;
+ xfs_mod_sb_delalloc(mp, -give_back);
+ xfs_add_fdblocks(mp, give_back);
+ mp->m_metafile_resv_avail -= give_back;
}
return 0;
@@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv(
/*
* Not enough reservation; try to take some blocks from the filesystem
- * to the metadata inode. @delta is negative here, so invert the sign.
+ * to the metabtree reservation.
*/
- delta = -delta;
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ delta = -delta; /* delta is negative here, so invert the sign. */
+ error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
-"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
- ip->i_ino);
+"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;
- xfs_mod_delalloc(ip, 0, delta);
- ip->i_delayed_blks += delta;
+ xfs_mod_sb_delalloc(mp, delta);
+ mp->m_metafile_resv_avail += delta;
return 0;
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e8c776a34c1d..d5ff8609dbfb 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
+ if (xfs_has_zoned(sc->mp)) {
+ if (!xfs_zone_rgbno_is_valid(rtg,
+ xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+ xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+ return;
+ }
+
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
index 257cfb24beb4..983362447826 100644
--- a/fs/xfs/scrub/rtrefcount_repair.c
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -697,32 +697,6 @@ err_cur:
return error;
}
-/*
- * Now that we've logged the roots of the new btrees, invalidate all of the
- * old blocks and free them.
- */
-STATIC int
-xrep_rtrefc_remove_old_tree(
- struct xrep_rtrefc *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrefcountbt
- * and aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc,
- &rr->old_rtrefcountbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrefcount inode so that we
- * don't fail to expand the btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;
- /* Kill the old tree. */
- error = xrep_rtrefc_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrefcountbt
+ * and aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc,
+ &rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index f2fdd7a9fc24..fc2592c53af5 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -810,28 +810,6 @@ err_cur:
/* Reaping the old btree. */
-/* Reap the old rtrmapbt blocks. */
-STATIC int
-xrep_rtrmap_remove_old_tree(
- struct xrep_rtrmap *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrmapbt and
- * aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrmap inode so that we don't
- * fail to expand the new btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@@ -995,8 +973,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;
- /* Kill the old tree. */
- error = xrep_rtrmap_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrmapbt and
+ * aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 6fa9e3e5bab7..9908850bf76f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,