diff options
author | Carlos Maiolino <cem@kernel.org> | 2025-03-04 13:25:46 +0300 |
---|---|---|
committer | Carlos Maiolino <cem@kernel.org> | 2025-03-04 13:25:46 +0300 |
commit | 4c6283ec9284bb72906dba83bc7a809747e6331e (patch) | |
tree | 6a2ed104fc86a90bb787ff0dbee020461e59ec14 /fs/xfs/xfs_fsmap.c | |
parent | 0a1fd78080c8c9a5582e82100bd91b87ae5ac57c (diff) | |
parent | 9c477912b2f58da71751f244aceecf5f8cc549ed (diff) | |
download | linux-4c6283ec9284bb72906dba83bc7a809747e6331e.tar.xz |
Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices
xfs: add support for zoned devices
Add support for the new zoned space allocator and thus for zoned devices:
https://zonedstorage.io/docs/introduction/zoned-storage
to XFS. This has been developed for and tested on both SMR hard drives,
which are the oldest and most common class of zoned devices:
https://zonedstorage.io/docs/introduction/smr
and ZNS SSDs:
https://zonedstorage.io/docs/introduction/zns
It has not been tested with zoned UFS devices, as their current capacity
points and performance characteristics aren't too interesting for XFS
use cases (but never say never).
Sequential write only zones are only supported for data using a new
allocator for the RT device, which maps each zone to a rtgroup which
is written sequentially. All metadata and (for now) the log require
using randomly writable space. This means a realtime device is required
to support zoned storage, but for the common case of SMR hard drives
that contain random writable zones and sequential write required zones
on the same block device, the concept of an internal RT device is added
which means using XFS on a SMR HDD is as simple as:
$ mkfs.xfs /dev/sda
$ mount /dev/sda /mnt
When using NVMe ZNS SSDs that do not support conventional zones, the
traditional multi-device RT configuration is required. E.g. for an
SSD with a conventional namespace 1 and a zoned namespace 2:
$ mkfs.xfs /dev/nvme0n1 -o rtdev=/dev/nvme0n2
$ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt
The zoned allocator can also be used on conventional block devices, or
on conventional zones (e.g. when using an SMR HDD as the external RT
device). For example using zoned XFS on normal SSDs shows very nice
performance advantages and write amplification reduction for intelligent
workloads like RocksDB.
Some work is still in progress or planned, but should not affect the
integration with the rest of XFS or the on-disk format:
- support for quotas
- support for reflinks
Note that the I/O path already supports reflink, but garbage collection
isn't refcount aware yet and would unshare shared blocks, thus rendering
the feature useless.
Diffstat (limited to 'fs/xfs/xfs_fsmap.c')
-rw-r--r-- | fs/xfs/xfs_fsmap.c | 86 |
1 files changed, 68 insertions, 18 deletions
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 1dbd2d75f7ae..a4bc1642fe56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt( struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; xfs_rtblock_t start_rtb; xfs_rtblock_t end_rtb; xfs_rgnumber_t start_rg, end_rg; uint64_t eofs; int error = 0; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (keys[0].fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; + return 0; + } + + /* Fabricate an rmap entry for space occupied by the data dev */ + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + } + + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + + min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; @@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt( } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? */ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -1126,7 +1166,7 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -1134,13 +1174,17 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the callers is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); if (use_rmap) handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; else @@ -1230,7 +1274,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } |