diff options
author | Carlos Maiolino <cem@kernel.org> | 2025-03-04 13:25:46 +0300 |
---|---|---|
committer | Carlos Maiolino <cem@kernel.org> | 2025-03-04 13:25:46 +0300 |
commit | 4c6283ec9284bb72906dba83bc7a809747e6331e (patch) | |
tree | 6a2ed104fc86a90bb787ff0dbee020461e59ec14 /fs/xfs/xfs_super.c | |
parent | 0a1fd78080c8c9a5582e82100bd91b87ae5ac57c (diff) | |
parent | 9c477912b2f58da71751f244aceecf5f8cc549ed (diff) | |
download | linux-4c6283ec9284bb72906dba83bc7a809747e6331e.tar.xz |
Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices
xfs: add support for zoned devices
Add support for the new zoned space allocator and thus for zoned devices:
https://zonedstorage.io/docs/introduction/zoned-storage
to XFS. This has been developed for and tested on both SMR hard drives,
which are the oldest and most common class of zoned devices:
https://zonedstorage.io/docs/introduction/smr
and ZNS SSDs:
https://zonedstorage.io/docs/introduction/zns
It has not been tested with zoned UFS devices, as their current capacity
points and performance characteristics aren't too interesting for XFS
use cases (but never say never).
Sequential write only zones are only supported for data using a new
allocator for the RT device, which maps each zone to a rtgroup which
is written sequentially. All metadata and (for now) the log require
using randomly writable space. This means a realtime device is required
to support zoned storage, but for the common case of SMR hard drives
that contain random writable zones and sequential write required zones
on the same block device, the concept of an internal RT device is added
which means using XFS on a SMR HDD is as simple as:
$ mkfs.xfs /dev/sda
$ mount /dev/sda /mnt
When using NVMe ZNS SSDs that do not support conventional zones, the
traditional multi-device RT configuration is required. E.g. for an
SSD with a conventional namespace 1 and a zoned namespace 2:
$ mkfs.xfs /dev/nvme0n1 -o rtdev=/dev/nvme0n2
$ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt
The zoned allocator can also be used on conventional block devices, or
on conventional zones (e.g. when using an SMR HDD as the external RT
device). For example using zoned XFS on normal SSDs shows very nice
performance advantages and write amplification reduction for intelligent
workloads like RocksDB.
Some work is still in progress or planned, but should not affect the
integration with the rest of XFS or the on-disk format:
- support for quotas
- support for reflinks
Note that the I/O path already supports reflink, but garbage collection
isn't refcount aware yet and would unshare shared blocks, thus rendering
the feature useless.
Diffstat (limited to 'fs/xfs/xfs_super.c')
-rw-r--r-- | fs/xfs/xfs_super.c | 165 |
1 files changed, 129 insertions, 36 deletions
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..af5e63cb6a99 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -109,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), {} }; @@ -182,6 +187,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -233,6 +239,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + return 0; } @@ -533,7 +542,15 @@ xfs_setup_devices( if (error) return error; } - if (mp->m_rtdev_targp) { + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) @@ -757,7 +774,7 @@ xfs_mount_free( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -814,6 +831,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -834,10 +852,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. @@ -856,8 +876,9 @@ xfs_statfs_rt( struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, - percpu_counter_sum_positive(&mp->m_frextents)); - st->f_blocks = mp->m_sb.sb_rblocks; + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); } static void @@ -922,24 +943,32 @@ xfs_fs_statfs( } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; + uint64_t resblks; + enum xfs_free_counter i; - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); - - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* @@ -976,6 +1005,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -997,6 +1027,7 @@ xfs_fs_unfreeze( * filesystem. */ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1058,6 +1089,19 @@ xfs_finish_flags( return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1065,7 +1109,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1075,30 +1120,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i > 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1210,6 +1258,18 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1436,6 +1497,15 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1780,8 +1850,17 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } - if (xfs_has_metadir(mp)) + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && @@ -1793,6 +1872,13 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; @@ -1917,6 +2003,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1961,6 +2050,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. */ xfs_fs_unreserve_ag_blocks(mp); @@ -2082,6 +2174,7 @@ xfs_init_fs_context( for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; |