From ef6a31d035a1000071dc4846aebd02ad081db9e4 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:01 +0000 Subject: btrfs: add definitions and constants for remap-tree Add an incompat flag for the new remap-tree feature, and the constants and definitions needed to support it. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e8fd92789423..9165154a274d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17) struct btrfs_ioctl_feature_flags { __u64 compat_flags; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d..f011d34cb699 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -76,6 +76,9 @@ /* Tracks RAID stripes in block groups. */ #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL +/* Holds details of remapped addresses after relocation. */ +#define BTRFS_REMAP_TREE_OBJECTID 13ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL @@ -282,6 +285,10 @@ #define BTRFS_RAID_STRIPE_KEY 230 +#define BTRFS_IDENTITY_REMAP_KEY 234 +#define BTRFS_REMAP_KEY 235 +#define BTRFS_REMAP_BACKREF_KEY 236 + /* * Records the overall state of the qgroups. * There's only one instance of this key present, @@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -1323,4 +1331,13 @@ struct btrfs_verity_descriptor_item { __u8 encryption; } __attribute__ ((__packed__)); +/* + * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives + * the address that the start of the range will get remapped to. This + * structure is also shared by BTRFS_REMAP_BACKREF_KEY. + */ +struct btrfs_remap_item { + __le64 address; +} __attribute__ ((__packed__)); + #endif /* _BTRFS_CTREE_H_ */ -- cgit v1.2.3 From 0b4d29fa98ca1a49c4498353253f857573871ba0 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:02 +0000 Subject: btrfs: add METADATA_REMAP chunk type Add a new METADATA_REMAP chunk type, which is a metadata chunk that holds the remap tree. This is needed for bootstrapping purposes: the remap tree can't itself be remapped, and must be relocated the existing way, by COWing every leaf. The remap tree can't go in the SYSTEM chunk as space there is limited, because a copy of the chunk item gets placed in the superblock. The changes in fs/btrfs/volumes.h are because we're adding a new block group type bit after the profile bits, and so can no longer rely on the const_ilog2 trick. The sizing to 32MB per chunk, matching the SYSTEM chunk, is an estimate here, we can adjust it later if it proves to be too big or too small. This works out to be ~500,000 remap items, which for a 4KB block size covers ~2GB of remapped data in the worst case and ~500TB in the best case. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 8 ++++++++ fs/btrfs/block-rsv.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/fs.h | 2 ++ fs/btrfs/space-info.c | 13 ++++++++++++- fs/btrfs/sysfs.c | 2 ++ fs/btrfs/tree-checker.c | 13 +++++++++++-- fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 10 +++++++++- include/uapi/linux/btrfs_tree.h | 4 +++- 10 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 96cf7a162987..e823230c09b7 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_TREE_LOG_OBJECTID: root->block_rsv = &fs_info->treelog_rsv; break; + case BTRFS_REMAP_TREE_OBJECTID: + root->block_rsv = &fs_info->remap_block_rsv; + break; default: root->block_rsv = NULL; break; @@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP); + fs_info->remap_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; @@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->remap_block_rsv.size > 0); + WARN_ON(fs_info->remap_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 79ae9d05cd91..8359fb96bc3c 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -22,6 +22,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_REMAP, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, BTRFS_BLOCK_RSV_TREELOG, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faa1c2c20ecd..922e69038d81 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2751,6 +2751,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP); btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e3e5e52e97a2..195428ecfd75 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -509,6 +509,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* Block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for remap tree. */ + struct btrfs_block_rsv remap_block_rsv; /* Block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 1d76242f5e0d..2c9cf1ab232b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) return SZ_32M; /* Handle BTRFS_BLOCK_GROUP_METADATA */ @@ -348,6 +348,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); @@ -356,7 +358,15 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; + } + + if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; + ret = create_space_info(fs_info, flags); } + out: return ret; } @@ -611,6 +621,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, remap_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8834a1dd499c..27bfb7b55ec4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1929,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info) case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; + case BTRFS_BLOCK_GROUP_METADATA_REMAP: + return "metadata-remap"; default: WARN_ON(1); return "invalid-combination"; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index aedc208a95b8..a6c158cd8fcd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -748,17 +748,26 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP && + !btrfs_fs_incompat(fs_info, REMAP_TREE))) { + block_group_err(leaf, slot, +"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag", + flags); + return -EUCLEAN; + } + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && + type != BTRFS_BLOCK_GROUP_METADATA_REMAP && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2b7352eb7cb..eda6505f3ee5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,9 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + /* Block groups containing the remap tree. */ + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap"); + /* Block group that has been remapped. */ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59347a4bb185..e4b3cb50f94a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ @@ -80,6 +79,15 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; +static_assert(BTRFS_RAID_RAID0 == 1); +static_assert(BTRFS_RAID_RAID1 == 2); +static_assert(BTRFS_RAID_DUP == 3); +static_assert(BTRFS_RAID_RAID10 == 4); +static_assert(BTRFS_RAID_RAID5 == 5); +static_assert(BTRFS_RAID_RAID6 == 6); +static_assert(BTRFS_RAID_RAID1C3 == 7); +static_assert(BTRFS_RAID_RAID1C4 == 8); + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index f011d34cb699..76578426671c 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1169,12 +1169,14 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) #define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) +#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) + BTRFS_BLOCK_GROUP_METADATA | \ + BTRFS_BLOCK_GROUP_METADATA_REMAP) #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ -- cgit v1.2.3 From 7977011460cffc6f5a0cd830584c832c4aa07076 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:07 +0000 Subject: btrfs: add extended version of struct block_group_item Add a struct btrfs_block_group_item_v2, which is used in the block group tree if the remap-tree incompat flag is set. This adds two new fields to the block group item: `remap_bytes` and `identity_remap_count`. `remap_bytes` records the amount of data that's physically within this block group, but nominally in another, remapped block group. This is necessary because this data will need to be moved first if this block group is itself relocated. If `remap_bytes` > 0, this is an indicator to the relocation thread that it will need to search the remap-tree for backrefs. A block group must also have `remap_bytes` == 0 before it can be dropped. `identity_remap_count` records how many identity remap items are located in the remap tree for this block group. When relocation is begun for this block group, this is set to the number of holes in the free-space tree for this range. As identity remaps are converted into actual remaps by the relocation process, this number is decreased. Once it reaches 0, either because of relocation or because extents have been deleted, the block group has been fully remapped and its chunk's device extents are removed. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 20 +++++++++ fs/btrfs/block-group.c | 92 ++++++++++++++++++++++++++++++----------- fs/btrfs/block-group.h | 10 ++++- fs/btrfs/discard.c | 2 +- fs/btrfs/tree-checker.c | 10 ++++- include/uapi/linux/btrfs_tree.h | 8 ++++ 6 files changed, 114 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 09cdd6bfddf5..9797f9e8d4e5 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_block_group_item_v2 */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2, + used, 64); +BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags, + struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes, + struct btrfs_block_group_item_v2, remap_bytes, 64); +BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2, + remap_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count, + struct btrfs_block_group_item_v2, identity_remap_count, 32); +BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2, + identity_remap_count, 32); + /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5709acc84297..a1ab513fa8ea 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2371,7 +2371,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) } static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_block_group_item *bgi, + struct btrfs_block_group_item_v2 *bgi, const struct btrfs_key *key, int need_clear) { @@ -2386,11 +2386,15 @@ static int read_one_block_group(struct btrfs_fs_info *info, return -ENOMEM; cache->length = key->offset; - cache->used = btrfs_stack_block_group_used(bgi); + cache->used = btrfs_stack_block_group_v2_used(bgi); cache->last_used = cache->used; - cache->flags = btrfs_stack_block_group_flags(bgi); - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->flags = btrfs_stack_block_group_v2_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); + cache->last_remap_bytes = cache->remap_bytes; + cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi); + cache->last_identity_remap_count = cache->identity_remap_count; btrfs_set_free_space_tree_thresholds(cache); @@ -2455,7 +2459,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, } else if (cache->length == cache->used) { cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); - } else if (cache->used == 0) { + } else if (cache->used == 0 && cache->remap_bytes == 0) { cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, cache->start + cache->length, NULL); @@ -2475,7 +2479,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { - if (cache->used == 0) { + if (cache->used == 0 && cache->remap_bytes == 0) { ASSERT(list_empty(&cache->bg_list)); if (btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_discard_queue_work(&info->discard_ctl, cache); @@ -2579,9 +2583,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct extent_buffer *leaf; int slot; + size_t size; ret = find_first_block_group(info, path, &key); if (ret > 0) @@ -2592,8 +2597,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) leaf = path->nodes[0]; slot = path->slots[0]; + if (btrfs_fs_incompat(info, REMAP_TREE)) { + size = sizeof(struct btrfs_block_group_item_v2); + } else { + size = sizeof(struct btrfs_block_group_item); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0); + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); + size); btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_release_path(path); @@ -2663,25 +2676,34 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; u64 old_last_used; + size_t size; int ret; spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&bgi, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - block_group->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count); old_last_used = block_group->last_used; block_group->last_used = block_group->used; + block_group->last_remap_bytes = block_group->remap_bytes; + block_group->last_identity_remap_count = block_group->identity_remap_count; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + size = sizeof(struct btrfs_block_group_item_v2); + else + size = sizeof(struct btrfs_block_group_item); + + ret = btrfs_insert_item(trans, root, &key, &bgi, size); if (ret < 0) { spin_lock(&block_group->lock); block_group->last_used = old_last_used; @@ -3132,10 +3154,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_key key; - u64 old_last_used; - u64 used; + u64 old_last_used, old_last_remap_bytes; + u32 old_last_identity_remap_count; + u64 used, remap_bytes; + u32 identity_remap_count; /* * Block group items update can be triggered out of commit transaction @@ -3145,13 +3169,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, */ spin_lock(&cache->lock); old_last_used = cache->last_used; + old_last_remap_bytes = cache->last_remap_bytes; + old_last_identity_remap_count = cache->last_identity_remap_count; used = cache->used; - /* No change in used bytes, can safely skip it. */ - if (cache->last_used == used) { + remap_bytes = cache->remap_bytes; + identity_remap_count = cache->identity_remap_count; + /* No change in values, can safely skip it. */ + if (cache->last_used == used && + cache->last_remap_bytes == remap_bytes && + cache->last_identity_remap_count == identity_remap_count) { spin_unlock(&cache->lock); return 0; } cache->last_used = used; + cache->last_remap_bytes = remap_bytes; + cache->last_identity_remap_count = identity_remap_count; spin_unlock(&cache->lock); key.objectid = cache->start; @@ -3167,11 +3199,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - cache->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, cache->flags); - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_set_stack_block_group_v2_used(&bgi, used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags); + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, + cache->identity_remap_count); + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item_v2)); + } else { + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item)); + } + fail: btrfs_release_path(path); /* @@ -3186,6 +3228,8 @@ fail: if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->last_used = old_last_used; + cache->last_remap_bytes = old_last_remap_bytes; + cache->last_identity_remap_count = old_last_identity_remap_count; spin_unlock(&cache->lock); } return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index b0fb85a36d97..ecabb1a9fc0e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -129,6 +129,8 @@ struct btrfs_block_group { u64 flags; u64 cache_generation; u64 global_root_id; + u64 remap_bytes; + u32 identity_remap_count; /* * The last committed used bytes of this block group, if the above @used @@ -136,6 +138,11 @@ struct btrfs_block_group { * group item of this block group. */ u64 last_used; + /* The last committed remap_bytes value of this block group. */ + u64 last_remap_bytes; + /* The last commited identity_remap_count value of this block group. */ + u32 last_identity_remap_count; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. @@ -282,7 +289,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || + bg->remap_bytes > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 89fe85778115..ee5f5b2788e1 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; - if (block_group->used == 0) + if (block_group->used == 0 && block_group->remap_bytes == 0) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ead2e1e2a0bb..452394b34d01 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf, u64 chunk_objectid; u64 flags; u64 type; + size_t exp_size; /* * Here we don't really care about alignment since extent allocator can @@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (unlikely(item_size != sizeof(bgi))) { + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + exp_size = sizeof(struct btrfs_block_group_item_v2); + else + exp_size = sizeof(struct btrfs_block_group_item); + + if (unlikely(item_size != exp_size)) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", - item_size, sizeof(bgi)); + item_size, exp_size); return -EUCLEAN; } diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 76578426671c..86820a9644e8 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1229,6 +1229,14 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_block_group_item_v2 { + __le64 used; + __le64 chunk_objectid; + __le64 flags; + __le64 remap_bytes; + __le32 identity_remap_count; +} __attribute__ ((__packed__)); + struct btrfs_free_space_info { __le32 extent_count; __le32 flags; -- cgit v1.2.3 From 8620da16fb6be1fd9906374fa1c763a10c6918df Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:08 +0000 Subject: btrfs: allow mounting filesystems with remap-tree incompat flag If we encounter a filesystem with the remap-tree incompat flag set, validate its compatibility with the other flags, and load the remap tree using the values that have been added to the superblock. The remap-tree feature depends on the free-space-tree, but no-holes and block-group-tree have been made dependencies to reduce the testing matrix. Similarly I'm not aware of any reason why mixed-bg and zoned would be incompatible with remap-tree, but this is blocked for the time being until it can be fully tested. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 2 + fs/btrfs/accessors.h | 6 +++ fs/btrfs/disk-io.c | 105 +++++++++++++++++++++++++++++++++++----- fs/btrfs/extent-tree.c | 2 + fs/btrfs/fs.h | 4 +- fs/btrfs/transaction.c | 7 +++ include/uapi/linux/btrfs_tree.h | 5 +- 7 files changed, 116 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 423122786a93..ede184b6eda1 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -116,4 +116,6 @@ config BTRFS_EXPERIMENTAL - asynchronous checksum generation for data writes + - remap-tree - logical address remapping tree + If unsure, say N. diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 9797f9e8d4e5..8938357fcb40 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -883,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block, + remap_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block, + remap_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block, + remap_root_level, 8); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cd46b9d85880..c69734c74c26 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1136,6 +1136,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); + case BTRFS_REMAP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->remap_root); default: return NULL; } @@ -1226,6 +1228,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); + btrfs_put_root(fs_info->remap_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1778,6 +1781,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); + free_root_extent_buffers(info->remap_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2191,21 +2195,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) goto out; - /* - * This tree can share blocks with some other fs tree during relocation - * and we need a proper setup by btrfs_get_fs_root - */ - root = btrfs_get_fs_root(tree_root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID, true); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - ret = PTR_ERR(root); - goto out; + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* The remap_root has already been loaded in load_important_roots(). */ + root = fs_info->remap_root; + + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + /* Check that data reloc tree doesn't also exist. */ + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root = btrfs_read_tree_root(fs_info->tree_root, &location); + if (!IS_ERR(root)) { + btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); + btrfs_put_root(root); + return -EIO; + } else if (PTR_ERR(root) != -ENOENT) { + btrfs_warn(fs_info, "error %ld when checking for data reloc tree", + PTR_ERR(root)); } } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; + /* + * This tree can share blocks with some other fs tree during + * relocation and we need a proper setup by btrfs_get_fs_root(). + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + } } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; @@ -2445,6 +2472,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* + * Reduce test matrix for remap tree by requiring block-group-tree + * and no-holes. Free-space-tree is a hard requirement. + */ + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + btrfs_err(fs_info, +"remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, ZONED)) { + btrfs_err(fs_info, "remap-tree not supported with zoned devices"); + ret = -EINVAL; + } + + if (sectorsize > PAGE_SIZE) { + btrfs_err(fs_info, "remap-tree not supported when block size > page size"); + ret = -EINVAL; + } + } + /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -2603,6 +2659,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + bytenr = btrfs_super_remap_root(sb); + gen = btrfs_super_remap_root_generation(sb); + level = btrfs_super_remap_root_level(sb); + ret = load_super_root(fs_info->remap_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read remap root"); + return ret; + } + } + return 0; } @@ -3231,6 +3299,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; + struct btrfs_root *remap_root; int ret; int level; @@ -3365,6 +3434,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret < 0) goto fail_alloc; + if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID, + GFP_KERNEL); + fs_info->remap_root = remap_root; + if (!remap_root) { + ret = -ENOMEM; + goto fail_alloc; + } + } + /* * At this point our mount options are validated, if we set ->max_inline * to something non-standard make sure we truncate it to sectorsize. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 48a453fa3063..ce4bda1f37ad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2593,6 +2593,8 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; + else if (root == fs_info->remap_root) + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; else flags = BTRFS_BLOCK_GROUP_METADATA; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 195428ecfd75..13b0aa0b9da9 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -315,7 +315,8 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \ + BTRFS_FEATURE_INCOMPAT_REMAP_TREE) #else @@ -475,6 +476,7 @@ struct btrfs_fs_info { struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; struct btrfs_root *stripe_root; + struct btrfs_root *remap_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e2f993b1783f..f4cc9e1a1b93 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1967,6 +1967,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + root_item = &fs_info->remap_root->root_item; + super->remap_root = root_item->bytenr; + super->remap_root_generation = root_item->generation; + super->remap_root_level = root_item->level; + } } int btrfs_transaction_blocked(struct btrfs_fs_info *info) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 86820a9644e8..f7843e6bb978 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -721,9 +721,12 @@ struct btrfs_super_block { __u8 metadata_uuid[BTRFS_FSID_SIZE]; __u64 nr_global_roots; + __le64 remap_root; + __le64 remap_root_generation; + __u8 remap_root_level; /* Future expansion */ - __le64 reserved[27]; + __u8 reserved[199]; __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; -- cgit v1.2.3