From cf89af146b7e62af55470cf5f3ec3c56ec144a5e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 30 Oct 2020 06:53:56 +0800 Subject: btrfs: dev-replace: fail mount if we don't have replace item with target device If there is a device BTRFS_DEV_REPLACE_DEVID without the device replace item, then it means the filesystem is inconsistent state. This is either corruption or a crafted image. Fail the mount as this needs a closer look what is actually wrong. As of now if BTRFS_DEV_REPLACE_DEVID is present without the replace item, in __btrfs_free_extra_devids() we determine that there is an extra device, and free those extra devices but continue to mount the device. However, we were wrong in keeping tack of the rw_devices so the syzbot testcase failed: WARNING: CPU: 1 PID: 3612 at fs/btrfs/volumes.c:1166 close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 3612 Comm: syz-executor.2 Not tainted 5.9.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 panic+0x347/0x7c0 kernel/panic.c:231 __warn.cold+0x20/0x46 kernel/panic.c:600 report_bug+0x1bd/0x210 lib/bug.c:198 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536 RIP: 0010:close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 RSP: 0018:ffffc900091777e0 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffffffffffff RCX: ffffc9000c8b7000 RDX: 0000000000040000 RSI: ffffffff83097f47 RDI: 0000000000000007 RBP: dffffc0000000000 R08: 0000000000000001 R09: ffff8880988a187f R10: 0000000000000000 R11: 0000000000000001 R12: ffff88809593a130 R13: ffff88809593a1ec R14: ffff8880988a1908 R15: ffff88809593a050 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 open_ctree+0x4984/0x4a2d fs/btrfs/disk-io.c:3434 btrfs_fill_super fs/btrfs/super.c:1316 [inline] btrfs_mount_root.cold+0x14/0x165 fs/btrfs/super.c:1672 The fix here is, when we determine that there isn't a replace item then fail the mount if there is a replace target device (devid 0). CC: stable@vger.kernel.org # 4.19+ Reported-by: syzbot+4cfe71a4da060be47502@syzkaller.appspotmail.com Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5b9e3f3ace22..10638537b9ef 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -91,6 +91,17 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { no_valid_dev_replace_entry_found: + /* + * We don't have a replace item or it's corrupted. If there is + * a replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "found replace target device without a valid replace item"); + ret = -EUCLEAN; + goto out; + } ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; @@ -143,8 +154,19 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - dev_replace->srcdev = NULL; - dev_replace->tgtdev = NULL; + /* + * We don't have an active replace item but if there is a + * replace target, fail the mount. + */ + if (btrfs_find_device(fs_info->fs_devices, + BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + btrfs_err(fs_info, + "replace devid present without an active replace item"); + ret = -EUCLEAN; + } else { + dev_replace->srcdev = NULL; + dev_replace->tgtdev = NULL; + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: -- cgit v1.2.3 From b2598edf8b36f8b7c52e3f5f611c49cbd1c67b36 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 Nov 2020 13:49:43 +0800 Subject: btrfs: remove unused argument seed from btrfs_find_device Commit 343694eee8d8 ("btrfs: switch seed device to list api"), missed to check if the parameter seed is true in the function btrfs_find_device(). This tells it whether to traverse the seed device list or not. After this commit, the argument is unused and can be removed. In device_list_add() it's not necessary because fs_devices always points to the device's fs_devices. So with the devid+uuid matching, it will find the right device and return, thus not needing to traverse seed devices. Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++---- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 22 ++++++++++------------ fs/btrfs/volumes.h | 2 +- 5 files changed, 19 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 10638537b9ef..dc3481014f16 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -96,7 +96,7 @@ no_valid_dev_replace_entry_found: * a replace target, fail the mount. */ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -159,7 +159,7 @@ no_valid_dev_replace_entry_found: * replace target, fail the mount. */ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -171,10 +171,10 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL, true); + src_devid, NULL, NULL); dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, - NULL, NULL, true); + NULL, NULL); /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2904f92c3813..e7ccba83e8a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1717,7 +1717,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -3360,7 +3360,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, rcu_read_lock(); dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL, true); + NULL); if (!dev) { ret = -ENODEV; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 57ee06d92150..74acb7ca6eda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3885,7 +3885,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4062,7 +4062,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (dev) sctx = dev->scrub_ctx; if (sctx) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd72bdc7455e..7132af058a95 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -822,7 +822,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } else { mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL, false); + disk_super->dev_item.uuid, NULL); /* * If this disk has been pulled into an fs devices created by @@ -2294,10 +2294,10 @@ static struct btrfs_device *btrfs_find_device_by_path( dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid, true); + disk_super->metadata_uuid); else device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid, true); + disk_super->fsid); btrfs_release_disk_super(disk_super); if (!device) @@ -2317,7 +2317,7 @@ struct btrfs_device *btrfs_find_device_by_devspec( if (devid) { device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL, true); + NULL); if (!device) return ERR_PTR(-ENOENT); return device; @@ -2466,7 +2466,7 @@ next_slot: read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -6459,8 +6459,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, - bool seed) + u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; @@ -6667,7 +6666,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL, true); + devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -6806,7 +6805,7 @@ static int read_one_dev(struct extent_buffer *leaf, } device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7469,8 +7468,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, - true); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7601,7 +7599,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device bondary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7d1faf1c4ce4..5e08274d1252 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -466,7 +466,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, bool seed); + u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 5b316468983dfa9473ff0f1c42e4e30b4c267141 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:07 +0900 Subject: btrfs: get zone information of zoned block devices If a zoned block device is found, get its zone information (number of zones and zone size). To avoid costly run-time zone report commands to test the device zones type during block allocation, attach the seq_zones bitmap to the device structure to indicate if a zone is sequential or accept random writes. Also it attaches the empty_zones bitmap to indicate if a zone is empty or not. This patch also introduces the helper function btrfs_dev_is_sequential() to test if the zone storing a block is a sequential write required zone and btrfs_dev_is_empty_zone() to test if the zone is a empty zone. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Damien Le Moal Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 1 + fs/btrfs/dev-replace.c | 5 ++ fs/btrfs/super.c | 5 ++ fs/btrfs/volumes.c | 20 +++++- fs/btrfs/volumes.h | 4 ++ fs/btrfs/zoned.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 87 +++++++++++++++++++++++++ 7 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 fs/btrfs/zoned.c create mode 100644 fs/btrfs/zoned.h (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e738f6206ea5..0497fdc37f90 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,6 +16,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o +btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dc3481014f16..71a1bddcbc76 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -21,6 +21,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" +#include "zoned.h" /* * Device replace overview @@ -313,6 +314,10 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error; + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 348f8899f4f4..3c39e363aec5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2513,6 +2513,11 @@ static void __init btrfs_print_mod_info(void) #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ", zoned=yes" +#else + ", zoned=no" #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7132af058a95..f250bf4dc91f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -31,6 +31,7 @@ #include "space-info.h" #include "block-group.h" #include "discard.h" +#include "zoned.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device) rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); + btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -667,6 +669,10 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; + ret = btrfs_get_dev_zone_info(device); + if (ret != 0) + goto error_free_page; + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1137,6 +1143,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->bdev = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); @@ -2541,10 +2548,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_assign_pointer(device->name, name); + device->fs_info = fs_info; + device->bdev = bdev; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error_free_device; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto error_free_device; + goto error_free_zone; } q = bdev_get_queue(bdev); @@ -2557,8 +2571,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; - device->fs_info = fs_info; - device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; @@ -2705,6 +2717,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); +error_free_zone: + btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5e08274d1252..1997a4649a66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -52,6 +52,8 @@ struct btrfs_io_geometry { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +struct btrfs_zoned_device_info; + struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ struct list_head dev_alloc_list; /* chunk mutex */ @@ -65,6 +67,8 @@ struct btrfs_device { struct block_device *bdev; + struct btrfs_zoned_device_info *zone_info; + /* the mode sent to blkdev_get */ fmode_t mode; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c new file mode 100644 index 000000000000..38d7bf45b83f --- /dev/null +++ b/fs/btrfs/zoned.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "ctree.h" +#include "volumes.h" +#include "zoned.h" +#include "rcu-string.h" + +/* Maximum number of zones to report per blkdev_report_zones() call */ +#define BTRFS_REPORT_NR_ZONES 4096 + +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct blk_zone *zones = data; + + memcpy(&zones[idx], zone, sizeof(*zone)); + + return 0; +} + +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) +{ + int ret; + + if (!*nr_zones) + return 0; + + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, + copy_zone_info_cb, zones); + if (ret < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read zone %llu on %s (devid %llu)", + pos, rcu_str_deref(device->name), + device->devid); + return ret; + } + *nr_zones = ret; + if (!ret) + return -EIO; + + return 0; +} + +int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = NULL; + struct block_device *bdev = device->bdev; + sector_t nr_sectors; + sector_t sector = 0; + struct blk_zone *zones = NULL; + unsigned int i, nreported = 0, nr_zones; + unsigned int zone_sectors; + int ret; + + if (!bdev_is_zoned(bdev)) + return 0; + + if (device->zone_info) + return 0; + + zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return -ENOMEM; + + nr_sectors = bdev->bd_part->nr_sects; + zone_sectors = bdev_zone_sectors(bdev); + /* Check if it's power of 2 (see is_power_of_2) */ + ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) { + ret = -ENOMEM; + goto out; + } + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) { + ret = -ENOMEM; + goto out; + } + + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + ret = -ENOMEM; + goto out; + } + + /* Get zones type */ + while (sector < nr_sectors) { + nr_zones = BTRFS_REPORT_NR_ZONES; + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, + &nr_zones); + if (ret) + goto out; + + for (i = 0; i < nr_zones; i++) { + if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(nreported, zone_info->seq_zones); + if (zones[i].cond == BLK_ZONE_COND_EMPTY) + __set_bit(nreported, zone_info->empty_zones); + nreported++; + } + sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; + } + + if (nreported != zone_info->nr_zones) { + btrfs_err_in_rcu(device->fs_info, + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), nreported, + zone_info->nr_zones); + ret = -EIO; + goto out; + } + + kfree(zones); + + device->zone_info = zone_info; + + /* device->fs_info is not safe to use for printing messages */ + btrfs_info_in_rcu(NULL, + "host-%s zoned block device %s, %u zones of %llu bytes", + bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", + rcu_str_deref(device->name), zone_info->nr_zones, + zone_info->zone_size); + + return 0; + +out: + kfree(zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->seq_zones); + kfree(zone_info); + + return ret; +} + +void btrfs_destroy_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return; + + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + kfree(zone_info); + device->zone_info = NULL; +} + +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + unsigned int nr_zones = 1; + int ret; + + ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); + if (ret != 0 || !nr_zones) + return ret ? ret : -EIO; + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h new file mode 100644 index 000000000000..00c871f09d03 --- /dev/null +++ b/fs/btrfs/zoned.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ZONED_H +#define BTRFS_ZONED_H + +#include + +struct btrfs_zoned_device_info { + /* + * Number of zones, zone size and types of zones if bdev is a + * zoned block device. + */ + u64 zone_size; + u8 zone_size_shift; + u32 nr_zones; + unsigned long *seq_zones; + unsigned long *empty_zones; +}; + +#ifdef CONFIG_BLK_DEV_ZONED +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone); +int btrfs_get_dev_zone_info(struct btrfs_device *device); +void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + return 0; +} + +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + return 0; +} + +static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } + +#endif + +static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return false; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones); +} + +static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return true; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device, + u64 pos, bool set) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno; + + if (!zone_info) + return; + + zno = pos >> zone_info->zone_size_shift; + if (set) + set_bit(zno, zone_info->empty_zones); + else + clear_bit(zno, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, true); +} + +static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, false); +} + +#endif -- cgit v1.2.3 From b70f509774ad4b75d4253ad23b65c35d89402026 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 10 Nov 2020 20:26:08 +0900 Subject: btrfs: check and enable ZONED mode Introduce function btrfs_check_zoned_mode() to check if ZONED flag is enabled on the file system and if the file system consists of zoned devices with equal zone size. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 14 +++++++++ fs/btrfs/dev-replace.c | 7 +++++ fs/btrfs/disk-io.c | 10 +++++++ fs/btrfs/super.c | 1 + fs/btrfs/volumes.c | 5 ++++ fs/btrfs/zoned.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 26 ++++++++++++++++ 7 files changed, 143 insertions(+) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0c6e79c43f9..8f20219e2caa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -956,6 +956,15 @@ struct btrfs_fs_info { /* Type of exclusive operation running */ unsigned long exclusive_operation; + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + union { + u64 zone_size; + u64 zoned; + }; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -3677,4 +3686,9 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zoned != 0; +} + #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 71a1bddcbc76..a98e33f232d5 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -260,6 +260,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + btrfs_err(fs_info, + "dev-replace: zoned type of target device mismatch with filesystem"); + ret = -EINVAL; + goto error; + } + sync_blockdev(bdev); list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 37b2e0aad162..32e29e2bc99a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "block-group.h" #include "discard.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -3046,6 +3047,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); + fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); + /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -3202,6 +3205,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_block_groups; } + ret = btrfs_check_zoned_mode(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize zoned mode: %d", + ret); + goto fail_block_groups; + } + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3c39e363aec5..b754205f21d0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -44,6 +44,7 @@ #include "backref.h" #include "space-info.h" #include "sysfs.h" +#include "zoned.h" #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f250bf4dc91f..cf3f4975107f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2515,6 +2515,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 38d7bf45b83f..56bb701e3b53 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -166,3 +166,83 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } + +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 zoned_devices = 0; + u64 nr_devices = 0; + u64 zone_size = 0; + const bool incompat_zoned = btrfs_is_zoned(fs_info); + int ret = 0; + + /* Count zoned devices */ + list_for_each_entry(device, &fs_devices->devices, dev_list) { + enum blk_zoned_model model; + + if (!device->bdev) + continue; + + model = bdev_zoned_model(device->bdev); + if (model == BLK_ZONED_HM || + (model == BLK_ZONED_HA && incompat_zoned)) { + zoned_devices++; + if (!zone_size) { + zone_size = device->zone_info->zone_size; + } else if (device->zone_info->zone_size != zone_size) { + btrfs_err(fs_info, + "zoned: unequal block device zone sizes: have %llu found %llu", + device->zone_info->zone_size, + zone_size); + ret = -EINVAL; + goto out; + } + } + nr_devices++; + } + + if (!zoned_devices && !incompat_zoned) + goto out; + + if (!zoned_devices && incompat_zoned) { + /* No zoned block device found on ZONED filesystem */ + btrfs_err(fs_info, + "zoned: no zoned devices found on a zoned filesystem"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices && !incompat_zoned) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices != nr_devices) { + btrfs_err(fs_info, + "zoned: cannot mix zoned and regular devices"); + ret = -EINVAL; + goto out; + } + + /* + * stripe_size is always aligned to BTRFS_STRIPE_LEN in + * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * check the alignment here. + */ + if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, + "zoned: zone size %llu not aligned to stripe %u", + zone_size, BTRFS_STRIPE_LEN); + ret = -EINVAL; + goto out; + } + + fs_info->zone_size = zone_size; + + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); +out: + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 00c871f09d03..e4ee1a50be2d 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -4,6 +4,7 @@ #define BTRFS_ZONED_H #include +#include struct btrfs_zoned_device_info { /* @@ -22,6 +23,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -36,6 +38,15 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } +static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_err(fs_info, "zoned block devices support is not enabled"); + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -84,4 +95,19 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p btrfs_dev_set_empty_zone_bit(device, pos, false); } +static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, + struct block_device *bdev) +{ + u64 zone_size; + + if (btrfs_is_zoned(fs_info)) { + zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; + /* Do not allow non-zoned device */ + return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + } + + /* Do not allow Host Manged zoned device */ + return bdev_zoned_model(bdev) != BLK_ZONED_HM; +} + #endif -- cgit v1.2.3