diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-13 22:02:07 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-13 22:02:07 +0300 |
commit | f02bf8578bd8dd400903291ccebc69665adc911c (patch) | |
tree | 9e05913ac5f4f13a2dd1b5db24f2d896d9fbb400 /fs/btrfs/volumes.c | |
parent | 7fef2edf7cc753b51f7ccc74993971b0a9c81eca (diff) | |
parent | ea32af47f00a046a1f953370514d6d946efe0152 (diff) | |
download | linux-f02bf8578bd8dd400903291ccebc69665adc911c.tar.xz |
Merge tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs zoned mode fixes from David Sterba:
- fix deadlock when allocating system chunk
- fix wrong mutex unlock on an error path
- fix extent map splitting for append operation
- update and fix message reporting unusable chunk space
- don't block when background zone reclaim runs with balance in
parallel
* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
btrfs: don't block if we can't acquire the reclaim lock
btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
btrfs: fix deadlock with concurrent chunk allocations involving system chunks
btrfs: zoned: print unusable percentage when reclaiming block groups
btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 355 |
1 files changed, 270 insertions, 85 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 807502cd6510..1e4d43ffe38b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1745,19 +1745,14 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); goto out; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to remove dev extent item"); - } else { + if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); - } out: btrfs_free_path(path); return ret; @@ -2942,7 +2937,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) u32 cur; struct btrfs_key key; - mutex_lock(&fs_info->chunk_mutex); + lockdep_assert_held(&fs_info->chunk_mutex); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; @@ -2972,7 +2967,6 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) cur += len; } } - mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -3012,6 +3006,29 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, return em; } +static int remove_chunk_item(struct btrfs_trans_handle *trans, + struct map_lookup *map, u64 chunk_offset) +{ + int i; + + /* + * Removing chunk items and updating the device items in the chunks btree + * requires holding the chunk_mutex. + * See the comment at btrfs_chunk_alloc() for the details. + */ + lockdep_assert_held(&trans->fs_info->chunk_mutex); + + for (i = 0; i < map->num_stripes; i++) { + int ret; + + ret = btrfs_update_device(trans, map->stripes[i].dev); + if (ret) + return ret; + } + + return btrfs_free_chunk(trans, chunk_offset); +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -3032,14 +3049,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return PTR_ERR(em); } map = em->map_lookup; - mutex_lock(&fs_info->chunk_mutex); - check_system_chunk(trans, map->type); - mutex_unlock(&fs_info->chunk_mutex); /* - * Take the device list mutex to prevent races with the final phase of - * a device replace operation that replaces the device object associated - * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + * First delete the device extent items from the devices btree. + * We take the device_list_mutex to avoid racing with the finishing phase + * of a device replace operation. See the comment below before acquiring + * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex + * because that can result in a deadlock when deleting the device extent + * items from the devices btree - COWing an extent buffer from the btree + * may result in allocating a new metadata chunk, which would attempt to + * lock again fs_info->chunk_mutex. */ mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { @@ -3061,18 +3080,73 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } + } + mutex_unlock(&fs_devices->device_list_mutex); - ret = btrfs_update_device(trans, device); + /* + * We acquire fs_info->chunk_mutex for 2 reasons: + * + * 1) Just like with the first phase of the chunk allocation, we must + * reserve system space, do all chunk btree updates and deletions, and + * update the system chunk array in the superblock while holding this + * mutex. This is for similar reasons as explained on the comment at + * the top of btrfs_chunk_alloc(); + * + * 2) Prevent races with the final phase of a device replace operation + * that replaces the device object associated with the map's stripes, + * because the device object's id can change at any time during that + * final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of + * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating + * the device item, which does not exists on the chunk btree. + * The finishing phase of device replace acquires both the + * device_list_mutex and the chunk_mutex, in that order, so we are + * safe by just acquiring the chunk_mutex. + */ + trans->removing_chunk = true; + mutex_lock(&fs_info->chunk_mutex); + + check_system_chunk(trans, map->type); + + ret = remove_chunk_item(trans, map, chunk_offset); + /* + * Normally we should not get -ENOSPC since we reserved space before + * through the call to check_system_chunk(). + * + * Despite our system space_info having enough free space, we may not + * be able to allocate extents from its block groups, because all have + * an incompatible profile, which will force us to allocate a new system + * block group with the right profile, or right after we called + * check_system_space() above, a scrub turned the only system block group + * with enough free space into RO mode. + * This is explained with more detail at do_chunk_alloc(). + * + * So if we get -ENOSPC, allocate a new system chunk and retry once. + */ + if (ret == -ENOSPC) { + const u64 sys_flags = btrfs_system_alloc_profile(fs_info); + struct btrfs_block_group *sys_bg; + + sys_bg = btrfs_alloc_chunk(trans, sys_flags); + if (IS_ERR(sys_bg)) { + ret = PTR_ERR(sys_bg); + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); goto out; } - } - mutex_unlock(&fs_devices->device_list_mutex); - ret = btrfs_free_chunk(trans, chunk_offset); - if (ret) { + ret = remove_chunk_item(trans, map, chunk_offset); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + } else if (ret) { btrfs_abort_transaction(trans, ret); goto out; } @@ -3087,6 +3161,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } } + mutex_unlock(&fs_info->chunk_mutex); + trans->removing_chunk = false; + + /* + * We are done with chunk btree updates and deletions, so release the + * system space we previously reserved (with check_system_chunk()). + */ + btrfs_trans_release_chunk_metadata(trans); + ret = btrfs_remove_block_group(trans, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3094,6 +3177,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } out: + if (trans->removing_chunk) { + mutex_unlock(&fs_info->chunk_mutex); + trans->removing_chunk = false; + } /* once for us */ free_extent_map(em); return ret; @@ -4860,13 +4947,12 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, u32 array_size; u8 *ptr; - mutex_lock(&fs_info->chunk_mutex); + lockdep_assert_held(&fs_info->chunk_mutex); + array_size = btrfs_super_sys_array_size(super_copy); if (array_size + item_size + sizeof(disk_key) - > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - mutex_unlock(&fs_info->chunk_mutex); + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; - } ptr = super_copy->sys_chunk_array + array_size; btrfs_cpu_key_to_disk(&disk_key, key); @@ -4875,7 +4961,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, memcpy(ptr, chunk, item_size); item_size += sizeof(disk_key); btrfs_set_super_sys_array_size(super_copy, array_size + item_size); - mutex_unlock(&fs_info->chunk_mutex); return 0; } @@ -5225,13 +5310,14 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } -static int create_chunk(struct btrfs_trans_handle *trans, +static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; struct map_lookup *map = NULL; struct extent_map_tree *em_tree; + struct btrfs_block_group *block_group; struct extent_map *em; u64 start = ctl->start; u64 type = ctl->type; @@ -5241,7 +5327,7 @@ static int create_chunk(struct btrfs_trans_handle *trans, map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); if (!map) - return -ENOMEM; + return ERR_PTR(-ENOMEM); map->num_stripes = ctl->num_stripes; for (i = 0; i < ctl->ndevs; ++i) { @@ -5263,7 +5349,7 @@ static int create_chunk(struct btrfs_trans_handle *trans, em = alloc_extent_map(); if (!em) { kfree(map); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; @@ -5279,12 +5365,12 @@ static int create_chunk(struct btrfs_trans_handle *trans, if (ret) { write_unlock(&em_tree->lock); free_extent_map(em); - return ret; + return ERR_PTR(ret); } write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); - if (ret) + block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); + if (IS_ERR(block_group)) goto error_del_extent; for (i = 0; i < map->num_stripes; i++) { @@ -5304,7 +5390,7 @@ static int create_chunk(struct btrfs_trans_handle *trans, check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); - return 0; + return block_group; error_del_extent: write_lock(&em_tree->lock); @@ -5316,34 +5402,36 @@ error_del_extent: /* One for the tree reference */ free_extent_map(em); - return ret; + return block_group; } -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; struct btrfs_device_info *devices_info = NULL; struct alloc_chunk_ctl ctl; + struct btrfs_block_group *block_group; int ret; lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { ASSERT(0); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (list_empty(&fs_devices->alloc_list)) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, "%s: no writable device", __func__); - return -ENOSPC; + return ERR_PTR(-ENOSPC); } if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); ASSERT(0); - return -EINVAL; + return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); @@ -5353,46 +5441,43 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = gather_device_info(fs_devices, &ctl, devices_info); - if (ret < 0) + if (ret < 0) { + block_group = ERR_PTR(ret); goto out; + } ret = decide_stripe_size(fs_devices, &ctl, devices_info); - if (ret < 0) + if (ret < 0) { + block_group = ERR_PTR(ret); goto out; + } - ret = create_chunk(trans, &ctl, devices_info); + block_group = create_chunk(trans, &ctl, devices_info); out: kfree(devices_info); - return ret; + return block_group; } /* - * Chunk allocation falls into two parts. The first part does work - * that makes the new allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. + * This function, btrfs_finish_chunk_alloc(), belongs to phase 2. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. */ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_key key; struct btrfs_device *device; - struct btrfs_chunk *chunk; - struct btrfs_stripe *stripe; struct extent_map *em; struct map_lookup *map; - size_t item_size; u64 dev_offset; u64 stripe_size; - int i = 0; + int i; int ret = 0; em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); @@ -5400,53 +5485,117 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return PTR_ERR(em); map = em->map_lookup; - item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; - chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) { - ret = -ENOMEM; - goto out; - } - /* * Take the device list mutex to prevent races with the final phase of * a device replace operation that replaces the device object associated * with the map's stripes, because the device object's id can change * at any time during that final phase of the device replace operation - * (dev-replace.c:btrfs_dev_replace_finishing()). + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * resulting in persisting a device extent item with such ID. */ mutex_lock(&fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; - ret = btrfs_update_device(trans, device); - if (ret) - break; ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, dev_offset, stripe_size); if (ret) break; } - if (ret) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + free_extent_map(em); + return ret; +} + +/* + * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the + * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system + * chunks. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + int i; + int ret; + + /* + * We take the chunk_mutex for 2 reasons: + * + * 1) Updates and insertions in the chunk btree must be done while holding + * the chunk_mutex, as well as updating the system chunk array in the + * superblock. See the comment on top of btrfs_chunk_alloc() for the + * details; + * + * 2) To prevent races with the final phase of a device replace operation + * that replaces the device object associated with the map's stripes, + * because the device object's id can change at any time during that + * final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * which would cause a failure when updating the device item, which does + * not exists, or persisting a stripe of the chunk item with such ID. + * Here we can't use the device_list_mutex because our caller already + * has locked the chunk_mutex, and the final phase of device replace + * acquires both mutexes - first the device_list_mutex and then the + * chunk_mutex. Using any of those two mutexes protects us from a + * concurrent device replace. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + + em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + btrfs_abort_transaction(trans, ret); + return ret; + } + + map = em->map_lookup; + item_size = btrfs_chunk_item_size(map->num_stripes); + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + + ret = btrfs_update_device(trans, device); + if (ret) + goto out; + } + stripe = &chunk->stripe; for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; + struct btrfs_device *device = map->stripes[i].dev; + const u64 dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); btrfs_set_stack_chunk_type(chunk, map->type); @@ -5458,15 +5607,18 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.type = BTRFS_CHUNK_ITEM_KEY; - key.offset = chunk_offset; + key.offset = bg->start; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - /* - * TODO: Cleanup of inserted chunk root in case of - * failure. - */ + if (ret) + goto out; + + bg->chunk_item_inserted = 1; + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); + if (ret) + goto out; } out: @@ -5479,16 +5631,41 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; - int ret; + struct btrfs_block_group *meta_bg; + struct btrfs_block_group *sys_bg; + + /* + * When adding a new device for sprouting, the seed device is read-only + * so we must first allocate a metadata and a system chunk. But before + * adding the block group items to the extent, device and chunk btrees, + * we must first: + * + * 1) Create both chunks without doing any changes to the btrees, as + * otherwise we would get -ENOSPC since the block groups from the + * seed device are read-only; + * + * 2) Add the device item for the new sprout device - finishing the setup + * of a new block group requires updating the device item in the chunk + * btree, so it must exist when we attempt to do it. The previous step + * ensures this does not fail with -ENOSPC. + * + * After that we can add the block group items to their btrees: + * update existing device item in the chunk btree, add a new block group + * item to the extent btree, add a new chunk item to the chunk btree and + * finally add the new device extent items to the devices btree. + */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - ret = btrfs_alloc_chunk(trans, alloc_profile); - if (ret) - return ret; + meta_bg = btrfs_alloc_chunk(trans, alloc_profile); + if (IS_ERR(meta_bg)) + return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - ret = btrfs_alloc_chunk(trans, alloc_profile); - return ret; + sys_bg = btrfs_alloc_chunk(trans, alloc_profile); + if (IS_ERR(sys_bg)) + return PTR_ERR(sys_bg); + + return 0; } static inline int btrfs_chunk_max_errors(struct map_lookup *map) @@ -7415,10 +7592,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; + + /* + * We are only called at mount time, so no need to take + * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, + * we always lock first fs_info->chunk_mutex before + * acquiring any locks on the chunk tree. This is a + * requirement for chunk allocation, see the comment on + * top of btrfs_chunk_alloc() for details. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - mutex_lock(&fs_info->chunk_mutex); ret = read_one_chunk(&found_key, leaf, chunk); - mutex_unlock(&fs_info->chunk_mutex); if (ret) goto error; } |