diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/block-group.c | 47 | ||||
-rw-r--r-- | fs/btrfs/block-group.h | 4 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 3 | ||||
-rw-r--r-- | fs/btrfs/dev-replace.c | 5 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 43 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 30 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 33 | ||||
-rw-r--r-- | fs/btrfs/file.c | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 21 | ||||
-rw-r--r-- | fs/btrfs/root-tree.c | 5 | ||||
-rw-r--r-- | fs/btrfs/space-info.c | 2 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 8 | ||||
-rw-r--r-- | fs/btrfs/xattr.c | 3 | ||||
-rw-r--r-- | fs/btrfs/zoned.c | 139 |
14 files changed, 213 insertions, 132 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 993aca2f1e18..e0375ba9d0fe 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -440,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, btrfs_put_caching_control(caching_ctl); } -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) +static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl) +{ + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); + return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; +} + +static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; - int ret = 0; + int ret; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - - wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); btrfs_put_caching_control(caching_ctl); return ret; } -static bool space_cache_v1_done(struct btrfs_block_group *cache) -{ - bool ret; - - spin_lock(&cache->lock); - ret = cache->cached != BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - return ret; -} - -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl) -{ - wait_event(caching_ctl->wait, space_cache_v1_done(cache)); -} - #ifdef CONFIG_BTRFS_DEBUG static void fragment_free_space(struct btrfs_block_group *block_group) { @@ -750,9 +737,8 @@ done: btrfs_put_block_group(block_group); } -int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only) +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) { - DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; @@ -785,10 +771,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only } WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; - if (btrfs_test_opt(fs_info, SPACE_CACHE)) - cache->cached = BTRFS_CACHE_FAST; - else - cache->cached = BTRFS_CACHE_STARTED; + cache->cached = BTRFS_CACHE_STARTED; cache->has_caching_ctl = 1; spin_unlock(&cache->lock); @@ -801,8 +784,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: - if (load_cache_only && caching_ctl) - btrfs_wait_space_cache_v1_finished(cache, caching_ctl); + if (wait && caching_ctl) + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); if (caching_ctl) btrfs_put_caching_control(caching_ctl); @@ -3312,7 +3295,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, 1); + btrfs_cache_block_group(cache, true); byte_in_group = bytenr - cache->start; WARN_ON(byte_in_group > cache->length); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 35e0e860cc0b..6b3cdc4cbc41 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -263,9 +263,7 @@ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache); -int btrfs_cache_block_group(struct btrfs_block_group *cache, - int load_cache_only); +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4edb4bfb2166..df8c99c99df9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -505,7 +505,6 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO, BTRFS_CACHE_STARTED, - BTRFS_CACHE_FAST, BTRFS_CACHE_FINISHED, BTRFS_CACHE_ERROR, }; @@ -1089,8 +1088,6 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; - /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ - wait_queue_head_t zone_finish_wait; /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f43196a893ca..41cddd3ff059 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found: */ if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, - "replace devid present without an active replace item"); +"replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; } else { dev_replace->srcdev = NULL; @@ -1129,8 +1129,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) up_write(&dev_replace->rwsem); /* Scrub for replace must not be running in suspended state */ - ret = btrfs_scrub_cancel(fs_info); - ASSERT(ret != -ENOTCONN); + btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 820b1f1e6b67..2633137c3e9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3068,7 +3068,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); - init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; @@ -4476,6 +4475,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* * We may have the reclaim task running and relocating a data block group, * in which case it may create delayed iputs. So stop it before we park * the cleaner kthread otherwise we can get new delayed iputs after @@ -4493,12 +4503,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); - /* - * If we had UNFINISHED_DROPS we could still be processing them, so - * clear that bit and wake up relocation so it can stop. - */ - btrfs_wake_unfinished_drop(fs_info); - /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -4521,6 +4525,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. */ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ab944d1f94ef..6914cd8024ba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2551,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, return -EINVAL; /* - * pull in the free space cache (if any) so that our pin - * removes the free space from the cache. We have load_only set - * to one because the slow code to read in the free extents does check - * the pinned extents. + * Fully cache the free space first so that our pin removes the free space + * from the cache. */ - btrfs_cache_block_group(cache, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) goto out; @@ -2584,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(block_group); + ret = btrfs_cache_block_group(block_group, true); if (ret) goto out; @@ -4399,7 +4387,7 @@ have_block_group: ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; - ret = btrfs_cache_block_group(block_group, 0); + ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to @@ -6169,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { - ret = btrfs_cache_block_group(cache, 0); - if (ret) { - bg_failed++; - bg_ret = ret; - continue; - } - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eed81a7e36a4..cf4f19e80e2f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3233,7 +3233,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 bio_size = bio->bi_iter.bi_size; u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig; + bool contig = false; int ret; ASSERT(bio); @@ -3242,10 +3242,35 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + + if (bio->bi_iter.bi_size == 0) { + /* We can always add a page into an empty bio. */ + contig = true; + } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { + struct bio_vec *bvec = bio_last_bvec_all(bio); + + /* + * The contig check requires the following conditions to be met: + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. + */ + if (bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + + bvec->bv_len == page_offset(page) + pg_offset) + contig = true; + } else { + /* + * For compression, all IO should have its logical bytenr + * set to the starting bytenr of the compressed extent. + */ contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; + } + if (!contig) return 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c822182ecc..5a3f6e0d9688 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2482,6 +2482,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } @@ -2498,6 +2499,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0c97d25b4a0..1372210869b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, done_offset = end; if (done_offset == start) { - struct btrfs_fs_info *info = inode->root->fs_info; - - wait_var_event(&info->zone_finish_wait, - !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); continue; } @@ -7694,6 +7693,20 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, bool unlock_extents = false; /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* * Cap the size of reads to that usually seen in buffered I/O as we need * to allocate a contiguous array for the checksums. */ diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a64b26b16904..d647cb2938c0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -349,9 +349,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + err = ret; goto out; - if (ret == 0) { + } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d0cbeb7ae81c..435559ba94fa 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); if (flags & BTRFS_BLOCK_GROUP_DATA) - return SZ_1G; + return BTRFS_MAX_DATA_CHUNK_SIZE; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return SZ_32M; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 272901514b0c..f63ff91e2883 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2345,8 +2345,11 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) + if (ret) { + btrfs_put_dev_args_from_path(args); return ret; + } + args->devid = btrfs_stack_device_id(&disk_super->dev_item); memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -5264,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, ctl->stripe_size); } + /* Stripe size should not go beyond 1G. */ + ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); + /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 7421abcf325a..5bb8d8c86311 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -371,6 +371,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, const char *name, const void *buffer, size_t size, int flags) { + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b150b07ba1a7..73c6929f7be6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * since btrfs adds the pages one by one to a bio, and btrfs cannot * increase the metadata reservation even if it increases the number of * extents, it is safe to stick with the limit. + * + * With the zoned emulation, we can have non-zoned device on the zoned + * mode. In this case, we don't have a valid max zone append size. So, + * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. */ - zone_info->max_zone_append_size = - min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + if (bdev_is_zoned(bdev)) { + zone_info->max_zone_append_size = min_t(u64, + (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + } else { + zone_info->max_zone_append_size = + (u64)bdev_max_segments(bdev) << PAGE_SHIFT; + } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, - u64 *offset_ret) + u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; @@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, int ret; u64 length; + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. + * For new a block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. + */ + if (new) { + *offset_ret = 0; + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) else num_conventional++; + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + if (!is_sequential) { alloc_offsets[i] = WP_CONVENTIONAL; continue; @@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) __set_bit(i, active); break; } - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); } if (num_sequential > 0) cache->seq_zone = true; if (num_conventional > 0) { - /* - * Avoid calling calculate_alloc_pointer() for new BG. It - * is no use for new BG. It must be always 0. - * - * Also, we have a lock chain of extent buffer lock -> - * chunk mutex. For new BG, this function is called from - * btrfs_make_block_group() which is already taking the - * chunk mutex. Thus, we cannot call - * calculate_alloc_pointer() which takes extent buffer - * locks to avoid deadlock. - */ - /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; - if (new) { - cache->alloc_offset = 0; - goto out; - } - ret = calculate_alloc_pointer(cache, &last_alloc); - if (ret || map->num_stripes == num_conventional) { - if (!ret) - cache->alloc_offset = last_alloc; - else - btrfs_err(fs_info, + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", - cache->start); + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + cache->zone_is_active = 1; goto out; } } @@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - if (cache->zone_is_active) { - btrfs_get_block_group(cache); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } - out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1526,10 +1528,16 @@ out: ret = -EIO; } - if (!ret) + if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - - if (ret) { + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { kfree(cache->physical_map); cache->physical_map = NULL; } @@ -1910,10 +1918,44 @@ out_unlock: return ret; } +static void wait_eb_writebacks(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; + + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; int i; @@ -1924,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + if (is_metadata && block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; @@ -1950,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* No need to wait for NOCOW writers. Zoned mode does not allow that */ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); spin_lock(&block_group->lock); @@ -2007,8 +2051,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); - clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); - wake_up_all(&fs_info->zone_finish_wait); + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return 0; } |