diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 564 |
1 files changed, 301 insertions, 263 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7bc66121330..5871ef78edba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "rcu-string.h" #undef SCRAMBLE_DELAYED_REFS @@ -64,10 +65,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - set_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + set_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); return 0; } @@ -79,10 +78,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) start = cache->start; end = start + cache->length - 1; - clear_extent_bits(&fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE); - clear_extent_bits(&fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE); + clear_extent_bits(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE); } static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) @@ -1193,24 +1190,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); - } else { - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); - } - return ret; -} - static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, @@ -1469,7 +1448,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, @@ -1494,11 +1472,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, path, bytenr, parent, + root_objectid); + } else { + ret = insert_extent_data_ref(trans, path, bytenr, parent, + root_objectid, owner, offset, + refs_to_add); + } if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1583,7 +1567,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, int err = 0; int metadata = !extent_op->is_data; - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1604,7 +1588,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { @@ -1703,10 +1686,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, { int ret = 0; - if (trans->aborted) { + if (TRANS_ABORTED(trans)) { if (insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return 0; } @@ -1721,8 +1703,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans->fs_info, node->bytenr, - node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); return ret; } @@ -1867,8 +1848,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(fs_info, head->bytenr, - head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, fs_info->csum_root, head->bytenr, head->num_bytes); @@ -2135,22 +2115,6 @@ static u64 find_middle(struct rb_root *root) } #endif -static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) -{ - u64 num_bytes; - - num_bytes = heads * (sizeof(struct btrfs_extent_item) + - sizeof(struct btrfs_extent_inline_ref)); - if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - num_bytes += heads * sizeof(struct btrfs_tree_block_info); - - /* - * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to use. - */ - return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); -} - /* * Takes the number of bytes to be csumm'ed and figures out how many leaves it * would require to store the csums for that many bytes. @@ -2191,7 +2155,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) + if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) @@ -2238,7 +2202,7 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 flags, + struct extent_buffer *eb, u64 flags, int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; @@ -2254,7 +2218,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -2342,7 +2306,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 objectid, u64 offset, u64 bytenr, + bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; @@ -2384,9 +2349,13 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; - /* If extent created before last snapshot => it's definitely shared */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. + */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2411,7 +2380,7 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr) + u64 bytenr, bool strict) { struct btrfs_path *path; int ret; @@ -2422,7 +2391,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, do { ret = check_committed_ref(root, path, objectid, - offset, bytenr); + offset, bytenr, strict); if (ret && ret != -ENOENT) goto out; @@ -2463,7 +2432,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) return 0; if (full_backref) @@ -2588,7 +2557,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_block_group *cache, +static int pin_down_extent(struct btrfs_trans_handle *trans, + struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2607,22 +2577,20 @@ static int pin_down_extent(struct btrfs_block_group *cache, percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); - set_extent_dirty(fs_info->pinned_extents, bytenr, + set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } -int btrfs_pin_extent(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; - ASSERT(fs_info->running_transaction); - - cache = btrfs_lookup_block_group(fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -2631,13 +2599,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(fs_info, bytenr); + btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); + + cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2649,7 +2619,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, */ btrfs_cache_block_group(cache, 1); - pin_down_extent(cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -2763,11 +2733,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) } } - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->commit_root_sem); btrfs_update_global_block_rsv(fs_info); @@ -2908,12 +2873,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 end; int ret; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; + unpin = &trans->transaction->pinned_extents; - while (!trans->aborted) { + while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); @@ -2923,6 +2885,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + clear_extent_bits(&fs_info->excluded_extents, start, + end, EXTENT_UPTODATE); if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, @@ -2950,14 +2915,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) u64 trimmed = 0; ret = -EROFS; - if (!trans->aborted) + if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, &trimmed); list_del_init(&block_group->bg_list); - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); if (ret) { @@ -3000,7 +2965,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -3301,7 +3265,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(cache, buf->start, buf->len, 1); + pin_down_extent(trans, cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -3345,7 +3309,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) (ref->type == BTRFS_REF_DATA && ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3395,6 +3359,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) + __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3438,6 +3403,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +enum btrfs_extent_allocation_policy { + BTRFS_EXTENT_ALLOC_CLUSTERED, +}; + /* * Structure used internally for find_free_extent() function. Wraps needed * parameters. @@ -3454,6 +3423,8 @@ struct find_free_extent_ctl { /* For clustered allocation */ u64 empty_cluster; + struct btrfs_free_cluster *last_ptr; + bool use_cluster; bool have_caching_bg; bool orig_have_caching_bg; @@ -3489,6 +3460,12 @@ struct find_free_extent_ctl { /* Found result */ u64 found_offset; + + /* Hint where to start looking for an empty space */ + u64 hint_byte; + + /* Allocation policy */ + enum btrfs_extent_allocation_policy policy; }; @@ -3501,11 +3478,11 @@ struct find_free_extent_ctl { * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group **cluster_bg_ret) + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; @@ -3605,9 +3582,9 @@ refill_cluster: * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, - struct btrfs_free_cluster *last_ptr, - struct find_free_extent_ctl *ffe_ctl) + struct find_free_extent_ctl *ffe_ctl) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* @@ -3663,16 +3640,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, return 0; } +static int do_allocation_clustered(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + int ret; + + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); + if (ret >= 0 || ret == -EAGAIN) + return ret; + /* ret == -ENOENT case falls through */ + } + + return find_free_extent_unclustered(block_group, ffe_ctl); +} + +static int do_allocation(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + default: + BUG(); + } +} + +static void release_block_group(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + int delalloc) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + break; + default: + BUG(); + } + + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + ffe_ctl->index); + btrfs_release_block_group(block_group, delalloc); +} + +static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + if (!ffe_ctl->use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } +} + +static void found_extent(struct find_free_extent_ctl *ffe_ctl, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + found_extent_clustered(ffe_ctl, ins); + break; + default: + BUG(); + } +} + +static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + /* + * If we can't allocate a new chunk we've already looped through + * at least once, move on to the NO_EMPTY_SIZE case. + */ + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + return 0; + default: + BUG(); + } +} + /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, - struct btrfs_free_cluster *last_ptr, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, - int full_search, bool use_cluster) + bool full_search) { struct btrfs_root *root = fs_info->extent_root; int ret; @@ -3689,11 +3751,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 1; if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } + found_extent(ffe_ctl, ins); return 0; } @@ -3739,16 +3797,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE); - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; - /* Do not bail out on ENOSPC since we can do more. */ - if (ret < 0 && ret != -ENOSPC) + if (ret == -ENOSPC) + ret = chunk_allocation_failed(ffe_ctl); + else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; @@ -3759,6 +3811,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + /* * Don't loop again if we already have no empty_size and * no empty_cluster. @@ -3774,6 +3829,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } +static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + /* + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. + */ + if (space_info->max_extent_size) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + ffe_ctl->num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + ffe_ctl->use_cluster = false; + } + spin_unlock(&space_info->lock); + } + + ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl->empty_cluster); + if (ffe_ctl->last_ptr) { + struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; + + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + ffe_ctl->hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + ffe_ctl->hint_byte = last_ptr->window_start; + ffe_ctl->use_cluster = false; + } + spin_unlock(&last_ptr->lock); + } + + return 0; +} + +static int prepare_allocation(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, + struct btrfs_key *ins) +{ + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + default: + BUG(); + } +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -3801,16 +3921,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, + u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { int ret = 0; int cache_block_group_error = 0; - struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - bool use_cluster = true; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); @@ -3819,13 +3937,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.search_start = 0; - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; ffe_ctl.delalloc = delalloc; ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); ffe_ctl.have_caching_bg = false; ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; + ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + + /* For clustered allocation */ + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.last_ptr = NULL; + ffe_ctl.use_cluster = true; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; @@ -3839,51 +3963,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - /* - * If our free space is heavily fragmented we may not be able to make - * big contiguous allocations, so instead of doing the expensive search - * for free space, simply return ENOSPC with our max_extent_size so we - * can go ahead and search for a more manageable chunk. - * - * If our max_extent_size is large enough for our allocation simply - * disable clustering since we will likely not be able to find enough - * space to create a cluster and induce latency trying. - */ - if (unlikely(space_info->max_extent_size)) { - spin_lock(&space_info->lock); - if (space_info->max_extent_size && - num_bytes > space_info->max_extent_size) { - ins->offset = space_info->max_extent_size; - spin_unlock(&space_info->lock); - return -ENOSPC; - } else if (space_info->max_extent_size) { - use_cluster = false; - } - spin_unlock(&space_info->lock); - } - - last_ptr = fetch_cluster_info(fs_info, space_info, - &ffe_ctl.empty_cluster); - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - if (last_ptr->fragmented) { - /* - * We still set window_start so we can keep track of the - * last place we found an allocation to try and save - * some time. - */ - hint_byte = last_ptr->window_start; - use_cluster = false; - } - spin_unlock(&last_ptr->lock); - } + ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + if (ret < 0) + return ret; ffe_ctl.search_start = max(ffe_ctl.search_start, first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); - if (ffe_ctl.search_start == hint_byte) { + ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); + if (ffe_ctl.search_start == ffe_ctl.hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl.search_start); /* @@ -3924,6 +4011,8 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl.index], list) { + struct btrfs_block_group *bg_ret; + /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; @@ -3984,39 +4073,20 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr && use_cluster) { - struct btrfs_block_group *cluster_bg = NULL; - - ret = find_free_extent_clustered(block_group, last_ptr, - &ffe_ctl, &cluster_bg); - - if (ret == 0) { - if (cluster_bg && cluster_bg != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = cluster_bg; - } - goto checks; - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { - goto loop; + bg_ret = NULL; + ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + if (ret == 0) { + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, delalloc); + block_group = bg_ret; } - /* ret == -ENOENT case falls through */ - } - - ret = find_free_extent_unclustered(block_group, last_ptr, - &ffe_ctl); - if (ret == -EAGAIN) + } else if (ret == -EAGAIN) { goto have_block_group; - else if (ret > 0) + } else if (ret > 0) { goto loop; - /* ret == 0 case falls through */ -checks: + } + + /* Checks */ ffe_ctl.search_start = round_up(ffe_ctl.found_offset, fs_info->stripesize); @@ -4050,17 +4120,12 @@ checks: btrfs_release_block_group(block_group, delalloc); break; loop: - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - ffe_ctl.index); - btrfs_release_block_group(block_group, delalloc); + release_block_group(block_group, &ffe_ctl, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, - full_search, use_cluster); + ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); if (ret > 0) goto search; @@ -4189,18 +4254,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, + u64 len) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, start); if (!cache) { - btrfs_err(fs_info, "unable to find block group for %llu", start); + btrfs_err(trans->fs_info, "unable to find block group for %llu", + start); return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + ret = pin_down_extent(trans, cache, start, len, 1); btrfs_put_block_group(cache); return ret; } @@ -4431,7 +4498,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); if (ret) - btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } @@ -4750,8 +4817,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb->start, - eb->len, flag, + ret = btrfs_set_disk_extent_flags(trans, eb, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; @@ -5209,9 +5275,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * * If called with for_reloc == 0, may exit early with -EAGAIN */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -5240,7 +5304,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -5250,9 +5321,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - if (block_rsv) - trans->block_rsv = block_rsv; - /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being @@ -5380,8 +5448,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, err = PTR_ERR(trans); goto out_free; } - if (block_rsv) - trans->block_rsv = block_rsv; } } btrfs_release_path(path); @@ -5413,13 +5479,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. + */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - btrfs_put_fs_root(root); - } + else + btrfs_put_root(root); root_dropped = true; out_end_trans: btrfs_end_transaction_throttle(trans); @@ -5436,8 +5507,6 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - if (err && err != -EAGAIN) - btrfs_handle_fs_error(fs_info, err, NULL); return err; } @@ -5605,6 +5674,19 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + /* Check if there are any CHUNK_* bits left */ + if (start > device->total_bytes) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_in_rcu(fs_info, +"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", + start, end - start + 1, + rcu_str_deref(device->name), + device->total_bytes); + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + /* Ensure we skip the reserved area in the first 1M */ start = max_t(u64, start, SZ_1M); @@ -5749,47 +5831,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) return bg_ret; return dev_ret; } - -/* - * btrfs_{start,end}_write_no_snapshotting() are similar to - * mnt_{want,drop}_write(), they are used to prevent some tasks from writing - * data into the page cache through nocow before the subvolume is snapshoted, - * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshotting is ongoing and that cause the snapshot to be - * inconsistent (writes followed by expanding truncates for example). - */ -void btrfs_end_write_no_snapshotting(struct btrfs_root *root) -{ - percpu_counter_dec(&root->subv_writers->counter); - cond_wake_up(&root->subv_writers->wait); -} - -int btrfs_start_write_no_snapshotting(struct btrfs_root *root) -{ - if (atomic_read(&root->will_be_snapshotted)) - return 0; - - percpu_counter_inc(&root->subv_writers->counter); - /* - * Make sure counter is updated before we check for snapshot creation. - */ - smp_mb(); - if (atomic_read(&root->will_be_snapshotted)) { - btrfs_end_write_no_snapshotting(root); - return 0; - } - return 1; -} - -void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshotting(root); - if (ret) - break; - wait_var_event(&root->will_be_snapshotted, - !atomic_read(&root->will_be_snapshotted)); - } -} |