Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  564
1 file changed, 301 insertions(+), 263 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a7bc66121330..5871ef78edba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "rcu-string.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -64,10 +65,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- set_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ set_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
return 0;
}
@@ -79,10 +78,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
start = cache->start;
end = start + cache->length - 1;
- clear_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- clear_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
}
static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
@@ -1193,24 +1190,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- u64 bytenr, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add)
-{
- int ret;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
- } else {
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
- }
- return ret;
-}
-
static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
@@ -1469,7 +1448,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@@ -1494,11 +1472,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
- owner, offset, refs_to_add);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset,
+ refs_to_add);
+ }
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1583,7 +1567,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int err = 0;
int metadata = !extent_op->is_data;
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@@ -1604,7 +1588,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
@@ -1703,10 +1686,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
- if (trans->aborted) {
+ if (TRANS_ABORTED(trans)) {
if (insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
@@ -1721,8 +1703,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
@@ -1867,8 +1848,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, fs_info->csum_root,
head->bytenr, head->num_bytes);
@@ -2135,22 +2115,6 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
-{
- u64 num_bytes;
-
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
- sizeof(struct btrfs_extent_inline_ref));
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
-
- /*
- * We don't ever fill up leaves all the way so multiply by 2 just to be
- * closer to what we're really going to want to use.
- */
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
-}
-
/*
* Takes the number of bytes to be csumm'ed and figures out how many leaves it
* would require to store the csums for that many bytes.
@@ -2191,7 +2155,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2238,7 +2202,7 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
+ struct extent_buffer *eb, u64 flags,
int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
@@ -2254,7 +2218,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -2342,7 +2306,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2384,9 +2349,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2411,7 +2380,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2422,7 +2391,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2463,7 +2432,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
return 0;
if (full_backref)
@@ -2588,7 +2557,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
return bytenr;
}
-static int pin_down_extent(struct btrfs_block_group *cache,
+static int pin_down_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2607,22 +2577,20 @@ static int pin_down_extent(struct btrfs_block_group *cache,
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(fs_info->pinned_extents, bytenr,
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group *cache;
- ASSERT(fs_info->running_transaction);
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -2631,13 +2599,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
+
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2649,7 +2619,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
*/
btrfs_cache_block_group(cache, 1);
- pin_down_extent(cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -2763,11 +2733,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
}
}
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- fs_info->pinned_extents = &fs_info->freed_extents[1];
- else
- fs_info->pinned_extents = &fs_info->freed_extents[0];
-
up_write(&fs_info->commit_root_sem);
btrfs_update_global_block_rsv(fs_info);
@@ -2908,12 +2873,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 end;
int ret;
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
+ unpin = &trans->transaction->pinned_extents;
- while (!trans->aborted) {
+ while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
@@ -2923,6 +2885,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ clear_extent_bits(&fs_info->excluded_extents, start,
+ end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@@ -2950,14 +2915,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 trimmed = 0;
ret = -EROFS;
- if (!trans->aborted)
+ if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->start,
block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
if (ret) {
@@ -3000,7 +2965,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -3301,7 +3265,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(cache, buf->start, buf->len, 1);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -3345,7 +3309,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
- btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3395,6 +3359,7 @@ static struct btrfs_block_group *btrfs_lock_cluster(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
+ __acquires(&cluster->refill_lock)
{
struct btrfs_block_group *used_bg = NULL;
@@ -3438,6 +3403,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+};
+
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
@@ -3454,6 +3423,8 @@ struct find_free_extent_ctl {
/* For clustered allocation */
u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
@@ -3489,6 +3460,12 @@ struct find_free_extent_ctl {
/* Found result */
u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
};
@@ -3501,11 +3478,11 @@ struct find_free_extent_ctl {
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
static int find_free_extent_clustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group **cluster_bg_ret)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **cluster_bg_ret)
{
struct btrfs_block_group *cluster_bg;
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -3605,9 +3582,9 @@ refill_cluster:
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl)
{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
@@ -3663,16 +3640,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
return 0;
}
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ int ret;
+
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+
+ return find_free_extent_unclustered(block_group, ffe_ctl);
+}
+
+static int do_allocation(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ default:
+ BUG();
+ }
+}
+
+static void release_block_group(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ int delalloc)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ break;
+ default:
+ BUG();
+ }
+
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ ffe_ctl->index);
+ btrfs_release_block_group(block_group, delalloc);
+}
+
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ if (!ffe_ctl->use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+}
+
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ found_extent_clustered(ffe_ctl, ins);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ /*
+ * If we can't allocate a new chunk we've already looped through
+ * at least once, move on to the NO_EMPTY_SIZE case.
+ */
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ return 0;
+ default:
+ BUG();
+ }
+}
+
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
- struct btrfs_free_cluster *last_ptr,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
- int full_search, bool use_cluster)
+ bool full_search)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
@@ -3689,11 +3751,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return 1;
if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
+ found_extent(ffe_ctl, ins);
return 0;
}
@@ -3739,16 +3797,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
CHUNK_ALLOC_FORCE);
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
-
/* Do not bail out on ENOSPC since we can do more. */
- if (ret < 0 && ret != -ENOSPC)
+ if (ret == -ENOSPC)
+ ret = chunk_allocation_failed(ffe_ctl);
+ else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
ret = 0;
@@ -3759,6 +3811,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
+ return -ENOSPC;
+
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
@@ -3774,6 +3829,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ /*
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
+ */
+ if (space_info->max_extent_size) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl->empty_cluster);
+ if (ffe_ctl->last_ptr) {
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&last_ptr->lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
+ space_info, ins);
+ default:
+ BUG();
+ }
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -3801,16 +3921,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
*/
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
+ u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
int ret = 0;
int cache_block_group_error = 0;
- struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- bool use_cluster = true;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -3819,13 +3937,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
ffe_ctl.search_start = 0;
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
ffe_ctl.delalloc = delalloc;
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
ffe_ctl.have_caching_bg = false;
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
+ ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+
+ /* For clustered allocation */
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
+ ffe_ctl.last_ptr = NULL;
+ ffe_ctl.use_cluster = true;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
@@ -3839,51 +3963,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
- /*
- * If our free space is heavily fragmented we may not be able to make
- * big contiguous allocations, so instead of doing the expensive search
- * for free space, simply return ENOSPC with our max_extent_size so we
- * can go ahead and search for a more manageable chunk.
- *
- * If our max_extent_size is large enough for our allocation simply
- * disable clustering since we will likely not be able to find enough
- * space to create a cluster and induce latency trying.
- */
- if (unlikely(space_info->max_extent_size)) {
- spin_lock(&space_info->lock);
- if (space_info->max_extent_size &&
- num_bytes > space_info->max_extent_size) {
- ins->offset = space_info->max_extent_size;
- spin_unlock(&space_info->lock);
- return -ENOSPC;
- } else if (space_info->max_extent_size) {
- use_cluster = false;
- }
- spin_unlock(&space_info->lock);
- }
-
- last_ptr = fetch_cluster_info(fs_info, space_info,
- &ffe_ctl.empty_cluster);
- if (last_ptr) {
- spin_lock(&last_ptr->lock);
- if (last_ptr->block_group)
- hint_byte = last_ptr->window_start;
- if (last_ptr->fragmented) {
- /*
- * We still set window_start so we can keep track of the
- * last place we found an allocation to try and save
- * some time.
- */
- hint_byte = last_ptr->window_start;
- use_cluster = false;
- }
- spin_unlock(&last_ptr->lock);
- }
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ if (ret < 0)
+ return ret;
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
- if (ffe_ctl.search_start == hint_byte) {
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
ffe_ctl.search_start);
/*
@@ -3924,6 +4011,8 @@ search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
+ struct btrfs_block_group *bg_ret;
+
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
@@ -3984,39 +4073,20 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
- /*
- * Ok we want to try and use the cluster allocator, so
- * lets look there
- */
- if (last_ptr && use_cluster) {
- struct btrfs_block_group *cluster_bg = NULL;
-
- ret = find_free_extent_clustered(block_group, last_ptr,
- &ffe_ctl, &cluster_bg);
-
- if (ret == 0) {
- if (cluster_bg && cluster_bg != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = cluster_bg;
- }
- goto checks;
- } else if (ret == -EAGAIN) {
- goto have_block_group;
- } else if (ret > 0) {
- goto loop;
+ bg_ret = NULL;
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ if (ret == 0) {
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group, delalloc);
+ block_group = bg_ret;
}
- /* ret == -ENOENT case falls through */
- }
-
- ret = find_free_extent_unclustered(block_group, last_ptr,
- &ffe_ctl);
- if (ret == -EAGAIN)
+ } else if (ret == -EAGAIN) {
goto have_block_group;
- else if (ret > 0)
+ } else if (ret > 0) {
goto loop;
- /* ret == 0 case falls through */
-checks:
+ }
+
+ /* Checks */
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
fs_info->stripesize);
@@ -4050,17 +4120,12 @@ checks:
btrfs_release_block_group(block_group, delalloc);
break;
loop:
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- ffe_ctl.index);
- btrfs_release_block_group(block_group, delalloc);
+ release_block_group(block_group, &ffe_ctl, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
- full_search, use_cluster);
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4189,18 +4254,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
+ start);
return -ENOSPC;
}
- ret = pin_down_extent(cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, start, len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4431,7 +4498,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
if (ret)
- btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
@@ -4750,8 +4817,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, eb->start,
- eb->len, flag,
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
@@ -5209,9 +5275,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref,
- int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -5240,7 +5304,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5250,9 +5321,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (err)
goto out_end_trans;
- if (block_rsv)
- trans->block_rsv = block_rsv;
-
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
@@ -5380,8 +5448,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
err = PTR_ERR(trans);
goto out_free;
}
- if (block_rsv)
- trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
@@ -5413,13 +5479,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
- } else {
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- btrfs_put_fs_root(root);
- }
+ else
+ btrfs_put_root(root);
root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans);
@@ -5436,8 +5507,6 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- if (err && err != -EAGAIN)
- btrfs_handle_fs_error(fs_info, err, NULL);
return err;
}
@@ -5605,6 +5674,19 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ /* Check if there are any CHUNK_* bits left */
+ if (start > device->total_bytes) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_in_rcu(fs_info,
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
+ start, end - start + 1,
+ rcu_str_deref(device->name),
+ device->total_bytes);
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
/* Ensure we skip the reserved area in the first 1M */
start = max_t(u64, start, SZ_1M);
@@ -5749,47 +5831,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
return bg_ret;
return dev_ret;
}
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
- percpu_counter_dec(&root->subv_writers->counter);
- cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
- if (atomic_read(&root->will_be_snapshotted))
- return 0;
-
- percpu_counter_inc(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we check for snapshot creation.
- */
- smp_mb();
- if (atomic_read(&root->will_be_snapshotted)) {
- btrfs_end_write_no_snapshotting(root);
- return 0;
- }
- return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshotting(root);
- if (ret)
- break;
- wait_var_event(&root->will_be_snapshotted,
- !atomic_read(&root->will_be_snapshotted));
- }
-}