summaryrefslogtreecommitdiff
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c1210
1 files changed, 931 insertions, 279 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf601ed..60cc1399c64f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
@@ -95,8 +96,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota);
+ int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -125,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -332,13 +332,34 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
kfree(ctl);
}
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
{
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -375,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -387,19 +407,28 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = -ENOMEM;
+ int ret;
+ bool wakeup = true;
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- goto out;
+ return -ENOMEM;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -408,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
- mutex_lock(&caching_ctl->mutex);
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto err;
+ goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -441,17 +466,20 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto err;
+ goto out;
if (ret)
break;
leaf = path->nodes[0];
@@ -464,7 +492,8 @@ next:
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -489,9 +518,10 @@ next:
else
last = key.objectid + key.offset;
- if (total_found > (1024 * 1024 * 2)) {
+ if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
@@ -503,25 +533,58 @@ next:
block_group->key.offset);
caching_ctl->progress = (u64)-1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_FINISHED;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
-err:
- btrfs_free_path(path);
- up_read(&fs_info->commit_root_sem);
-
- free_excluded_extents(extent_root, block_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
- mutex_unlock(&caching_ctl->mutex);
-out:
- if (ret) {
+ spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_ERROR;
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
}
+#endif
+
+ caching_ctl->progress = (u64)-1;
+
+ up_read(&fs_info->commit_root_sem);
+ free_excluded_extents(fs_info->extent_root, block_group);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
@@ -607,6 +670,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
}
spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+#endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -617,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
} else {
/*
- * We are not going to do the fast caching, set cached to the
- * appropriate value and wakeup any waiters.
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
*/
spin_lock(&cache->lock);
if (load_cache_only) {
@@ -1316,8 +1395,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
@@ -1883,10 +1961,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_issue_discard(struct block_device *bdev,
- u64 start, u64 len)
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
{
- return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ if (WARN_ON(start != aligned_start)) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
}
int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +2052,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ u64 bytes;
if (!stripe->dev->can_discard)
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
stripe->physical,
- stripe->length);
+ stripe->length,
+ &bytes);
if (!ret)
- discarded_bytes += stripe->length;
+ discarded_bytes += bytes;
else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
@@ -1941,8 +2088,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset,
- int no_quota)
+ u64 root_objectid, u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1954,12 +2100,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ BTRFS_ADD_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ num_bytes, parent, root_objectid,
+ owner, offset, 0,
+ BTRFS_ADD_DELAYED_REF, NULL);
}
return ret;
}
@@ -1980,16 +2126,12 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
- no_quota = 1;
-
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2015,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2128,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
path, 0, 1);
@@ -2223,8 +2365,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins,
- node->no_quota);
+ ref->level, &ins);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node,
parent, ref_root,
@@ -2277,6 +2418,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
@@ -2365,7 +2511,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2760,11 +2920,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
return 0;
+ if (root->fs_info->creating_free_space_tree)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2776,6 +2940,7 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
+ trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, root, count);
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
@@ -2825,6 +2990,7 @@ again:
}
out:
assert_qgroups_uptodate(trans);
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -2841,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return -ENOMEM;
extent_op->flags_to_set = flags;
- extent_op->update_flags = 1;
- extent_op->update_key = 0;
- extent_op->is_data = is_data ? 1 : 0;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3038,7 +3204,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, int);
+ u64, u64, u64, u64, u64, u64);
if (btrfs_test_is_dummy_root(root))
@@ -3079,15 +3245,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, 1);
+ key.offset);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0,
- 1);
+ parent, ref_root, level - 1, 0);
if (ret)
goto fail;
}
@@ -3182,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * 1024 * 1024)) {
+ if (block_group->key.offset < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3268,28 +3433,47 @@ again:
spin_unlock(&block_group->lock);
/*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, SZ_256M);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -3519,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
/*
- * We don't need the lock here since we are protected by the transaction
- * commit. We want to do the cache_save_setup first and then run the
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
+ spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
@@ -3535,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
@@ -3547,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO
*/
list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
@@ -3571,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * a very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, root, path,
+ cache);
+ }
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
@@ -3578,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
+ spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -3674,10 +3892,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- if (total_bytes > 0)
- found->full = 0;
- else
- found->full = 1;
+ found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3754,7 +3970,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
- u64 tmp;
+ u64 raid_type;
+ u64 allowed = 0;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3772,31 +3989,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
spin_unlock(&root->fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
- if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5);
- if (num_devices < 3)
- flags &= ~BTRFS_BLOCK_GROUP_RAID6;
- if (num_devices < 4)
- flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
- tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
- flags &= ~tmp;
-
- if (tmp & BTRFS_BLOCK_GROUP_RAID6)
- tmp = BTRFS_BLOCK_GROUP_RAID6;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
- tmp = BTRFS_BLOCK_GROUP_RAID5;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
- tmp = BTRFS_BLOCK_GROUP_RAID10;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
- tmp = BTRFS_BLOCK_GROUP_RAID1;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
- tmp = BTRFS_BLOCK_GROUP_RAID0;
-
- return extended_to_chunk(flags | tmp);
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_group[raid_type];
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3835,11 +4047,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3938,7 +4146,8 @@ commit_trans:
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
@@ -3960,38 +4169,86 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Use new btrfs_qgroup_reserve_data to reserve precious data space
+ *
+ * TODO: Find a good method to avoid reserve data space for NOCOW
+ * range, but don't impact performance on quota disable case.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -4035,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- thresh = max_t(u64, 64 * 1024 * 1024,
- div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
return 1;
}
- if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -4241,7 +4497,8 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ if (trans->can_flush_pending_bgs &&
+ trans->chunk_bytes_reserved >= (u64)SZ_2M) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4339,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
return nr;
}
-#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
@@ -4544,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim;
- to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
- 16 * 1024 * 1024);
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4556,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
- if (can_overcommit(root, space_info, 1024 * 1024,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4822,13 +5077,9 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->csum_root && trans->adding_csums)
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->uuid_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == root->fs_info->csum_root && trans->adding_csums) ||
+ (root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -5117,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
- block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -5271,7 +5522,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5289,10 +5540,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
@@ -5453,15 +5702,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5584,41 +5833,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
* o add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
@@ -5627,11 +5883,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -5708,19 +5965,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
- /*
- * No longer have used bytes in this block group, queue
- * it for deletion.
- */
- if (old_val == 0) {
- spin_lock(&info->unused_bgs_lock);
- if (list_empty(&cache->bg_list)) {
- btrfs_get_block_group(cache);
- list_add_tail(&cache->bg_list,
- &info->unused_bgs);
- }
- spin_unlock(&info->unused_bgs_lock);
- }
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5732,6 +5976,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -5996,6 +6256,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = SZ_2M;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = SZ_64K;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
@@ -6003,7 +6291,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
@@ -6012,8 +6303,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
@@ -6026,12 +6323,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
@@ -6062,20 +6374,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (trans->aborted)
- return 0;
-
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
- while (1) {
+ while (!trans->aborted) {
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
@@ -6094,6 +6405,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!trans->aborted)
+ ret = btrfs_discard_extent(root,
+ block_group->key.objectid,
+ block_group->key.offset,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group_trimming(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "Discard failed while removing blockgroup: errno=%d %s\n",
+ ret, errstr);
+ }
+ }
+
return 0;
}
@@ -6137,7 +6476,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
- int no_quota = node->no_quota;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
@@ -6146,14 +6484,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (!info->quota_enabled || !is_fstree(root_objectid))
- no_quota = 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6349,7 +6684,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(root, path, iref));
+ extent_data_ref_count(path, iref));
if (iref) {
BUG_ON(path->slots[0] != extent_slot);
} else {
@@ -6376,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+ num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
@@ -6474,7 +6816,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6522,7 +6864,7 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota)
+ u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6545,13 +6887,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+ BTRFS_DROP_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF,
- NULL, no_quota);
+ offset, 0,
+ BTRFS_DROP_DELAYED_REF, NULL);
}
return ret;
}
@@ -6737,7 +7079,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
@@ -6747,6 +7089,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool orig_have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6762,36 +7106,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
@@ -6826,6 +7181,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -6859,6 +7216,7 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6873,7 +7231,7 @@ have_block_group:
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
@@ -6999,6 +7357,16 @@ refill_cluster:
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
@@ -7031,8 +7399,6 @@ unclustered_alloc:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
@@ -7073,6 +7439,10 @@ loop:
}
up_read(&space_info->groups_sem);
+ if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+ && !orig_have_caching_bg)
+ orig_have_caching_bg = true;
+
if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
goto search;
@@ -7089,7 +7459,20 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (orig_have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
@@ -7107,6 +7490,15 @@ loop:
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
@@ -7123,6 +7515,15 @@ loop:
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
@@ -7131,11 +7532,20 @@ loop:
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
@@ -7184,7 +7594,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
@@ -7319,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7333,8 +7748,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota)
+ int level, struct btrfs_key *ins)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7400,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ num_bytes);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7415,7 +7834,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins)
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
{
int ret;
@@ -7424,7 +7844,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+ ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+ NULL);
return ret;
}
@@ -7480,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
- btrfs_set_buffer_uptodate(buf);
+ set_extent_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
@@ -7567,9 +7988,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
/*
* finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
* returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -7629,19 +8047,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- if (skinny_metadata)
- extent_op->update_key = 0;
- else
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->is_data = false;
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
ins.objectid, ins.offset,
parent, root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
+ extent_op);
if (ret)
goto out_free_delayed;
}
@@ -7757,21 +8172,47 @@ reada:
}
/*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
*/
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_qgroup_extent_record *qrecord;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+ if (!qrecord)
+ return -ENOMEM;
+
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ kfree(qrecord);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type;
+ int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
+ /* We can be called directly from walk_up_proc() */
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
@@ -7790,6 +8231,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -7858,8 +8303,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7937,6 +8380,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = record_one_subtree_extent(trans, root, child_bytenr,
+ root->nodesize);
+ if (ret)
+ goto out;
}
if (level == 0) {
@@ -8182,14 +8630,15 @@ skip:
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "Error "
"%d accounting shared subtree. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0, 0);
+ root->root_key.objectid, level - 1, 0);
BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
@@ -8274,10 +8723,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "error "
"%d accounting leaf items. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
/* make block locked assertion in clean_tree_block happy */
@@ -8576,7 +9026,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
- btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+ btrfs_add_dropped_root(trans, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
@@ -8599,7 +9049,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err);
+ btrfs_std_error(root->fs_info, err, NULL);
return err;
}
@@ -8723,14 +9173,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 min_allocable_bytes;
int ret = -ENOSPC;
-
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
@@ -8739,7 +9188,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
- min_allocable_bytes = 1 * 1024 * 1024;
+ min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
@@ -8747,6 +9196,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
spin_lock(&cache->lock);
if (cache->ro) {
+ cache->ro++;
ret = 0;
goto out;
}
@@ -8758,7 +9208,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- cache->ro = 1;
+ cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
@@ -8768,7 +9218,7 @@ out:
return ret;
}
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -8776,8 +9226,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
u64 alloc_flags;
int ret;
- BUG_ON(cache->ro);
-
again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -8789,7 +9237,7 @@ again:
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -8820,7 +9268,7 @@ again:
goto out;
}
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8828,7 +9276,7 @@ again:
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8891,7 +9339,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -8901,11 +9349,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- cache->ro = 0;
- list_del_init(&cache->ro_list);
+ if (!--cache->ro) {
+ num_bytes = cache->key.offset - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9270,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->full_stripe_len = btrfs_full_stripe_len(root,
&root->fs_info->mapping_tree,
start);
+ set_free_space_tree_thresholds(cache);
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -9281,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
return cache;
}
@@ -9305,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9421,7 +9874,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid)) {
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
/* Should always be true but just in case. */
@@ -9449,11 +9902,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_RAID0],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_SINGLE],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -9471,7 +9924,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
+ trans->can_flush_pending_bgs = false;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
if (ret)
goto next;
@@ -9489,9 +9944,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ add_block_group_free_space(trans, root->fs_info, block_group);
+ /* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
}
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9518,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(root, cache);
if (ret) {
/*
@@ -9534,6 +9993,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+#endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
@@ -9834,6 +10301,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
* physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->trimming) == 0);
/*
@@ -9875,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
unlock_chunks(root);
+ ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ if (ret)
+ goto out;
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9890,6 +10366,47 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = (struct map_lookup *)em->bdev;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -9908,26 +10425,30 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
+ int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -9941,7 +10462,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
- ret = set_block_group_ro(block_group, 0);
+ ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
@@ -9952,10 +10473,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- /* 1 for btrfs_orphan_reserve_metadata() */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
if (IS_ERR(trans)) {
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -9982,14 +10503,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10007,16 +10528,47 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(root, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10066,10 +10618,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
return unpin_extent_range(root, start, end, false);
}
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+ u64 minlen, u64 *trimmed)
+{
+ u64 start = 0, len = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Not writeable = nothing to do. */
+ if (!device->writeable)
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_transaction *trans;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ return ret;
+
+ down_read(&fs_info->commit_root_sem);
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ret = find_free_dev_extent_start(trans, device, minlen, start,
+ &start, &len);
+ if (trans)
+ btrfs_put_transaction(trans);
+
+ if (ret) {
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_device *device;
+ struct list_head *devices;
u64 group_trimmed;
u64 start;
u64 end;
@@ -10124,6 +10765,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
cache = next_block_group(fs_info->tree_root, cache);
}
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ devices = &root->fs_info->fs_devices->alloc_list;
+ list_for_each_entry(device, devices, dev_alloc_list) {
+ ret = btrfs_trim_free_extents(device, range->minlen,
+ &group_trimmed);
+ if (ret)
+ break;
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
range->len = trimmed;
return ret;
}
@@ -10140,8 +10793,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
- * Make sure counter is updated before we wake up
- * waiters.
+ * Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(&root->subv_writers->wait))