Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 1210
1 file changed, 931 insertions, 279 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 07204bf601ed..60cc1399c64f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "math.h" #include "sysfs.h" #include "qgroup.h" @@ -95,8 +96,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -125,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) { atomic_inc(&cache->count); } @@ -332,13 +332,34 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) { u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -375,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -387,19 +407,28 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = -ENOMEM; + int ret; + bool wakeup = true; - caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - goto out; + return -ENOMEM; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. 
+ */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -408,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto err; + goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -441,17 +466,20 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto err; + goto out; if (ret) break; leaf = path->nodes[0]; @@ -464,7 +492,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -489,9 +518,10 @@ next: else last = key.objectid + key.offset; - if (total_found > (1024 * 1024 * 2)) { + if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -503,25 +533,58 @@ next: block_group->key.offset); caching_ctl->progress = (u64)-1; +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + spin_lock(&block_group->lock); block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; + block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); -err: - btrfs_free_path(path); - up_read(&fs_info->commit_root_sem); - - free_excluded_extents(extent_root, block_group); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; - mutex_unlock(&caching_ctl->mutex); -out: - if (ret) { + spin_lock(&block_group->space_info->lock); spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_ERROR; + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); } +#endif + + caching_ctl->progress = (u64)-1; + + up_read(&fs_info->commit_root_sem); + free_excluded_extents(fs_info->extent_root, block_group); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -607,6 +670,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -617,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } else { /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. */ spin_lock(&cache->lock); if (load_cache_only) { @@ -1316,8 +1395,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, +static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; @@ -1883,10 +1961,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); + + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on this device. 
*/ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ -1907,14 +2052,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, + &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ @@ -1941,8 +2088,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1954,12 +2100,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -1980,16 +2126,12 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2015,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2128,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, 
&key, path, 0, 1); @@ -2223,8 +2365,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -2277,6 +2418,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2365,7 +2511,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2760,11 +2920,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) return 0; + if (root->fs_info->creating_free_space_tree) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2776,6 +2940,7 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif + trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); @@ -2825,6 +2990,7 @@ again: } out: assert_qgroups_uptodate(trans); + trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -2841,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? 
true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -3038,7 +3204,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3079,15 +3245,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -3182,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * 1024 * 1024)) { + if (block_group->key.offset < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3268,28 +3433,47 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, SZ_256M); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3519,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; /* - * We don't need the lock here since we are protected by the transaction - * commit. We want to do the cache_save_setup first and then run the + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. 
These tasks correspond to + endio free space workers started when writeback finishes for a + space cache, which run inode.c:btrfs_finish_ordered_io(), and can + allocate new block groups as a result of COWing nodes of the root + tree when updating the free space inode. The writeback for the space + caches is triggered by an earlier call to + btrfs_start_dirty_block_groups() and iterations of the following + loop. + Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ + spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, @@ -3535,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * finish and then do it all again */ if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } /* @@ -3547,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * on any pending IO */ list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); @@ -3571,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * very rare case so no need for a more efficient and + * complex approach. 
+ */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, root, path, + cache); + } if (ret) btrfs_abort_transaction(trans, root, ret); } @@ -3578,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } + spin_unlock(&cur_trans->dirty_bgs_lock); while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, @@ -3674,10 +3892,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - if (total_bytes > 0) - found->full = 0; - else - found->full = 1; + found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3754,7 +3970,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3772,31 +3989,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) @@ -3835,11 +4047,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. 
- */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3938,7 +4146,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -3960,38 +4169,86 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with the ability for precise data reservation. + * It will replace the old btrfs_check_data_free_space(), but to keep the + * patches split, add the new function first and then replace the old one. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use the new btrfs_qgroup_reserve_data() to reserve data space precisely + * + * TODO: Find a good method to avoid reserving data space for NOCOW + * ranges without impacting performance when quotas are disabled. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode, + * normally in an error case. + * + * This one will *NOT* use the accurate qgroup reserved space API, just for + * cases where we can't sleep and are sure it won't affect qgroup reserved + * space. Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode, + * normally in an error case. + * + * This one will handle the per-inode data rsv map for the accurate reserved + * space framework. 
+ */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4035,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); if (num_bytes - num_allocated < thresh) return 1; } - if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) return 0; return 1; } @@ -4241,7 +4497,8 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + if (trans->can_flush_pending_bgs && + trans->chunk_bytes_reserved >= (u64)SZ_2M) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4339,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) return nr; } -#define EXTENT_SIZE_PER_ITEM (256 * 1024) +#define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc @@ -4544,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim; - to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, - 16 * 1024 * 1024); + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, BTRFS_RESERVE_FLUSH_ALL)) { @@ -4556,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, 1024 * 1024, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -4822,13 +5077,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -5117,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + block_rsv->size = min_t(u64, num_bytes, SZ_512M); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -5271,7 +5522,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5289,10 +5540,8 @@ int 
btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5453,15 +5702,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -5584,41 +5833,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start of the range we are writing to + * @len: length of the range we are writing to + * + * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve the corresponding qgroup space precisely + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per-root over-reserve method. + * o add to the inodes->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error (-ENOSPC or -EDQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the length of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. 
This is * called in the case that we don't need the metadata AND data reservations @@ -5627,11 +5883,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -5708,19 +5965,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - /* - * No longer have used bytes in this block group, queue - * it for deletion. - */ - if (old_val == 0) { - spin_lock(&info->unused_bgs_lock); - if (list_empty(&cache->bg_list)) { - btrfs_get_block_group(cache); - list_add_tail(&cache->bg_list, - &info->unused_bgs); - } - spin_unlock(&info->unused_bgs_lock); - } } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -5732,6 +5976,22 @@ static int update_block_group(struct btrfs_trans_handle *trans, } spin_unlock(&trans->transaction->dirty_bgs_lock); + /* + * No longer have used bytes in this block group, queue it for + * deletion. We do this after adding the block group to the + * dirty list to avoid races between cleaner kthread and space + * cache writeout. + */ + if (!alloc && old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -5996,6 +6256,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. 
+ */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = SZ_2M; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = SZ_64K; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6003,7 +6291,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6012,8 +6303,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6026,12 +6323,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6062,20 +6374,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; - if (trans->aborted) - return 0; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; - while (1) { + while (!trans->aborted) { mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); @@ -6094,6 +6405,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. 
+ */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(root, + block_group->key.objectid, + block_group->key.offset, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "Discard failed while removing blockgroup: errno=%d %s\n", + ret, errstr); + } + } + return 0; } @@ -6137,7 +6476,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6146,14 +6484,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -6349,7 +6684,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); + extent_data_ref_count(path, iref)); if (iref) { BUG_ON(path->slots[0] != extent_slot); } else { @@ -6376,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + ret = add_to_free_space_tree(trans, root->fs_info, bytenr, + num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); @@ -6474,7 +6816,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6522,7 +6864,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6545,13 +6887,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -6737,7 +7079,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6747,6 +7089,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + 
bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -6762,36 +7106,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -6826,6 +7181,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -6859,6 +7216,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6873,7 +7231,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -6999,6 +7357,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. 
+ */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7031,8 +7399,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7073,6 +7439,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7089,7 +7459,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7107,6 +7490,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7123,6 +7515,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -7131,11 +7532,20 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } @@ -7184,7 +7594,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; @@ -7319,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7333,8 +7748,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7400,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + num_bytes); + if (ret) + return ret; + ret = 
update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7415,7 +7834,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7424,7 +7844,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -7480,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); - btrfs_set_buffer_uptodate(buf); + set_extent_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; @@ -7567,9 +7988,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, /* * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, @@ -7629,19 +8047,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -7757,21 +8172,47 @@ reada: } /* - * TODO: Modify related function to add related node/leaf to dirty_extent_root, - * for later qgroup accounting. - * - * Current, this function does nothing. + * These may not be seen by the usual inc/dec ref code so we have to + * add them here. 
*/ +static int record_one_subtree_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_qgroup_extent_record *qrecord; + struct btrfs_delayed_ref_root *delayed_refs; + + qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); + if (!qrecord) + return -ENOMEM; + + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + kfree(qrecord); + spin_unlock(&delayed_refs->lock); + + return 0; +} + static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type; + int i, extent_type, ret; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; + /* We can be called directly from walk_up_proc() */ + if (!root->fs_info->quota_enabled) + return 0; + for (i = 0; i < nr; i++) { btrfs_item_key_to_cpu(eb, &key, i); @@ -7790,6 +8231,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + if (ret) + return ret; } return 0; } @@ -7858,8 +8303,6 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. - * TODO: Modify this function to mark all (including complete shared node) - * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7937,6 +8380,11 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = record_one_subtree_extent(trans, root, child_bytenr, + root->nodesize); + if (ret) + goto out; } if (level == 0) { @@ -8182,14 +8630,15 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); @@ -8274,10 +8723,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. 
Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ @@ -8576,7 +9026,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { - btrfs_drop_and_free_fs_root(tree_root->fs_info, root); + btrfs_add_dropped_root(trans, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); @@ -8599,7 +9049,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -8723,14 +9173,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 min_allocable_bytes; int ret = -ENOSPC; - /* * We need some metadata space and system metadata space for * allocating chunks in some corner cases until we force to set @@ -8739,7 +9188,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) if ((sinfo->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && !force) - min_allocable_bytes = 1 * 1024 * 1024; + min_allocable_bytes = SZ_1M; else min_allocable_bytes = 0; @@ -8747,6 +9196,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) spin_lock(&cache->lock); if (cache->ro) { + cache->ro++; ret = 0; goto out; } @@ -8758,7 +9208,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - cache->ro = 1; + cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } @@ -8768,7 +9218,7 @@ out: return ret; } -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8776,8 +9226,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, u64 alloc_flags; int ret; - BUG_ON(cache->ro); - again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -8789,7 +9237,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); @@ -8820,7 +9268,7 @@ again: goto out; } - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -8828,7 +9276,7 @@ again: CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); @@ -8891,7 +9339,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -8901,11 +9349,13 @@ void 
btrfs_set_block_group_rw(struct btrfs_root *root, spin_lock(&sinfo->lock); spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; - list_del_init(&cache->ro_list); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9270,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->full_stripe_len = btrfs_full_stripe_len(root, &root->fs_info->mapping_tree, start); + set_free_space_tree_thresholds(cache); + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -9281,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); return cache; } @@ -9305,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && @@ -9421,7 +9874,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) { - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { spin_lock(&info->unused_bgs_lock); /* Should always be true but just in case. */ @@ -9449,11 +9902,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -9471,7 +9924,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + trans->can_flush_pending_bgs = false; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { if (ret) goto next; @@ -9489,9 +9944,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + add_block_group_free_space(trans, root->fs_info, block_group); + /* already aborted the transaction if it failed. 
*/ next: list_del_init(&block_group->bg_list); } + trans->can_flush_pending_bgs = can_flush_pending_bgs; } int btrfs_make_block_group(struct btrfs_trans_handle *trans, @@ -9518,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; ret = exclude_super_stripes(root, cache); if (ret) { /* @@ -9534,6 +9993,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. @@ -9834,6 +10301,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); /* @@ -9875,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, unlock_chunks(root); + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) + goto out; + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9890,6 +10366,47 @@ out: return ret; } +struct btrfs_trans_handle * +btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, + const u64 chunk_offset) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map *em; + struct map_lookup *map; + unsigned int num_items; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + read_unlock(&em_tree->lock); + ASSERT(em && em->start == chunk_offset); + + /* + * We need to reserve 3 + N units from the metadata space info in order + * to remove a block group (done at btrfs_remove_chunk() and at + * btrfs_remove_block_group()), which are used for: + * + * 1 unit for adding the free space inode's orphan (located in the tree + * of tree roots). + * 1 unit for deleting the block group item (located in the extent + * tree). + * 1 unit for deleting the free space item (located in tree of tree + * roots). + * N units for deleting N device extent items corresponding to each + * stripe (located in the device tree). + * + * In order to remove a block group we also need to reserve units in the + * system space info in order to update the chunk tree (update one or + * more device items and remove one chunk item), but this is done at + * btrfs_remove_chunk() through a call to check_system_chunk(). + */ + map = (struct map_lookup *)em->bdev; + num_items = 3 + map->num_stripes; + free_extent_map(em); + + return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, + num_items, 1); +} + /* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. 
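
For a concrete sense of the reservation rule documented in btrfs_start_trans_remove_block_group() above: a SINGLE-profile chunk is backed by one device extent, so removing its block group needs 3 + 1 = 4 metadata units, while a RAID10 chunk striped over four devices needs 3 + 4 = 7. A minimal standalone sketch of that arithmetic follows; map_lookup_sketch and bg_remove_units() are simplified stand-ins for illustration, not the kernel's types:

/* Simplified stand-in for the kernel's struct map_lookup; illustration only. */
struct map_lookup_sketch {
	int num_stripes;	/* device extents backing the chunk */
};

/*
 * Mirrors the rule in the comment above: one unit each for the free
 * space inode's orphan item, the block group item and the free space
 * item, plus one unit per device extent (stripe).
 */
static unsigned int bg_remove_units(const struct map_lookup_sketch *map)
{
	return 3 + (unsigned int)map->num_stripes;	/* RAID10 x4 -> 7 */
}

btrfs_delete_unused_bgs() below uses this helper so the whole removal is covered by a single up-front reservation, with the global reserve as fallback per the _fallback_global_rsv variant of btrfs_start_transaction().
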
@@ -9908,26 +10425,30 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { u64 start, end; + int trimming; block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group_cache, bg_list); - space_info = block_group->space_info; list_del_init(&block_group->bg_list); + + space_info = block_group->space_info; + if (ret || btrfs_mixed_space_info(space_info)) { btrfs_put_block_group(block_group); continue; } spin_unlock(&fs_info->unused_bgs_lock); - mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->delete_unused_bgs_mutex); /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); if (block_group->reserved || btrfs_block_group_used(&block_group->item) || - block_group->ro) { + block_group->ro || + list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do @@ -9941,7 +10462,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); /* We don't want to force the issue, only flip if it's ok. */ - ret = set_block_group_ro(block_group, 0); + ret = inc_block_group_ro(block_group, 0); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; @@ -9952,10 +10473,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - /* 1 for btrfs_orphan_reserve_metadata() */ - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_trans_remove_block_group(fs_info, + block_group->key.objectid); if (IS_ERR(trans)) { - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); ret = PTR_ERR(trans); goto next; } @@ -9982,14 +10503,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } mutex_unlock(&fs_info->unused_bg_unpin_mutex); @@ -10007,16 +10528,47 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(root, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + spin_lock(&fs_info->unused_bgs_lock); + /* + * A concurrent scrub might have added us to the list + * fs_info->unused_bgs, so use a list_move operation + * to add the block group to the deleted_bgs list. 
+ */ + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_get_block_group(block_group); + } end_trans: btrfs_end_transaction(trans, root); next: - mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } @@ -10066,10 +10618,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) return unpin_extent_range(root, start, end, false); } +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +{ + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. */ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -10124,6 +10765,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; 
 	return ret;
 }
 
@@ -10140,8 +10793,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
-	 * Make sure counter is updated before we wake up
-	 * waiters.
+	 * Make sure counter is updated before we wake up waiters.
 	 */
 	smp_mb();
 	if (waitqueue_active(&root->subv_writers->wait))
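
The smp_mb() before waitqueue_active() in this final hunk is the classic lost-wakeup guard: the waker must make its percpu counter decrement visible before testing for sleepers, and any sleeper must queue itself before re-checking the condition. A minimal sketch of a matching sleeper, assuming a simplified condition test (this illustrates the pairing only; the helper below is hypothetical and not part of this patch):

/*
 * Hypothetical sleeper pairing with the smp_mb() + waitqueue_active()
 * waker above: wait_event() queues the task and then re-tests the
 * condition, so a decrement racing with the check cannot be missed.
 */
static void wait_for_no_snapshoting_writes_sketch(struct btrfs_root *root)
{
	wait_event(root->subv_writers->wait,
		   percpu_counter_sum(&root->subv_writers->counter) <= 0);
}

Without the barrier, the counter store could be reordered past the waitqueue_active() load, letting the waker see an empty queue while the sleeper still sees a nonzero counter and blocks indefinitely.
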