diff options
Diffstat (limited to 'fs/btrfs/qgroup.c')
-rw-r--r-- | fs/btrfs/qgroup.c | 372 |
1 files changed, 280 insertions, 92 deletions
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e473a998219..c1cd5558a646 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, parent_node = *p; entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + } else { + if (record->data_rsv && !entry->data_rsv) { + entry->data_rsv = record->data_rsv; + entry->data_rsv_refroot = + record->data_rsv_refroot; + } return 1; + } } rb_link_node(&record->node, parent_node, p); @@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kmalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), gfp_flag); if (!record) return -ENOMEM; @@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; } @@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; need_cleanup = true; } @@ -2017,86 +2023,30 @@ out: return ret; } -/* - * Inform qgroup to trace subtree swap used in balance. - * - * Unlike btrfs_qgroup_trace_subtree(), this function will only trace - * new tree blocks whose generation is equal to (or larger than) @last_snapshot. - * - * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and - * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, - * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, - * and skip all tree blocks whose generation is smaller than last_snapshot. - * - * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), - * which could be the cause of very slow balance if the file tree is large. - * - * @src_parent, @src_slot: pointer to src (file tree) eb. - * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. - */ -int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *bg_cache, - struct extent_buffer *src_parent, int src_slot, - struct extent_buffer *dst_parent, int dst_slot, - u64 last_snapshot) +static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct extent_buffer *src_eb, + struct extent_buffer *dst_eb, + u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *dst_path = NULL; - struct btrfs_key first_key; - struct extent_buffer *src_eb = NULL; - struct extent_buffer *dst_eb = NULL; - bool trace_leaf = false; - u64 child_gen; - u64 child_bytenr; int level; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - /* Check parameter order */ - if (btrfs_node_ptr_generation(src_parent, src_slot) > - btrfs_node_ptr_generation(dst_parent, dst_slot)) { + /* Wrong parameter order */ + if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, - btrfs_node_ptr_generation(src_parent, src_slot), - btrfs_node_ptr_generation(dst_parent, dst_slot)); + btrfs_header_generation(src_eb), + btrfs_header_generation(dst_eb)); return -EUCLEAN; } - /* - * Only trace leaf if we're relocating data block groups, this could - * reduce tons of data extents tracing for meta/sys bg relocation. - */ - if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) - trace_leaf = true; - /* Read out real @src_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(src_parent, src_slot); - child_gen = btrfs_node_ptr_generation(src_parent, src_slot); - btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); - - src_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(src_parent) - 1, &first_key); - if (IS_ERR(src_eb)) { - ret = PTR_ERR(src_eb); - goto out; - } - - /* Read out real @dst_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); - child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); - btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); - - dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(dst_parent) - 1, &first_key); - if (IS_ERR(dst_eb)) { - ret = PTR_ERR(dst_eb); - goto out; - } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { - ret = -EINVAL; + ret = -EIO; goto out; } @@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = -ENOMEM; goto out; } - /* For dst_path */ extent_buffer_get(dst_eb); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; - /* Do the generation-aware breadth-first search */ + /* Do the generation aware breadth-first search */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, level, last_snapshot, trace_leaf); if (ret < 0) @@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = 0; out: - free_extent_buffer(src_eb); - free_extent_buffer(dst_eb); btrfs_free_path(dst_path); if (ret < 0) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2207,7 +2154,7 @@ walk_down: path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, @@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) goto cleanup; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current @@ -2842,16 +2794,15 @@ out: /* * Two limits to commit transaction in advance. * - * For RATIO, it will be 1/RATIO of the remaining limit - * (excluding data and prealloc meta) as threshold. + * For RATIO, it will be 1/RATIO of the remaining limit as threshold. * For SIZE, it will be in byte unit as threshold. */ -#define QGROUP_PERTRANS_RATIO 32 -#define QGROUP_PERTRANS_SIZE SZ_32M +#define QGROUP_FREE_RATIO 32 +#define QGROUP_FREE_SIZE SZ_32M static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 limit; + u64 free; u64 threshold; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && @@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, */ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - limit = qg->max_excl; - else - limit = qg->max_rfer; - threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - - qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / - QGROUP_PERTRANS_RATIO; - threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; + threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } else { + free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; + threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } /* * Use transaction_kthread to commit transaction, so we no * longer need to bother nested transaction nor lock context. */ - if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + if (free < threshold) btrfs_commit_transaction_locksafe(fs_info); } @@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); qgroup_rsv_add(fs_info, qg, num_bytes, type); } @@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { @@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } extent_changeset_release(&changeset); } + +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks) +{ + int i; + + spin_lock_init(&swapped_blocks->lock); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + swapped_blocks->blocks[i] = RB_ROOT; + swapped_blocks->swapped = false; +} + +/* + * Delete all swapped blocks record of @root. + * Every record here means we skipped a full subtree scan for qgroup. + * + * Gets called when committing one transaction. + */ +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) +{ + struct btrfs_qgroup_swapped_blocks *swapped_blocks; + int i; + + swapped_blocks = &root->swapped_blocks; + + spin_lock(&swapped_blocks->lock); + if (!swapped_blocks->swapped) + goto out; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + struct rb_root *cur_root = &swapped_blocks->blocks[i]; + struct btrfs_qgroup_swapped_block *entry; + struct btrfs_qgroup_swapped_block *next; + + rbtree_postorder_for_each_entry_safe(entry, next, cur_root, + node) + kfree(entry); + swapped_blocks->blocks[i] = RB_ROOT; + } + swapped_blocks->swapped = false; +out: + spin_unlock(&swapped_blocks->lock); +} + +/* + * Add subtree roots record into @subvol_root. + * + * @subvol_root: tree root of the subvolume tree get swapped + * @bg: block group under balance + * @subvol_parent/slot: pointer to the subtree root in subvolume tree + * @reloc_parent/slot: pointer to the subtree root in reloc tree + * BOTH POINTERS ARE BEFORE TREE SWAP + * @last_snapshot: last snapshot generation of the subvolume tree + */ +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group_cache *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = subvol_root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct rb_node **cur; + struct rb_node *parent = NULL; + int level = btrfs_header_level(subvol_parent) - 1; + int ret = 0; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", + __func__, + btrfs_node_ptr_generation(subvol_parent, subvol_slot), + btrfs_node_ptr_generation(reloc_parent, reloc_slot)); + return -EUCLEAN; + } + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + ret = -ENOMEM; + goto out; + } + + /* + * @reloc_parent/slot is still before swap, while @block is going to + * record the bytenr after swap, so we do the swap here. + */ + block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); + block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, + reloc_slot); + block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); + block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, + subvol_slot); + block->last_snapshot = last_snapshot; + block->level = level; + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + block->trace_leaf = true; + else + block->trace_leaf = false; + btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); + + /* Insert @block into @blocks */ + spin_lock(&blocks->lock); + cur = &blocks->blocks[level].rb_node; + while (*cur) { + struct btrfs_qgroup_swapped_block *entry; + + parent = *cur; + entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, + node); + + if (entry->subvol_bytenr < block->subvol_bytenr) { + cur = &(*cur)->rb_left; + } else if (entry->subvol_bytenr > block->subvol_bytenr) { + cur = &(*cur)->rb_right; + } else { + if (entry->subvol_generation != + block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != + block->reloc_generation) { + /* + * Duplicated but mismatch entry found. + * Shouldn't happen. + * + * Marking qgroup inconsistent should be enough + * for end users. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EEXIST; + } + kfree(block); + goto out_unlock; + } + } + rb_link_node(&block->node, parent, cur); + rb_insert_color(&block->node, &blocks->blocks[level]); + blocks->swapped = true; +out_unlock: + spin_unlock(&blocks->lock); +out: + if (ret < 0) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + +/* + * Check if the tree block is a subtree root, and if so do the needed + * delayed subtree trace for qgroup. + * + * This is called during btrfs_cow_block(). + */ +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *subvol_eb) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct extent_buffer *reloc_eb = NULL; + struct rb_node *node; + bool found = false; + bool swapped = false; + int level = btrfs_header_level(subvol_eb); + int ret = 0; + int i; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + return 0; + + spin_lock(&blocks->lock); + if (!blocks->swapped) { + spin_unlock(&blocks->lock); + return 0; + } + node = blocks->blocks[level].rb_node; + + while (node) { + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + if (block->subvol_bytenr < subvol_eb->start) { + node = node->rb_left; + } else if (block->subvol_bytenr > subvol_eb->start) { + node = node->rb_right; + } else { + found = true; + break; + } + } + if (!found) { + spin_unlock(&blocks->lock); + goto out; + } + /* Found one, remove it from @blocks first and update blocks->swapped */ + rb_erase(&block->node, &blocks->blocks[level]); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (RB_EMPTY_ROOT(&blocks->blocks[i])) { + swapped = true; + break; + } + } + blocks->swapped = swapped; + spin_unlock(&blocks->lock); + + /* Read out reloc subtree root */ + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, + block->reloc_generation, block->level, + &block->first_key); + if (IS_ERR(reloc_eb)) { + ret = PTR_ERR(reloc_eb); + reloc_eb = NULL; + goto free_out; + } + if (!extent_buffer_uptodate(reloc_eb)) { + ret = -EIO; + goto free_out; + } + + ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, + block->last_snapshot, block->trace_leaf); +free_out: + kfree(block); + free_extent_buffer(reloc_eb); +out: + if (ret < 0) { + btrfs_err_rl(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + return ret; +} |