Diffstat (limited to 'fs/btrfs/qgroup.c')
-rw-r--r-- | fs/btrfs/qgroup.c | 937
1 file changed, 695 insertions, 242 deletions
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2cf905877aaf..cf5aead95a7f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@ #include "ulist.h" #include "backref.h" #include "extent_io.h" +#include "qgroup.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -84,8 +85,8 @@ struct btrfs_qgroup { /* * temp variables for accounting operations */ - u64 tag; - u64 refcnt; + u64 old_refcnt; + u64 new_refcnt; }; /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, return -ENOENT; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1174,33 +1201,198 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. 
+ * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. */ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) { - struct qgroup_update *u; + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; - BUG_ON(!trans->delayed_ref_elem.seq); - u = kmalloc(sizeof(*u), GFP_NOFS); - if (!u) + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) return -ENOMEM; - u->node = node; - u->extent_op = extent_op; - list_add_tail(&u->list, &trans->qgroup_ref_list); + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); return 0; } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. 
+ */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1211,256 +1403,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. 
+ */ + (*old_roots)++; ulist_reinit(tmp); - /* XXX id not needed */ - ret = ulist_add(tmp, qg->qgroupid, - (u64)(uintptr_t)qg, GFP_ATOMIC); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; else - ++qg->refcnt; + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; ret = ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. 
+ */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } return 0; } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes, - struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; int ret; ulist_reinit(tmp); - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); if (ret < 0) return ret; - ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * num_bytes; - qg->rfer_cmpr += sgn * num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * num_bytes; - qg->excl_cmpr += sgn * num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; + struct btrfs_qgroup_list *glist; + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } - return 0; } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. 
+ */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - int ret; + u64 cur_new_count, cur_old_count; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; - ulist_reinit(tmp); - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - if (ret < 0) - return ret; + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * num_bytes; - qg->excl_cmpr -= sgn * num_bytes; - qgroup_dirty(fs_info, qg); - } + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) - return ret; - } + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; } - } + if (dirty) + qgroup_dirty(fs_info, qg); + } return 0; } /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. 
*/ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) { - struct btrfs_root *quota_root; - u64 ref_root; - struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - u64 seq; + struct ulist_node *unode; + struct ulist_iterator uiter; int ret = 0; - int sgn; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - struct btrfs_delayed_tree_ref *ref; - ref = btrfs_delayed_node_to_tree_ref(node); - ref_root = ref->root; - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - ref_root = ref->root; - } else { - BUG(); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); - if (!is_fstree(ref_root)) { - /* - * non-fs-trees are not being accounted - */ - return 0; - } + return ret; +} - switch (node->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - sgn = 1; - seq = btrfs_tree_mod_seq_prev(node->seq); - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - seq = node->seq; - break; - case BTRFS_UPDATE_DELAYED_HEAD: - return 0; - default: - BUG(); - } +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. 
We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) return 0; - } } - mutex_unlock(&fs_info->qgroup_rescan_lock); - /* - * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass - * tree_mod_log_prev_seq(node->seq) to skip - * the delayed ref's current sequence number, because we need the state - * of the tree before the add operation. for delete operations, we pass - * (node->seq) to include the delayed ref's current sequence number, - * because we need the state of the tree after the delete operation. - */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); - if (ret < 0) - return ret; - - spin_lock(&fs_info->qgroup_lock); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) - goto unlock; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; - qgroup = find_qgroup_rb(fs_info, ref_root); + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); if (!qgroup) - goto unlock; + goto out; + seq = fs_info->qgroup_seq; /* - * step 1: for each old ref, visit all nodes once and inc refcnt + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... */ - ulist_reinit(fs_info->qgroup_ulist); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, - seq); - if (ret) - goto unlock; + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; /* - * step 2: walk from the new root + * ...in the case of removals. 
If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. */ - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes, qgroup); - if (ret) - goto unlock; + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; /* - * step 3: walk again from old refs + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. */ - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes); - if (ret) - goto unlock; + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; -unlock: + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. 
+ */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } return ret; } @@ -1629,8 +2114,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; - dstgroup->rfer = srcgroup->rfer - level_size; - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; qgroup_dirty(fs_info, dstgroup); @@ -1734,7 +2227,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -1766,7 +2259,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved += num_bytes; } @@ -1812,7 +2305,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved -= num_bytes; @@ -1848,15 +2341,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *tmp, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; u64 seq; + int new_roots; int slot; int ret; @@ -1897,8 +2390,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { - u64 num_bytes; - btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -1908,76 +2399,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, - tree_mod_seq_elem.seq, &roots); + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); if (ret < 0) goto out; spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; 
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); - if (ret) { + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); goto out; } - /* - * step2 of btrfs_qgroup_account_ref works from a single root, - * we're doing all at once here. - */ - ulist_reinit(tmp); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } - - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += num_bytes; - qg->rfer_cmpr += num_bytes; - WARN_ON(qg->tag >= seq); - if (qg->refcnt - seq == roots->nnodes) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - } - qgroup_dirty(fs_info, qg); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; } - spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ret = 0; } - out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1990,13 +2439,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; path = btrfs_alloc_path(); if (!path) goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; tmp = ulist_alloc(GFP_NOFS); if (!tmp) goto out; @@ -2015,7 +2467,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - tmp, scratch_leaf); + qgroups, tmp, scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2025,6 +2477,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); + ulist_free(qgroups); ulist_free(tmp); btrfs_free_path(path); |
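
The comment above qgroup_shared_accounting() in the diff describes the central trick of this rework: fs_info->qgroup_seq acts as a moving "zero point", so the per-qgroup old/new refcounts never have to be cleared between extents, and the final rfer/excl adjustments only compare those counts against seq and the old/new root counts. Below is a minimal userspace model of that logic (hypothetical struct model_qgroup plus bump() and adjust() helpers; it is not the kernel code, it only mirrors the conditions used in qgroup_adjust_counters() above).

/*
 * Standalone userspace model (hypothetical names, not kernel code) of the
 * seq-based refcount trick: "seq" is the zero point, so per-qgroup
 * counters never have to be reset between accounted extents.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct model_qgroup {
	uint64_t old_refcnt;	/* hits from roots that referenced the extent before */
	uint64_t new_refcnt;	/* hits from roots that reference it afterwards */
	int64_t rfer;		/* referenced bytes */
	int64_t excl;		/* exclusive bytes */
};

/* One "hit" on a qgroup while walking up from a root (as in qgroup_calc_*_refcnt). */
static void bump(uint64_t *refcnt, uint64_t seq)
{
	if (*refcnt < seq)
		*refcnt = seq + 1;	/* first root to reach this qgroup this round */
	else
		(*refcnt)++;		/* one more root reached it */
}

/* Mirrors the counter adjustments made in qgroup_adjust_counters(). */
static void adjust(struct model_qgroup *qg, uint64_t seq, int64_t num_bytes,
		   uint64_t old_roots, uint64_t new_roots)
{
	uint64_t cur_old = qg->old_refcnt < seq ? 0 : qg->old_refcnt - seq;
	uint64_t cur_new = qg->new_refcnt < seq ? 0 : qg->new_refcnt - seq;

	/* Wasn't referenced before but is now. */
	if (qg->old_refcnt <= seq && qg->new_refcnt > seq)
		qg->rfer += num_bytes;
	/* Was referenced before but isn't any more. */
	if (qg->old_refcnt > seq && qg->new_refcnt <= seq)
		qg->rfer -= num_bytes;

	/* Used to see every root pointing at the extent, but no longer does. */
	if (old_roots && cur_old == old_roots &&
	    (cur_new != new_roots || new_roots == 0))
		qg->excl -= num_bytes;

	/* Now sees every root pointing at the extent: gained exclusivity. */
	if ((!old_roots || cur_old != old_roots) && cur_new == new_roots)
		qg->excl += num_bytes;
}

int main(void)
{
	/* A 16K extent gains its first (and only) reference from one root. */
	struct model_qgroup qg = { 0 };
	uint64_t seq = 1000;	/* pretend qgroup_seq is already large */

	bump(&qg.new_refcnt, seq);		/* the adding root reaches qg */
	adjust(&qg, seq, 16384, /* old_roots */ 0, /* new_roots */ 1);

	assert(qg.rfer == 16384 && qg.excl == 16384);
	printf("rfer=%lld excl=%lld\n", (long long)qg.rfer, (long long)qg.excl);
	return 0;
}

Because qgroup_seq is advanced past the number of roots after each extent is processed, every counter left over from a previous round is at or below the new seq and therefore reads as zero, which is what lets the code skip zeroing refcounts on every qgroup.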
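
The comp_oper()/insert_qgroup_oper() hunk keeps the per-transaction operations in an rbtree ordered by (bytenr, seq, ref_root, type), which is what lets qgroup_account_deleted_refs() walk forward with rb_next() and only examine operations recorded after the current one for the same bytenr. The sketch below (hypothetical struct qgroup_op, with a sorted array standing in for the rbtree; not kernel code) illustrates that ordering; note that for the comparison to be a proper total order it returns 1, not -1, when the first operation's seq is the larger one.

/*
 * Userspace sketch (hypothetical names, not kernel code) of the ordering
 * used for the qgroup operation log: operations are totally ordered by
 * (bytenr, seq, ref_root, type), so all operations on one bytenr are
 * adjacent and sorted by the order they were recorded in.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct qgroup_op {
	uint64_t bytenr;	/* extent being referenced */
	uint64_t seq;		/* global operation sequence number */
	uint64_t ref_root;	/* root adding or dropping the ref */
	int type;		/* ADD/SUB, SHARED/EXCL */
};

static int cmp_u64(uint64_t a, uint64_t b)
{
	return a < b ? -1 : (a > b ? 1 : 0);
}

/* (bytenr, seq, ref_root, type): a strict total order over operations. */
static int comp_op(const void *l, const void *r)
{
	const struct qgroup_op *a = l, *b = r;
	int ret;

	if ((ret = cmp_u64(a->bytenr, b->bytenr)))
		return ret;
	if ((ret = cmp_u64(a->seq, b->seq)))
		return ret;	/* 1 when a->seq > b->seq */
	if ((ret = cmp_u64(a->ref_root, b->ref_root)))
		return ret;
	return (a->type > b->type) - (a->type < b->type);
}

int main(void)
{
	struct qgroup_op ops[] = {
		{ .bytenr = 8192, .seq = 7, .ref_root = 257 },
		{ .bytenr = 4096, .seq = 9, .ref_root = 258 },
		{ .bytenr = 4096, .seq = 3, .ref_root = 259 },
	};

	/* Sorting stands in for rb_link_node()/rb_insert_color(). */
	qsort(ops, 3, sizeof(ops[0]), comp_op);

	for (int i = 0; i < 3; i++)
		printf("bytenr=%llu seq=%llu\n",
		       (unsigned long long)ops[i].bytenr,
		       (unsigned long long)ops[i].seq);
	return 0;
}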