diff options
Diffstat (limited to 'fs/btrfs')
53 files changed, 4763 insertions, 2818 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4dabeb893b7c..1ce06c849a86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,9 +85,10 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -132,7 +133,7 @@ static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh) { diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e386c29ef1f6..b0b093b6afec 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,9 +64,11 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f55721ff9385..802fabb30e15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. @@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -880,6 +891,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * + * NOTE: This can return values > 0 + * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -918,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -932,9 +953,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1032,7 +1054,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; @@ -1198,6 +1223,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +/** + * btrfs_check_shared - tell us whether an extent is shared + * + * @trans: optional trans handle + * + * btrfs_check_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. + */ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 root_objectid, u64 inum, u64 bytenr) @@ -1206,7 +1244,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; tmp = ulist_alloc(GFP_NOFS); @@ -1226,11 +1264,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, root_objectid, inum); if (ret == BACKREF_FOUND_SHARED) { + /* this is the only condition under which we return 1 */ ret = 1; break; } if (ret < 0 && ret != -ENOENT) break; + ret = 0; node = ulist_next(tmp, &uiter); if (!node) break; @@ -1610,7 +1650,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de5e4f2adfea..81220b2203c6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* DIO is ready to submit */ +#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an @@ -66,7 +68,11 @@ struct btrfs_inode { */ struct btrfs_key location; - /* Lock for counters */ + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). + */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ @@ -250,6 +256,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); if (BTRFS_I(inode)->logged_trans == generation && BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && @@ -263,9 +272,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) */ smp_mb(); if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) - return 1; + ret = 1; } - return 0; + spin_unlock(&BTRFS_I(inode)->lock); + return ret; } #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index d897ef803b3b..ce7dec88f4b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, - GFP_NOFS); + mapped_datav = kmalloc_array(bio->bi_vcnt, + sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; @@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - if (is_vmalloc_addr(state)) - vfree(state); - else - kfree(state); + kvfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e9df8862012c..ce62324c78e7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->orig_bio = bio; nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); - cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) goto fail1; @@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -static struct btrfs_compress_op *btrfs_compress_op[] = { +static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d181f70caae0..13a4dc0436c9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ struct btrfs_compress_op { size_t srclen, size_t destlen); }; -extern struct btrfs_compress_op btrfs_zlib_compress; -extern struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d67f32e648d..54114b4887dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); if (!tm_list) return -ENOMEM; @@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), flags); if (!tm_list) { ret = -ENOMEM; @@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return 0; - tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) return 0; nritems = btrfs_header_nritems(eb); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), - GFP_NOFS); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); *last_ref = 1; } return 0; @@ -1440,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1678,7 +1678,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr); + cur = btrfs_find_tree_block(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -1686,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1865,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -1943,7 +1946,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1997,7 +2000,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2041,7 +2044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2259,7 +2262,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb = btrfs_find_tree_block(root, search); + eb = btrfs_find_tree_block(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2319,7 +2322,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1); + eb = btrfs_find_tree_block(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2332,7 +2335,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2); + eb = btrfs_find_tree_block(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2450,7 +2453,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root, blocknr); + tmp = btrfs_find_tree_block(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -2495,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now @@ -3126,7 +3129,8 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -3137,7 +3141,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + tree_mod_log_set_node_key(fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -3151,7 +3155,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -3173,7 +3178,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* @@ -3692,7 +3697,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root, left); + clean_tree_block(trans, root->fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3704,7 +3709,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); + clean_tree_block(trans, root->fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3928,10 +3933,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -4168,6 +4173,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -4271,10 +4277,10 @@ again: btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, + write_extent_buffer(right, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + write_extent_buffer(right, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); @@ -4297,7 +4303,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4615,7 +4621,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } item = btrfs_item_nr(slot); @@ -4716,7 +4722,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4888,7 +4894,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(root->fs_info, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4981,7 +4987,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); + clean_tree_block(trans, root->fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -4990,7 +4996,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9c89cae39ee..aac314e14188 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1061,6 +1061,12 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + /* * is subvolume quota turned on? */ @@ -1256,6 +1262,20 @@ struct btrfs_caching_control { atomic_t count; }; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache { /* For dirty block groups */ struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; }; /* delayed seq elem */ @@ -1329,6 +1352,8 @@ struct seq_list { u64 seq; }; +#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -1472,6 +1497,12 @@ struct btrfs_fs_info { struct mutex chunk_mutex; struct mutex volume_mutex; + /* + * this is taken to make sure we don't set block groups ro after + * the free space cache has been allocated on them + */ + struct mutex ro_block_group_mutex; + /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time @@ -1513,6 +1544,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + struct rw_semaphore delayed_iput_sem; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -1587,10 +1619,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1666,6 +1695,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1703,7 +1733,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -1748,6 +1778,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; + struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -3295,6 +3326,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) } /* extent-tree.c */ + +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); + static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { @@ -3385,6 +3419,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int no_quota); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, @@ -3417,10 +3453,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); @@ -3477,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3486,7 +3527,8 @@ int btrfs_previous_item(struct btrfs_root *root, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -4011,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4026,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4072,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4093,6 +4144,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -4180,7 +4232,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || - (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) return 1; return 0; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 82f0c7c95474..a2ae42720a6a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1383,7 +1383,7 @@ out: static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { struct btrfs_async_delayed_work *async_work; @@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; - btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); return 0; } @@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + struct btrfs_fs_info *fs_info = root->fs_info; delayed_root = btrfs_get_delayed_root(root); @@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) seq = atomic_read(&delayed_root->items_seq); - ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); if (ret) return; @@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) return; } - btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ @@ -1801,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6d16bea94e1c..ac3e81da6d4e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - } - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. - */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -489,11 +342,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, * existing and update must have the same bytenr */ static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, +update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; + int old_ref_mod; existing_ref = btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); @@ -541,7 +396,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ + old_ref_mod = existing_ref->total_ref_mod; existing->ref_mod += update->ref_mod; + existing_ref->total_ref_mod += update->ref_mod; + + /* + * If we are going to from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. + */ + if (existing_ref->is_data) { + if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + delayed_refs->pending_csums -= existing->num_bytes; + if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + delayed_refs->pending_csums += existing->num_bytes; + } spin_unlock(&existing_ref->lock); } @@ -553,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -603,8 +473,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; + head_ref->total_ref_mod = count_mod; + + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -614,7 +497,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(&existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -622,6 +505,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (is_data && count_mod < 0) + delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -641,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -675,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. + */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -703,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -740,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -772,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -782,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto free_head_ref; } head_ref->extent_op = extent_op; @@ -796,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -805,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -821,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -836,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -845,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -873,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a764e2340d48..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste or memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /*data/tree ref use list, stored in ref_head->ref_list. */ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,11 +99,19 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. + */ + int total_ref_mod; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -124,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -138,6 +165,8 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + u64 pending_csums; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need @@ -146,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manully + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5ec03d999c37..564a7de17d99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -670,8 +673,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -806,7 +809,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; kfree(status_args); - do_div(progress, 10); + progress = div_u64(progress, 10); printk_in_rcu(KERN_INFO "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", dev_replace->srcdev->missing ? "<missing disk>" : diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639f2663ed3f..f556c3732c2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -54,7 +54,7 @@ #include <asm/cpufeature.h> #endif -static struct extent_io_ops btree_extent_io_ops; +static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, @@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result) * compute the csum for a btree block, and either verify it or write it * into the csum field of the block. */ -static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, +static int csum_tree_block(struct btrfs_fs_info *fs_info, + struct extent_buffer *buf, int verify) { - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, offset += cur_len; } if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + result = kzalloc(csum_size, GFP_NOFS); if (!result) return 1; } else { @@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, buf->start, + fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); @@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb) if (memcmp(raw_disk_sb, result, csum_size)) ret = 1; - - if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING - "BTRFS: super block crcs don't match, older mkfs detected\n"); - ret = 0; - } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { @@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * we only fill in the checksum field in the first page of a multi-page block */ -static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; @@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) found_start = btrfs_header_bytenr(eb); if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - csum_tree_block(root, eb, 0); + csum_tree_block(fs_info, eb, 0); return 0; } -static int check_tree_block_fsid(struct btrfs_root *root, +static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; @@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(root, eb)) { + if (check_tree_block_fsid(root->fs_info, eb)) { printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; @@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root, eb, 1); + ret = csum_tree_block(root->fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) break; } @@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { - return find_extent_buffer(root->fs_info, bytenr); + return find_extent_buffer(fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -1154,22 +1149,21 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; } -void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1515,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret = -ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -1751,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1779,7 +1773,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1788,6 +1781,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -1796,6 +1799,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } @@ -2146,6 +2177,271 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } } +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); +} + +static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(fs_info->btree_inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + init_waitqueue_head(&fs_info->replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); + + /* + * a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "BTRFS: log replay required " + "on RO media\n"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, log_tree_root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + fs_info->generation + 1); + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return -EIO; + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return ret; + } + + if (fs_info->sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + if (ret) + return ret; + } + + return 0; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); + + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + return ret; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2160,21 +2456,12 @@ int open_ctree(struct super_block *sb, struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; - struct btrfs_root *extent_root; - struct btrfs_root *csum_root; struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; int max_active; - int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - bool create_uuid_tree; - bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); @@ -2241,13 +2528,14 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); - mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); + init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2276,7 +2564,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); + fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2294,55 +2582,18 @@ int open_ctree(struct super_block *sb, } btrfs_init_delayed_root(fs_info->delayed_root); - mutex_init(&fs_info->scrub_lock); - atomic_set(&fs_info->scrubs_running, 0); - atomic_set(&fs_info->scrub_pause_req, 0); - atomic_set(&fs_info->scrubs_paused, 0); - atomic_set(&fs_info->scrub_cancel_req, 0); - init_waitqueue_head(&fs_info->replace_wait); - init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; #endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; - fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - set_nlink(fs_info->btree_inode, 1); - /* - * we set the i_size on the btree inode to the max possible int. - * the real end of the address space is determined by all of - * the devices in the system - */ - fs_info->btree_inode->i_size = OFFSET_MAX; - fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - - RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); - extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); - - BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - set_bit(BTRFS_INODE_DUMMY, - &BTRFS_I(fs_info->btree_inode)->runtime_flags); - btrfs_insert_inode_hash(fs_info->btree_inode); + btrfs_init_btree_inode(fs_info, tree_root); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2363,26 +2614,14 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); - fs_info->dev_replace.lock_owner = 0; - atomic_set(&fs_info->dev_replace.nesting_level, 0); - mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); - spin_lock_init(&fs_info->qgroup_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - fs_info->qgroup_seq = 1; - fs_info->quota_enabled = 0; - fs_info->pending_quota_state = 0; - fs_info->qgroup_ulist = NULL; - mutex_init(&fs_info->qgroup_rescan_lock); + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2554,75 +2793,9 @@ int open_ctree(struct super_block *sb, max_active = fs_info->thread_pool_size; - fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); - - fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - - fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - - fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); - - /* - * a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - - fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); - - /* - * endios are largely parallel and should have a very - * low idle thresh - */ - fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); - fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); - fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); - fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); - fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); - fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); - fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); - fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); - fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); - - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && - fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && - fs_info->endio_repair_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && - fs_info->qgroup_rescan_workers)) { - err = -ENOMEM; + ret = btrfs_init_workqueues(fs_info, fs_devices); + if (ret) { + err = ret; goto fail_sb_buffer; } @@ -2665,10 +2838,11 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2688,7 +2862,7 @@ int open_ctree(struct super_block *sb, * keep the device that is marked to be the target device for the * dev_replace procedure */ - btrfs_close_extra_devices(fs_info, fs_devices, 0); + btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_ERR "BTRFS: failed to read devices on %s\n", @@ -2702,11 +2876,11 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } @@ -2714,61 +2888,9 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = 0; - - extent_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(extent_root)) { - ret = PTR_ERR(extent_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); - fs_info->extent_root = extent_root; - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - dev_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(dev_root)) { - ret = PTR_ERR(dev_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); - fs_info->dev_root = dev_root; - btrfs_init_devices_late(fs_info); - - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - csum_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(csum_root)) { - ret = PTR_ERR(csum_root); + ret = btrfs_read_roots(fs_info, tree_root); + if (ret) goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); - fs_info->csum_root = csum_root; - - location.objectid = BTRFS_QUOTA_TREE_OBJECTID; - quota_root = btrfs_read_tree_root(tree_root, &location); - if (!IS_ERR(quota_root)) { - set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); - fs_info->quota_enabled = 1; - fs_info->pending_quota_state = 1; - fs_info->quota_root = quota_root; - } - - location.objectid = BTRFS_UUID_TREE_OBJECTID; - uuid_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - if (ret != -ENOENT) - goto recovery_tree_root; - create_uuid_tree = true; - check_uuid_tree = false; - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); - fs_info->uuid_root = uuid_root; - create_uuid_tree = false; - check_uuid_tree = - generation != btrfs_super_uuid_tree_generation(disk_super); - } fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2792,12 +2914,24 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_info, fs_devices, 1); + btrfs_close_extra_devices(fs_devices, 1); + + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -2806,7 +2940,7 @@ retry_root_backup: goto fail_sysfs; } - ret = btrfs_read_block_groups(extent_root); + ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); goto fail_sysfs; @@ -2864,48 +2998,11 @@ retry_root_backup: /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { - u64 bytenr = btrfs_super_log_root(disk_super); - - if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); - err = -EIO; - goto fail_qgroup; - } - - log_tree_root = btrfs_alloc_root(fs_info); - if (!log_tree_root) { - err = -ENOMEM; - goto fail_qgroup; - } - - __setup_root(nodesize, sectorsize, stripesize, - log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - log_tree_root->node = read_tree_block(tree_root, bytenr, - generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); - goto fail_qgroup; - } - /* returns with log_tree_root freed on success */ - ret = btrfs_recover_log_trees(log_tree_root); + ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { - btrfs_error(tree_root->fs_info, ret, - "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + err = ret; goto fail_qgroup; } - - if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - if (ret) - goto fail_qgroup; - } } ret = btrfs_find_orphan_roots(tree_root); @@ -2966,7 +3063,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (create_uuid_tree) { + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { @@ -2975,8 +3072,9 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (check_uuid_tree || - btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + fs_info->generation != + btrfs_super_uuid_tree_generation(disk_super)) { pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3011,6 +3109,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3225,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3257,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); @@ -3668,7 +3762,7 @@ void close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - btrfs_err(root->fs_info, "commit super ret %d", ret); + btrfs_err(fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) @@ -3680,14 +3774,15 @@ void close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); - btrfs_free_qgroup_config(root->fs_info); + btrfs_free_qgroup_config(fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - btrfs_info(root->fs_info, "at unmount delalloc count %lld", + btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -3723,7 +3818,7 @@ void close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); - btrfs_free_block_rsv(root, root->orphan_block_rsv); + __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; lock_chunks(root); @@ -4016,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4031,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } @@ -4134,7 +4229,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start); + eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; if (!eb) continue; @@ -4285,7 +4380,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static struct extent_io_ops btree_extent_io_ops = { +static const struct extent_io_ops btree_extent_io_ops = { .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 27d44c0fd236..d4cbfeeeedd4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); + struct btrfs_fs_info *fs_info, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 37d164540c3a..8d052209f473 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -220,8 +220,8 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = child->d_inode; - struct inode *dir = parent->d_inode; + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b353ad02f03..07204bf601ed 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. - */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else { BUG(); } @@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + struct btrfs_delayed_ref_node *ref; + + if (list_empty(&head->ref_list)) + return NULL; /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry(ref, &head->ref_list, list) { if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); } - return last; + + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2538,6 +2498,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -2561,8 +2527,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ spin_lock(&delayed_refs->lock); avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - avg = div64_u64(avg, 4); - fs_info->avg_delayed_ref_runtime = avg; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ spin_unlock(&delayed_refs->lock); } return 0; @@ -2624,7 +2589,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) * We don't ever fill up leaves all the way so multiply by 2 just to be * closer to what we're really going to want to ouse. */ - return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csumm'ed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; } int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, @@ -2632,7 +2616,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, { struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 num_bytes; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; num_bytes = btrfs_calc_trans_metadata_size(root, 1); @@ -2640,17 +2626,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, if (num_heads > 1) num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); global_rsv = &root->fs_info->global_block_rsv; /* * If we can't allocate any more chunks lets make sure we have _lots_ of * wiggle room since running delayed refs can create more delayed refs. */ - if (global_rsv->space_info->full) + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; num_bytes <<= 1; + } spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes) + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) ret = 1; spin_unlock(&global_rsv->lock); return ret; @@ -2833,9 +2824,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2874,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2903,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3147,10 +3130,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); fail: - if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_release_path(path); return ret; } @@ -3193,7 +3174,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct inode *inode = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - int num_pages = 0; + u64 num_pages = 0; int retries = 0; int ret = 0; @@ -3267,15 +3248,14 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE) || - block_group->delalloc_bytes) { + !btrfs_test_opt(root, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3293,14 +3273,14 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages, num_pages); if (ret) goto out_put; @@ -3351,16 +3331,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, return 0; } +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * this can happen if something re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } + + btrfs_free_path(path); + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; + int should_put; struct btrfs_path *path; - - if (list_empty(&cur_trans->dirty_bgs)) - return 0; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3376,16 +3528,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ list_del_init(&cache->dirty_list); - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); - if (!ret) - ret = btrfs_run_delayed_refs(trans, root, - (unsigned long) -1); - if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) - btrfs_write_out_cache(root, trans, cache, path); + should_put = 1; + + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } + + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); } @@ -3445,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3473,7 +3674,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3635,19 +3839,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; + int ret = 0; + int need_commit = 2; + int have_pinned_space; /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); if (btrfs_is_free_space_inode(inode)) { - committed = 1; + need_commit = 0; ASSERT(current->journal_info); } @@ -3669,7 +3875,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full && alloc_chunk) { + if (!data_sinfo->full) { u64 alloc_target; data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; @@ -3697,8 +3903,10 @@ alloc: if (ret < 0) { if (ret != -ENOSPC) return ret; - else + else { + have_pinned_space = 1; goto commit_trans; + } } if (!data_sinfo) @@ -3709,26 +3917,42 @@ alloc: /* * If we don't have enough pinned space to deal with this - * allocation don't bother committing the transaction. + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. */ - if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, - bytes) < 0) - committed = 1; + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ commit_trans: - if (!committed && + if (need_commit && !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; + need_commit--; + + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; + if (have_pinned_space >= 0 || + trans->transaction->have_free_bgs || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * make sure that all running delayed iput are + * done + */ + down_write(&root->fs_info->delayed_iput_sem); + up_write(&root->fs_info->delayed_iput_sem); + goto again; + } else { + btrfs_end_transaction(trans, root); + } } trace_btrfs_space_reservation(root->fs_info, @@ -3736,12 +3960,16 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } + ret = btrfs_qgroup_reserve(root, write_bytes); + if (ret) + goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); +out: spin_unlock(&data_sinfo->lock); - return 0; + return ret; } /* @@ -3819,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -3833,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -3861,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -3966,6 +4227,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -4298,8 +4577,13 @@ out: static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, struct btrfs_fs_info *fs_info, u64 used) { - return (used >= div_factor_fine(space_info->total_bytes, 98) && - !btrfs_fs_closing(fs_info) && + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } @@ -4354,10 +4638,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (!btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) return; - } while (flush_state <= COMMIT_TRANS); - - if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) - queue_work(system_unbound_wq, work); + } while (flush_state < COMMIT_TRANS); } void btrfs_init_async_reclaim_work(struct work_struct *work) @@ -4700,6 +4981,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -4812,10 +5098,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); + num_bytes += div_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } @@ -4912,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -4998,8 +5302,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -5066,30 +5368,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, int reserve) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - int num_csums; - int old_csums; + u64 old_csums, num_csums; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && BTRFS_I(inode)->csum_bytes == 0) return 0; - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); if (reserve) BTRFS_I(inode)->csum_bytes += num_bytes; else BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5163,8 +5453,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5172,8 +5461,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->nodesize); + btrfs_qgroup_free(root, nr_extents * root->nodesize); goto out_fail; } @@ -5290,10 +5578,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - dropped * root->nodesize); - } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -5318,7 +5602,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); if (ret) return ret; @@ -5390,14 +5674,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5446,6 +5722,16 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->unused_bgs_lock); } } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -5834,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -5852,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6036,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6098,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; - - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6135,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -6956,15 +7230,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, return -ENOSPC; } - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); - if (pin) pin_down_extent(root, cache, start, len, 1); else { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -7045,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. */ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7095,9 +7362,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_path(path); btrfs_free_and_pin_reserved_extent(root, ins->objectid, root->nodesize); - btrfs_free_path(path); return ret; } @@ -7133,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7217,7 +7476,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -7311,7 +7570,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * returns the key for the extent through ins, and a tree buffer for * the first block of the extent through buf. * - * returns the tree buffer or NULL. + * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7322,6 +7581,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; u32 blocksize = root->nodesize; @@ -7342,13 +7602,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } + if (ret) + goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -7358,9 +7619,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; extent_op = btrfs_alloc_delayed_extent_op(); - BUG_ON(!extent_op); /* -ENOMEM */ + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -7375,13 +7638,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); - BUG_ON(ret); /* -ENOMEM */ + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + extent_op, 0); + if (ret) + goto out_free_delayed; } return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } struct walk_control { @@ -7482,12 +7756,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify related function to add related node/leaf to dirty_extent_root, + * for later qgroup accounting. + * + * Current, this function does nothing. + */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7510,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7585,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including complete shared node) + * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7647,7 +7922,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7658,16 +7937,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -7815,7 +8084,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr); + next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); if (!next) @@ -7878,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8016,7 +8287,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root, eb); + clean_tree_block(trans, root->fs_info, eb); } if (eb == root->node) { @@ -8260,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling. This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8331,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -8533,10 +8778,48 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); +again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (trans->transaction->dirty_bg_run) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8549,8 +8832,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); } + mutex_unlock(&root->fs_info->ro_block_group_mutex); btrfs_end_transaction(trans, root); return ret; @@ -8720,7 +9006,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) min_free <<= 1; } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); + min_free = div64_u64(min_free, dev_min); } /* We need to do this so that we can look at pending chunks */ @@ -8992,6 +9278,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9247,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9254,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { @@ -9355,7 +9659,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free spache cache IO is done before remove the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); if (ret) { @@ -9448,18 +9783,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - btrfs_put_block_group(block_group); + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); } spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); memcpy(&key, &block_group->key, sizeof(key)); @@ -9574,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); @@ -9647,8 +9995,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); block_group->pinned = 0; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. @@ -9658,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans, root); next: + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d688cfe5d496..02d05817cbdf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } @@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; @@ -4514,8 +4519,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } ret = fiemap_fill_next_extent(fieinfo, em_start, disko, em_len, flags); - if (ret) + if (ret) { + if (ret == 1) + ret = 0; goto out_free; + } } out_free: free_extent_map(em); @@ -4557,36 +4565,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) do { index--; page = eb->pages[index]; - if (page && mapped) { + if (!page) + continue; + if (mapped) spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. + * We need to make sure we haven't be attached + * to a new eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ page_cache_release(page); } + + if (mapped) + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ + page_cache_release(page); } while (index != 0); } @@ -4768,6 +4777,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } mark_extent_buffer_accessed(eb, NULL); return eb; } @@ -4867,6 +4895,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, mark_extent_buffer_accessed(exists, p); goto free_eb; } + exists = NULL; /* * Do this so attach doesn't complain and we need to @@ -4930,12 +4959,12 @@ again: return eb; free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); for (i = 0; i < num_pages; i++) { if (eb->pages[i]) unlock_page(eb->pages[i]); } - WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 695b0ccfb755..c668f36898d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -97,7 +97,7 @@ struct extent_io_tree { u64 dirty_bytes; int track_uptodate; spinlock_t lock; - struct extent_io_ops *ops; + const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 84a2d1868271..58ece6558430 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, - GFP_NOFS); + btrfs_bio->csum_allocated = kmalloc_array(nblocks, + csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); return -ENOMEM; @@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root, btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(root, path, key); + btrfs_set_item_key_safe(root->fs_info, path, key); } else { BUG(); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30982bbd31c3..b823fac91c92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> -#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -32,6 +31,7 @@ #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - if (need_resched()) { - spin_unlock(&fs_info->defrag_inodes_lock); - cond_resched(); - spin_lock(&fs_info->defrag_inodes_lock); - } + cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } @@ -868,7 +864,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1126,7 +1122,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1160,7 +1156,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes); + ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); if (ret == -ENOSPC && (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) { @@ -1635,8 +1631,8 @@ again: btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { - u64 lockstart = round_down(pos, root->sectorsize); - u64 lockend = lockstart + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + (dirty_pages << PAGE_CACHE_SHIFT) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -1739,28 +1735,20 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, u64 start_pos; u64 end_pos; ssize_t num_written = 0; - ssize_t err = 0; - size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - loff_t pos = iocb->ki_pos; + ssize_t err; + loff_t pos; + size_t count; mutex_lock(&inode->i_mutex); - - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) { + err = generic_write_checks(iocb, from); + if (err <= 0) { mutex_unlock(&inode->i_mutex); - goto out; - } - - if (count == 0) { - mutex_unlock(&inode->i_mutex); - goto out; + return err; } - iov_iter_truncate(from, count); - - err = file_remove_suid(file); + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1786,6 +1774,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); + pos = iocb->ki_pos; + count = iov_iter_count(from); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ @@ -1800,7 +1790,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (file->f_flags & O_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); @@ -1815,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0) @@ -1870,12 +1862,13 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1904,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1976,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2168,7 +2163,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(root, path, &key); + btrfs_set_item_key_safe(root->fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2551,7 +2546,6 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2576,14 +2570,9 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); if (ret) return ret; - if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); - if (ret) - goto out_reserve_fail; - } mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); @@ -2673,23 +2662,35 @@ static long btrfs_fallocate(struct file *file, int mode, 1 << inode->i_blkbits, offset + len, &alloc_hint); - - if (ret < 0) { - free_extent_map(em); - break; - } } else if (actual_end > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + /* * We didn't need to allocate any more space, but we * still extended the size of the file so we need to - * update i_size. + * update i_size and the inode item. */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, + NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, + root); + } } free_extent_map(em); + if (ret < 0) + break; cur_offset = last_byte; if (cur_offset >= alloc_end) { @@ -2701,9 +2702,6 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, alloc_end - alloc_start); -out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; @@ -2806,8 +2804,6 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a71978578fa7..fb5a6b1c62a6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_mask(inode->i_mapping) & + ~(__GFP_FS | __GFP_HIGHMEM)); return inode; } @@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root, key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } + leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); @@ -225,9 +226,39 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode) { int ret = 0; + struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; + + if (!path) { + ret = -ENOMEM; + goto fail; + } + + if (block_group) { + locked = true; + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, its no longer + * setup or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + } + btrfs_free_path(path); btrfs_i_size_write(inode, 0); truncate_pagecache(inode, 0); @@ -235,15 +266,19 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, /* * We don't need an orphan item because truncating the free space cache * will never be split across transactions. + * We don't need to check for -EAGAIN because we're a free space + * cache inode */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); + +fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -269,18 +304,7 @@ static int readahead_cache(struct inode *inode) return 0; } -struct io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_root *root; - unsigned long size; - int index; - int num_pages; - unsigned check_crcs:1; -}; - -static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, struct btrfs_root *root, int write) { int num_pages; @@ -296,45 +320,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) return -ENOSPC; - memset(io_ctl, 0, sizeof(struct io_ctl)); + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); - io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->root = root; io_ctl->check_crcs = check_crcs; + io_ctl->inode = inode; return 0; } -static void io_ctl_free(struct io_ctl *io_ctl) +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); + io_ctl->pages = NULL; } -static void io_ctl_unmap_page(struct io_ctl *io_ctl) +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { - kunmap(io_ctl->page); io_ctl->cur = NULL; io_ctl->orig = NULL; } } -static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; - io_ctl->cur = kmap(io_ctl->page); + io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_CACHE_SIZE; if (clear) memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); } -static void io_ctl_drop_pages(struct io_ctl *io_ctl) +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; @@ -349,7 +374,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, int uptodate) { struct page *page; @@ -383,7 +408,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, return 0; } -static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *val; @@ -406,7 +431,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl->cur += sizeof(u64); } -static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *gen; @@ -435,7 +460,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return 0; } -static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; @@ -453,13 +478,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; - kunmap(io_ctl->pages[0]); } -static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; @@ -473,10 +497,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; - kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, @@ -492,7 +515,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) return 0; } -static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; @@ -522,7 +545,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, return 0; } -static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; @@ -545,7 +568,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) return 0; } -static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we @@ -562,7 +585,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) } } -static int io_ctl_read_entry(struct io_ctl *io_ctl, +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; @@ -589,7 +612,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, return 0; } -static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; @@ -648,7 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct io_ctl io_ctl; + struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); @@ -877,7 +900,7 @@ out: } static noinline_for_stack -int write_cache_extent_entries(struct io_ctl *io_ctl, +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, int *entries, int *bitmaps, @@ -885,6 +908,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, { int ret; struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; @@ -896,6 +920,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, } if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } @@ -919,9 +945,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); cluster = NULL; } } + if (cluster_locked) { + spin_unlock(&cluster_locked->lock); + cluster_locked = NULL; + } /* * Make sure we don't miss any range that was removed from our rbtree @@ -939,6 +971,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, return 0; fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); return -ENOSPC; } @@ -1000,7 +1034,7 @@ fail: static noinline_for_stack int write_pinned_extent_entries(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, - struct io_ctl *io_ctl, + struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; @@ -1050,7 +1084,7 @@ write_pinned_extent_entries(struct btrfs_root *root, } static noinline_for_stack int -write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct list_head *pos, *n; int ret; @@ -1083,10 +1117,7 @@ static int flush_dirty_cache(struct inode *inode) } static void noinline_for_stack -cleanup_write_cache_enospc(struct inode *inode, - struct io_ctl *io_ctl, - struct extent_state **cached_state, - struct list_head *bitmap_list) +cleanup_bitmap_list(struct list_head *bitmap_list) { struct list_head *pos, *n; @@ -1095,12 +1126,85 @@ cleanup_write_cache_enospc(struct inode *inode, list_entry(pos, struct btrfs_free_space, list); list_del_init(&entry->list); } +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state, GFP_NOFS); } +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + if (block_group) + root = root->fs_info->tree_root; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + } + btrfs_update_inode(trans, root, inode); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + /** * __btrfs_write_out_cache - write out cached info to an inode * @root - the root the inode belongs to @@ -1112,27 +1216,29 @@ cleanup_write_cache_enospc(struct inode *inode, * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. + * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 offset) { struct extent_state *cached_state = NULL; - struct io_ctl io_ctl; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; + int must_iput = 0; if (!i_size_read(inode)) - return -1; + return -EIO; - ret = io_ctl_init(&io_ctl, inode, root, 1); + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, root, 1); if (ret) - return -1; + return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); @@ -1143,55 +1249,59 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; + must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, inode, 0); + if (ret) + goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state); - io_ctl_set_generation(&io_ctl, trans->transid); + io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ - ret = write_cache_extent_entries(&io_ctl, ctl, + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + if (ret) + goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. */ - ret = write_bitmap_entries(&io_ctl, &bitmap_list); + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), &cached_state); if (ret) goto out_nospc; @@ -1202,30 +1312,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * Release the pages and unlock the extent, we will flush * them out later */ - io_ctl_drop_pages(&io_ctl); + io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - /* Flush the dirty pages in the cache file. */ - ret = flush_dirty_cache(inode); + /* + * at this point the pages are under IO and we're happy, + * The caller is responsible for waiting on them and updating the + * the cache and the inode + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); if (ret) goto out; - /* Update the cache item to tell everyone this cache file is valid. */ - ret = update_cache_item(trans, root, inode, path, offset, - entries, bitmaps); + return 0; + out: - io_ctl_free(&io_ctl); + io_ctl->inode = NULL; + io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); return ret; +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + out_nospc: - cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1241,7 +1365,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; - enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1250,34 +1373,34 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } - - if (block_group->delalloc_bytes) { - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, + &block_group->io_ctl, trans, path, block_group->key.objectid); if (ret) { - dcs = BTRFS_DC_ERROR; - ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->key.objectid); #endif + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); } - spin_lock(&block_group->lock); - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - iput(inode); + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put the inode + */ + return ret; } @@ -1298,11 +1421,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u64 bytes_per_bitmap; + u32 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1521,10 +1644,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max(max_bitmaps, 1); + max_bitmaps = max_t(u32, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1537,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * - div64_u64(size, 1024 * 1024 * 1024); + div_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1552,14 +1675,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } /* - * we want the extent entry threshold to always be at most 1/2 the maxw + * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = - div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1673,7 +1796,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; - do_div(tmp, align); + tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { @@ -2402,11 +2525,8 @@ static void __btrfs_remove_free_space_cache_locked( } else { free_bitmap(ctl, info); } - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } } @@ -2431,11 +2551,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); spin_unlock(&ctl->tree_lock); @@ -3346,13 +3463,29 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; + struct btrfs_io_ctl io_ctl; + bool release_metadata = true; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + memset(&io_ctl, 0, sizeof(io_ctl)); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, + trans, path, 0); + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; + ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } + if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 88b2238a0aed..a16a029ad3b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,6 +48,8 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; +struct btrfs_io_ctl; + struct inode *lookup_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset); int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); - struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); int create_free_ino_inode(struct btrfs_root *root, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 74faea3a516e..d4a582ac3f73 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; struct btrfs_free_space *info; struct rb_node *n; u64 count; @@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) return; while (1) { + bool add_to_ctl = true; + + spin_lock(rbroot_lock); n = rb_first(rbroot); - if (!n) + if (!n) { + spin_unlock(rbroot_lock); break; + } info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->ino_cache_progress) - goto free; + add_to_ctl = false; else if (info->offset + info->bytes > root->ino_cache_progress) count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; - __btrfs_add_free_space(ctl, info->offset, count); -free: rb_erase(&info->offset_index, rbroot); - kfree(info); + spin_unlock(rbroot_lock); + if (add_to_ctl) + __btrfs_add_free_space(ctl, info->offset, count); + kmem_cache_free(btrfs_free_space_cachep, info); } } @@ -456,7 +463,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2e732d7af52..e33dff356460 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> -#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -43,6 +42,7 @@ #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -59,6 +59,7 @@ #include "backref.h" #include "hash.h" #include "props.h" +#include "qgroup.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -470,7 +471,7 @@ again: */ if (inode_need_compress(inode)) { WARN_ON(pages); - pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ goto cont; @@ -752,7 +753,6 @@ retry: } goto out_free; } - /* * here we're doing allocation and writeback of the * compressed pages @@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; + down_read(&fs_info->delayed_iput_sem); + spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } + + up_read(&root->fs_info->delayed_iput_sem); } /* @@ -3628,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == root->fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; - rdev = btrfs_inode_rdev(leaf, inode_item); - - BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - -cache_index: path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4016,16 +4023,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (ret) goto out; @@ -4124,7 +4131,7 @@ out: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; @@ -4151,7 +4158,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); @@ -4162,6 +4169,21 @@ out: return err; } +static int truncate_space_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytes_deleted) +{ + int ret; + + bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, + bytes_deleted, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->bytes_reserved += bytes_deleted; + return ret; + +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4187,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4197,9 +4219,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret; int err = 0; u64 ino = btrfs_ino(inode); + u64 bytes_deleted = 0; + bool be_nice = 0; + bool should_throttle = 0; + bool should_end = 0; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + /* + * for non-free space inodes and ref cows, we want to back off from + * time to time + */ + if (!btrfs_is_free_space_inode(inode) && + test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + be_nice = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4229,6 +4263,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + /* + * with a 16K leaf size and 128MB extents, you can actually queue + * up a huge file in a single leaf. Most of the time that + * bytes_deleted is > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (btrfs_should_end_transaction(trans, root)) { + err = -EAGAIN; + goto error; + } + } + + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { @@ -4371,22 +4418,39 @@ delete: } else { break; } + should_throttle = 0; + if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); + bytes_deleted += extent_num_bytes; ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset, 0); BUG_ON(ret); + if (btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_async_run_delayed_refs(root, + trans->delayed_ref_updates * 2, 0); + if (be_nice) { + if (truncate_space_check(trans, root, + extent_num_bytes)) { + should_end = 1; + } + if (btrfs_should_throttle_delayed_refs(trans, + root)) { + should_throttle = 1; + } + } } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot) { + path->slots[0] != pending_del_slot || + should_throttle || should_end) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4399,6 +4463,23 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); + if (should_throttle) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + /* + * if we failed to refill our space rsv, bail out + * and let the transaction restart + */ + if (should_end) { + err = -EAGAIN; + goto error; + } goto search_again; } else { path->slots[0]--; @@ -4412,10 +4493,20 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_free_path(path); + + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } return err; } @@ -4826,7 +4917,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; @@ -4894,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. + */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); @@ -4924,6 +5032,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; + int steal_from_global = 0; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); int ret; @@ -4991,9 +5100,20 @@ void btrfs_evict_inode(struct inode *inode) * hard as possible to get this to work. */ if (ret) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + steal_from_global++; + else + steal_from_global = 0; + ret = 0; - if (ret) { + /* + * steal_from_global == 0: we reserved stuff, hooray! + * steal_from_global == 1: we didn't reserve stuff, boo! + * steal_from_global == 2: we've committed, still not a lot of + * room but maybe we'll have room in the global reserve this + * time. + * steal_from_global == 3: abandon all hope! + */ + if (steal_from_global > 2) { btrfs_warn(root->fs_info, "Could not get space for a delete, will truncate on mount %d", ret); @@ -5009,10 +5129,40 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * We can't just steal from the global reserve, we need tomake + * sure there is room to do it, if not we need to commit and try + * again. + */ + if (steal_from_global) { + if (!btrfs_check_space_for_delayed_refs(trans, root)) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, + min_size); + else + ret = -ENOSPC; + } + + /* + * Couldn't steal from the global reserve, we have too much + * pending stuff built up, commit the transaction and try it + * again. + */ + if (ret) { + ret = btrfs_commit_transaction(trans, root); + if (ret) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + continue; + } else { + steal_from_global = 0; + } + trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC) + if (ret != -ENOSPC && ret != -EAGAIN) break; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -5416,10 +5566,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; @@ -6226,7 +6376,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); u64 index; int err; int drop_inode = 0; @@ -7396,6 +7546,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7721,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -7745,7 +7894,6 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); @@ -8013,9 +8161,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8032,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8060,28 +8207,58 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + bio_endio(io_bio, ret); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } -static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, +static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8096,7 +8273,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io goto out; /* If this is a write we don't need to check anymore */ - if (rw & WRITE) + if (iov_iter_rw(iter) == WRITE) return 0; /* * Check to make sure we don't have duplicate iov_base's in this @@ -8114,8 +8291,8 @@ out: return retval; } -static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -8126,10 +8303,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) + if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) return 0; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb__after_atomic(); /* @@ -8144,7 +8321,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, filemap_fdatawrite_range(inode->i_mapping, offset, offset + count - 1); - if (rw & WRITE) { + if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update * the isize, but it is protected by i_mutex. So we can @@ -8169,26 +8346,35 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { - inode_dio_done(inode); + inode_dio_end(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; } - ret = __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (rw & WRITE) { + ret = __blockdev_direct_IO(iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iter, offset, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from submitting stage, + * btrfs_get_blocsk_direct() has free'd data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } out: if (wakeup) - inode_dio_done(inode); + inode_dio_end(inode); if (relock) mutex_lock(&inode->i_mutex); @@ -8581,7 +8767,7 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -ENOSPC) { + if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } @@ -8875,7 +9061,7 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { u64 delalloc_bytes; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); @@ -8896,8 +9082,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; @@ -9009,7 +9195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.len); } else { ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, + d_inode(old_dentry), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9033,12 +9219,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, + d_inode(new_dentry), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, new_dentry->d_inode); + ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -9451,6 +9637,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74609b931ba5..0770c91586ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 { static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff); + u64 off, u64 olen, u64 olen_aligned, u64 destoff, + int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -456,6 +457,13 @@ static noinline int create_subvol(struct inode *dir, if (ret) return ret; + /* + * Don't create subvolume whose level is not zero. Or qgroup will be + * screwed up since it assume subvolme qgroup's level to be 0. + */ + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * The same as the snapshot creation, please see the comment @@ -546,8 +554,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -717,7 +725,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -761,10 +769,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; - if (!victim->d_inode) + if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(victim->d_parent->d_inode != dir); + BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -772,8 +780,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || + IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -792,7 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -810,7 +818,7 @@ static noinline int btrfs_mksubvol(struct path *parent, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = parent->dentry->d_inode; + struct inode *dir = d_inode(parent->dentry); struct dentry *dentry; int error; @@ -824,7 +832,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out_dput; error = btrfs_may_create(dir, dentry); @@ -1311,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be @@ -1361,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -1564,7 +1572,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - do_div(new_size, root->sectorsize); + new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", @@ -2264,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2275,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2294,7 +2314,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, { struct dentry *parent = file->f_path.dentry; struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2333,12 +2353,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_dir; } - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { err = -ENOENT; goto out_dput; } - inode = dentry->d_inode; + inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* @@ -2403,11 +2423,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, "Attempt to delete subvolume %llu during send", dest->root_key.objectid); err = -EPERM; - goto out_dput; + goto out_unlock_inode; } - d_invalidate(dentry); - down_write(&root->fs_info->subvol_sem); err = may_destroy_subvol(dest); @@ -2498,9 +2516,10 @@ out_up_write: root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } +out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { - shrink_dcache_sb(root->fs_info->sb); + d_invalidate(dentry); btrfs_invalidate_inodes(dest); d_delete(dentry); ASSERT(dest->send_in_progress == 0); @@ -2747,14 +2766,11 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, u64 off) +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - pgoff_t index; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - index = off >> PAGE_CACHE_SHIFT; - page = grab_cache_page(inode->i_mapping, index); if (!page) return NULL; @@ -2775,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) return page; } +static int gather_extent_pages(struct inode *inode, struct page **pages, + int num_pages, u64 off) +{ + int i; + pgoff_t index = off >> PAGE_CACHE_SHIFT; + + for (i = 0; i < num_pages; i++) { + pages[i] = extent_same_get_page(inode, index + i); + if (!pages[i]) + return -ENOMEM; + } + return 0; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2800,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); - mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); } -static void btrfs_double_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + if (inode1 != inode2) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) { if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + if (inode1 != inode2) lock_extent_range(inode2, loff2, len); +} + +struct cmp_pages { + int num_pages; + struct page **src_pages; + struct page **dst_pages; +}; + +static void btrfs_cmp_data_free(struct cmp_pages *cmp) +{ + int i; + struct page *pg; + + for (i = 0; i < cmp->num_pages; i++) { + pg = cmp->src_pages[i]; + if (pg) + page_cache_release(pg); + pg = cmp->dst_pages[i]; + if (pg) + page_cache_release(pg); + } + kfree(cmp->src_pages); + kfree(cmp->dst_pages); +} + +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, + struct inode *dst, u64 dst_loff, + u64 len, struct cmp_pages *cmp) +{ + int ret; + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + struct page **src_pgarr, **dst_pgarr; + + /* + * We must gather up all the pages before we initiate our + * extent locking. We use an array for the page pointers. Size + * of the array is bounded by len, which is in turn bounded by + * BTRFS_MAX_DEDUPE_LEN. + */ + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + if (!src_pgarr || !dst_pgarr) { + kfree(src_pgarr); + kfree(dst_pgarr); + return -ENOMEM; } + cmp->num_pages = num_pages; + cmp->src_pages = src_pgarr; + cmp->dst_pages = dst_pgarr; + + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + if (ret) + goto out; + + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + +out: + if (ret) + btrfs_cmp_data_free(cmp); + return 0; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len) + u64 dst_loff, u64 len, struct cmp_pages *cmp) { int ret = 0; + int i; struct page *src_page, *dst_page; unsigned int cmp_len = PAGE_CACHE_SIZE; void *addr, *dst_addr; + i = 0; while (len) { if (len < PAGE_CACHE_SIZE) cmp_len = len; - src_page = extent_same_get_page(src, loff); - if (!src_page) - return -EINVAL; - dst_page = extent_same_get_page(dst, dst_loff); - if (!dst_page) { - page_cache_release(src_page); - return -EINVAL; - } + BUG_ON(i >= cmp->num_pages); + + src_page = cmp->src_pages[i]; + dst_page = cmp->dst_pages[i]; + addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -2857,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, kunmap_atomic(addr); kunmap_atomic(dst_addr); - page_cache_release(src_page); - page_cache_release(dst_page); if (ret) break; - loff += cmp_len; - dst_loff += cmp_len; len -= cmp_len; + i++; } return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2884,28 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; + struct cmp_pages cmp; + int same_inode = 0; + u64 same_lock_start = 0; + u64 same_lock_len = 0; - /* - * btrfs_clone() can't handle extents in the same file - * yet. Once that works, we can drop this check and replace it - * with a check for the same inode, but overlapping extents. - */ if (src == dst) - return -EINVAL; + same_inode = 1; + + if (len == 0) + return 0; - btrfs_double_lock(src, loff, dst, dst_loff, len); + if (same_inode) { + mutex_lock(&src->i_mutex); - ret = extent_same_check_offsets(src, loff, len); - if (ret) - goto out_unlock; + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, len); - if (ret) - goto out_unlock; + /* + * Single inode case wants the same checks, except we + * don't want our length pushed out past i_size as + * comparing that data range makes no sense. + * + * extent_same_check_offsets() will do this for an + * unaligned length at i_size, so catch it here and + * reject the request. + * + * This effectively means we require aligned extents + * for the single-inode case, whereas the other cases + * allow an unaligned length so long as it ends at + * i_size. + */ + if (len != olen) { + ret = -EINVAL; + goto out_unlock; + } + + /* Check for overlapping ranges */ + if (dst_loff + len > loff && dst_loff < loff + len) { + ret = -EINVAL; + goto out_unlock; + } + + same_lock_start = min_t(u64, loff, dst_loff); + same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + btrfs_double_inode_lock(src, dst); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + } /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != @@ -2914,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, goto out_unlock; } - ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + if (ret) + goto out_unlock; + + if (same_inode) + lock_extent_range(src, same_lock_start, same_lock_len); + else + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + + /* pass original length for comparison so we stay within i_size */ + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + + if (same_inode) + unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, + same_lock_start + same_lock_len - 1); + else + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_cmp_data_free(&cmp); out_unlock: - btrfs_double_unlock(src, loff, dst, dst_loff, len); + if (same_inode) + mutex_unlock(&src->i_mutex); + else + btrfs_double_inode_unlock(src, dst); return ret; } @@ -2929,7 +3090,7 @@ out_unlock: static long btrfs_ioctl_file_extent_same(struct file *file, struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_args *same = NULL; struct btrfs_ioctl_same_extent_info *info; struct inode *src = file_inode(file); u64 off; @@ -2959,6 +3120,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (IS_ERR(same)) { ret = PTR_ERR(same); + same = NULL; goto out; } @@ -3029,6 +3191,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, out: mnt_drop_write_file(file); + kfree(same); return ret; } @@ -3039,7 +3202,7 @@ out: static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 disko) { - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist *roots; struct ulist_iterator uiter; struct ulist_node *root_node = NULL; @@ -3071,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, - const u64 olen) + const u64 olen, + int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (!no_time_update) + inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -3162,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode, * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen, extent_same uses - * identical values here + * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff) + const u64 destoff, int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -3202,6 +3367,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { + u64 next_key_min_offset = key.offset + 1; + /* * note the key will change type as we walk through the * tree. @@ -3282,7 +3449,7 @@ process_slot: } else if (key.offset >= off + len) { break; } - + next_key_min_offset = key.offset + datal; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), @@ -3421,6 +3588,20 @@ process_slot: u64 trim = 0; u64 aligned_end = 0; + /* + * Don't copy an inline extent into an offset + * greater than zero. Having an inline extent + * at such an offset results in chaos as btrfs + * isn't prepared for such cases. Just skip + * this case for the same reasons as commented + * at btrfs_ioctl_clone(). + */ + if (last_dest_end > 0) { + ret = -EOPNOTSUPP; + btrfs_end_transaction(trans, root); + goto out; + } + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3490,14 +3671,15 @@ process_slot: root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen); + destoff, olen, + no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) break; } btrfs_release_path(path); - key.offset++; + key.offset = next_key_min_offset; } ret = 0; @@ -3528,7 +3710,7 @@ process_slot: clone_update_extent_map(inode, trans, NULL, last_dest_end, destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); + destoff, olen, no_time_update); } out: @@ -3626,6 +3808,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; + if (len == 0) { + ret = 0; + goto out_unlock; + } + /* verify the end result is block aligned */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || !IS_ALIGNED(destoff, bs)) @@ -3660,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(inode, destoff, len); } - ret = btrfs_clone(src, inode, off, olen, len, destoff); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); if (same_inode) { u64 lock_start = min_t(u64, off, destoff); @@ -4624,6 +4811,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) sa->src, sa->dst); } + /* update qgroup status and info */ + err = btrfs_run_qgroups(trans, root->fs_info); + if (err < 0) + btrfs_error(root->fs_info, ret, + "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -4669,8 +4861,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->create) { - ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, - NULL); + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 617553cdb7d3..a2f051347731 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -434,7 +434,7 @@ out: return ret; } -struct btrfs_compress_op btrfs_lzo_compress = { +const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index b7816cefbd13..1b10a3cd1195 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor) if (factor == 10) return num; num *= factor; - do_div(num, 10); - return num; + return div_u64(num, 10); } static inline u64 div_factor_fine(u64 num, int factor) @@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor) if (factor == 100) return num; num *= factor; - do_div(num, 100); - return num; + return div_u64(num, 100); } #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..52170cf1757e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. + */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->log_list)); + ASSERT(list_empty(&entry->trans_list)); + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { @@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); @@ -722,6 +734,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { int ret = 0; + int ret_wb = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -741,9 +754,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (ret) return ret; - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); - if (ret) - return ret; + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; while (1) { @@ -767,7 +785,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) break; end--; } - return ret; + return ret_wb ? ret_wb : ret; } /* @@ -838,6 +856,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd805..7176cc0fe43f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 129b1dd28527..dca137b04095 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode) return NULL; } + + diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 058c79eecbfb..8a8202956576 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. */ @@ -644,9 +676,8 @@ out: } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 qgroupid, - u64 flags, u64 max_rfer, u64 max_excl, - u64 rsv_rfer, u64 rsv_excl) + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) { struct btrfs_path *path; struct btrfs_key key; @@ -657,7 +688,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; - key.offset = qgroupid; + key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) @@ -673,11 +704,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); - btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); - btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); - btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); - btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); - btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); btrfs_mark_buffer_dirty(l); @@ -967,6 +998,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -982,7 +1014,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(trans, tree_root, quota_root->node); + clean_tree_block(trans, tree_root->fs_info, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1001,6 +1033,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + * + * Caller should hold fs_info->qgroup_lock. + */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + u64 num_bytes, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just update all parent will be enough. + * Or we needs to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for quick update, return >0 for need to full rescan + * and mark INCONSISTENT flag. + * Return < 0 for other error. + */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup->excl, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { @@ -1008,8 +1144,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1043,23 +1188,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); return ret; } -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, +int __del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; int err; - mutex_lock(&fs_info->qgroup_ioctl_lock); + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; @@ -1088,14 +1243,27 @@ exist: spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, fs_info, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) + struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -1133,6 +1301,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); @@ -1147,15 +1316,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } else { - /* check if there are no relations to this qgroup */ - if (!list_empty(&qgroup->groups) || - !list_empty(&qgroup->members)) { + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } } ret = del_qgroup_item(trans, quota_root, qgroupid); + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, fs_info, + qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1171,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. + * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; @@ -1184,307 +1367,140 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = update_qgroup_limit_item(trans, quota_root, qgroupid, - limit->flags, limit->max_rfer, - limit->max_excl, limit->rsv_rfer, - limit->rsv_excl); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, "unable to update quota limit for %llu", - qgroupid); - } spin_lock(&fs_info->qgroup_lock); - qgroup->lim_flags = limit->flags; - qgroup->max_rfer = limit->max_rfer; - qgroup->max_excl = limit->max_excl; - qgroup->rsv_rfer = limit->rsv_rfer; - qgroup->rsv_excl = limit->rsv_excl; - spin_unlock(&fs_info->qgroup_lock); -out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); - return ret; -} - -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} - -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; - - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; + qgroup->max_rfer = limit->max_rfer; } } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; -} - -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; - - spin_lock(&fs_info->qgroup_op_lock); - p = &fs_info->qgroup_op_tree.rb_node; - while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { - p = &(*p)->rb_left; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; + qgroup->max_excl = limit->max_excl; } } - rb_link_node(&oper->n, parent, p); - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. - */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; } } + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); - ret = insert_qgroup_oper(fs_info, oper); + ret = update_qgroup_limit_item(trans, quota_root, qgroup); if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + qgroupid); } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } -/* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and - * exclusive counts adjusted. - */ -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - struct btrfs_qgroup *qgroup; - struct ulist *tmp; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; - int sign = 0; + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). + */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); if (ret < 0) - goto out; + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } + return ret; +} - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = u64_to_ptr(unode->aux); - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - qgroup_dirty(fs_info, qgroup); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) +{ + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - ret = 0; -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. - */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1499,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. - */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1540,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. - */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. */ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). + * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1706,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. - */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + } - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. - */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } } + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. - */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. - * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = {}; + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); - if (ret < 0) - goto out; - - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. - */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. + * Bump qgroup_seq to avoid seq overlap */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. - */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = {}; - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. - */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. - */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - BUG_ON(!fs_info->quota_root); - - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. - */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2156,6 +1811,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; spin_lock(&fs_info->qgroup_lock); } if (fs_info->quota_enabled) @@ -2219,6 +1878,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } + + if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { + ret = -EINVAL; + goto out; + } ++i_qgroups; } } @@ -2230,17 +1894,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (ret) goto out; - if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { - ret = update_qgroup_limit_item(trans, quota_root, objectid, - inherit->lim.flags, - inherit->lim.max_rfer, - inherit->lim.max_excl, - inherit->lim.rsv_rfer, - inherit->lim.rsv_excl); - if (ret) - goto out; - } - if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; @@ -2286,6 +1939,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, goto unlock; } + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + dstgroup->qgroupid); + goto unlock; + } + } + if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) @@ -2302,6 +1971,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); } @@ -2358,12 +2035,6 @@ out: return ret; } -/* - * reserve some space for a qgroup and all its parents. The reservation takes - * place with start_transaction or dealloc_reserve, similar to ENOSPC - * accounting. If not enough space is available, EDQUOT is returned. - * We assume that the requested space is new for all qgroups. - */ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; @@ -2513,19 +2184,17 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) /* * returns < 0 on error, 0 when more leafs are to be scanned. - * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + * returns 1 when done. */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2575,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2615,19 +2266,13 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; + int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2643,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2653,14 +2298,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err == 2 && + if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (err < 0) { @@ -2668,13 +2311,33 @@ out: } mutex_unlock(&fs_info->qgroup_rescan_lock); + /* + * only update status, since the previous part has alreay updated the + * qgroup info. + */ + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_err(fs_info, + "fail to start transaction for status update: %d\n", + err); + goto done; + } + ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d\n", err); + } + btrfs_end_transaction(trans, fs_info->quota_root); + if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err == 2 ? " (inconsistency flag cleared)" : ""); + err > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } +done: complete_all(&fs_info->qgroup_rescan_completion); } @@ -2709,7 +2372,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); goto err; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 18cc68ca3090..6387dcfa354c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. - * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -70,8 +45,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); + struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, @@ -80,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5264858ed768..fa72068bd256 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) { - if (is_vmalloc_addr(x)) - vfree(x); - else - kfree(x); - } + if (x) + kvfree(x); return 0; } @@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); - if (is_vmalloc_addr(info->stripe_hash_table)) - vfree(info->stripe_hash_table); - else - kfree(info->stripe_hash_table); + kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } @@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->real_stripes * sizeof(void *), - GFP_NOFS); + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; goto cleanup_io; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d83085381bcc..88cbb5995667 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? -ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -3027,7 +3036,7 @@ int prealloc_file_extent_cluster(struct inode *inode, mutex_lock(&inode->i_mutex); ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start); + 1 - cluster->start, 0); if (ret) goto out; @@ -3430,7 +3439,9 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct inode *inode, u64 ino) + struct btrfs_block_group_cache *block_group, + struct inode *inode, + u64 ino) { struct btrfs_key key; struct btrfs_root *root = fs_info->tree_root; @@ -3463,7 +3474,7 @@ truncate: goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -3509,6 +3520,7 @@ static int find_data_references(struct reloc_control *rc, */ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { ret = delete_block_group_cache(rc->extent_root->fs_info, + rc->block_group, NULL, ref_objectid); if (ret != -ENOENT) return ret; @@ -4037,7 +4049,7 @@ restart: if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->extent_root, rc->block_group->flags); - if (ret == 0) { + if (ret == 1) { err = 0; progress = 0; goto restart; @@ -4223,7 +4235,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, inode, 0); + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); else ret = PTR_ERR(inode); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec57687c9a4d..94db0fa5225a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * the statistics. */ - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); + sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, + sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - int offset; + u32 offset; int nsectors; int sectorsize = sparity->sctx->dev_root->sectorsize; @@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - offset = (int)do_div(start, sparity->stripe_len); + start = div_u64_rem(start, sparity->stripe_len, &offset); offset /= sectorsize; nsectors = (int)len / sectorsize; @@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num, int j = 0; u64 stripe_nr; u64 last_offset; - int stripe_index; - int rot; + u32 stripe_index; + u32 rot; last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); @@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = *offset; - do_div(stripe_nr, map->stripe_len); - do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ - rot = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2664,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +{ + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); + struct scrub_ctx *sctx = sparity->sctx; + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + static void scrub_parity_bio_endio(struct bio *bio, int error) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - struct scrub_ctx *sctx = sparity->sctx; if (error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2995,10 +3005,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nstripes = length; physical = map->stripes[num].physical; offset = 0; - do_div(nstripes, map->stripe_len); + nstripes = div_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3562,8 +3571,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - int ret = 0; - int flags = WQ_FREEZABLE | WQ_UNBOUND; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { @@ -3575,27 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, fs_info->scrub_workers = btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4); - if (!fs_info->scrub_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_workers) + goto fail_scrub_workers; + fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue("btrfs-scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_wr_completion_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_wr_completion_workers) + goto fail_scrub_wr_completion_workers; + fs_info->scrub_nocow_workers = btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_nocow_workers) + goto fail_scrub_nocow_workers; + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) + goto fail_scrub_parity_workers; } ++fs_info->scrub_workers_refcnt; -out: - return ret; + return 0; + +fail_scrub_parity_workers: + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); +fail_scrub_nocow_workers: + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); +fail_scrub_wr_completion_workers: + btrfs_destroy_workqueue(fs_info->scrub_workers); +fail_scrub_workers: + return -ENOMEM; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) @@ -3604,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d6033f540cc7..aa72bfd28f7d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1158,6 +1159,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; while (*p) { parent = *p; @@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3067,48 +3100,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } -static int path_loop(struct send_ctx *sctx, struct fs_path *name, - u64 ino, u64 gen, u64 *ancestor_ino) -{ - int ret = 0; - u64 parent_inode = 0; - u64 parent_gen = 0; - u64 start_ino = ino; - - *ancestor_ino = 0; - while (ino != BTRFS_FIRST_FREE_OBJECTID) { - fs_path_reset(name); - - if (is_waiting_for_rm(sctx, ino)) - break; - if (is_waiting_for_move(sctx, ino)) { - if (*ancestor_ino == 0) - *ancestor_ino = ino; - ret = get_first_ref(sctx->parent_root, ino, - &parent_inode, &parent_gen, name); - } else { - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, - &parent_gen, name); - if (ret > 0) { - ret = 0; - break; - } - } - if (ret < 0) - break; - if (parent_inode == start_ino) { - ret = 1; - if (*ancestor_ino == 0) - *ancestor_ino = ino; - break; - } - ino = parent_inode; - gen = parent_gen; - } - return ret; -} - static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3120,7 +3111,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; - u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3152,22 +3142,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; - ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); - if (ret) { - LIST_HEAD(deleted_refs); - ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); - ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs, - pm->is_orphan); - if (ret < 0) - goto out; - if (rmdir_ino) { - dm = get_waiting_dir_move(sctx, pm->ino); - ASSERT(dm); - dm->rmdir_ino = rmdir_ino; - } - goto out; - } fs_path_reset(name); to_path = name; name = NULL; @@ -3412,8 +3386,40 @@ out: return ret; } +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3433,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { - ret = 1; + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3479,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3610,10 +3629,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { + struct name_cache_entry *nce; + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -3631,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3651,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4550,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); @@ -5852,19 +5905,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(clone_root); goto out; } - clone_sources_to_rollback = i + 1; spin_lock(&clone_root->root_item_lock); - clone_root->send_in_progress++; - if (!btrfs_root_readonly(clone_root)) { + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } + clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; } vfree(clone_sources_tmp); clone_sources_tmp = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 05fef198ff94..cd7ef34d2dce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } + } + + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,49 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && sb->s_root->d_inode == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -1099,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1129,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. + * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; + char *buf, *dst, *sep; - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ - - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. - */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1294,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1314,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1383,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); @@ -1714,7 +1834,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); avail_space *= BTRFS_STRIPE_LEN; /* @@ -1886,8 +2006,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; return 0; } @@ -1908,6 +2028,17 @@ static struct file_system_type btrfs_fs_type = { }; MODULE_ALIAS_FS("btrfs"); +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. + */ + file->private_data = NULL; + return 0; +} + /* * used by btrfsctl to scan devices when no FS is mounted */ @@ -2009,6 +2140,7 @@ static const struct super_operations btrfs_super_ops = { }; static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = btrfs_control_ioctl, .owner = THIS_MODULE, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 94edb0a2a026..603b0cc2b9bb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,28 +439,36 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; -static u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[3] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if (!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. + * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f7dd298b3cf6..6392527bcc15 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -61,17 +61,33 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) /* convert from attribute */ -#define to_btrfs_feature_attr(a) \ - container_of(a, struct btrfs_feature_attr, kobj_attr) -#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) -#define attr_to_btrfs_feature_attr(a) \ - to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +static inline struct btrfs_feature_attr * +to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static inline struct btrfs_feature_attr * +attr_to_btrfs_feature_attr(struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 73f299ebdabb..846d277b1901 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include "../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,21 +228,28 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 5); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't havee the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -301,26 +331,35 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 256); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8be4278e25e8..f5021fcb154e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -35,7 +35,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), @@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + if (transaction->delayed_refs.pending_csums) + printk(KERN_ERR "pending csums is %llu\n", + transaction->delayed_refs.pending_csums); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + + cond_resched_lock(&tree->lock); } spin_unlock(&tree->lock); } @@ -222,13 +222,17 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); + cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -250,6 +254,9 @@ loop: INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + mutex_init(&cur_trans->cache_write_mutex); + cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -504,6 +511,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -721,7 +729,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates); + err = btrfs_run_delayed_refs(trans, root, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -753,7 +761,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->ordered)) { spin_lock(&info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); spin_unlock(&info->trans_lock); } @@ -787,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1057,6 +1067,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1110,7 +1121,7 @@ again: return ret; } - while (!list_empty(dirty_bgs)) { + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans, root); if (ret) return ret; @@ -1284,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { @@ -1292,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1390,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. - */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1491,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1810,8 +1834,39 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } + if (!cur_trans->dirty_bg_run) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after a extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds dirty_bg_run = 1 + * + * The dirty_bg_run flag is also used to make sure only + * one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either. + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (!cur_trans->dirty_bg_run) { + run_it = 1; + cur_trans->dirty_bg_run = 1; + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + if (run_it) + ret = btrfs_start_dirty_block_groups(trans, root); + } + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + spin_lock(&root->fs_info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1926,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Reocrd old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -1967,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting. + */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2003,6 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); @@ -2016,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2076,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; @@ -2085,6 +2162,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 937050a2b68e..eb09c2067fa8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -64,9 +64,19 @@ struct btrfs_transaction { struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; + struct list_head io_bgs; + u64 num_dirty_bgs; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout. This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; + int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -92,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -136,9 +147,34 @@ struct btrfs_pending_snapshot { static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); +} + +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. + */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9578..a4b9c8b2d35a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5b8ba37f88e..9c45431e69ab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -492,11 +492,19 @@ insert: if (btrfs_inode_generation(eb, src_item) == 0) { struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. + */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && - S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { struct btrfs_map_token token; - u64 ino_size = btrfs_inode_size(eb, src_item); btrfs_init_map_token(&token); btrfs_set_token_inode_size(dst_eb, dst_item, @@ -1951,6 +1959,104 @@ out: return ret; } +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size_nr(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + /* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. It @@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, @@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); + clean_tree_block(trans, log->fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; @@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) @@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in a fsync log that when replayed, our + * file inode would have a link count of 1, but we get + * two directory entries pointing to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which resulted + * always in stale file handle errors, and would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. + */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; } path->slots[0] = nritems; @@ -3175,7 +3321,8 @@ done: static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { u64 min_key; u64 max_key; @@ -3187,7 +3334,7 @@ again: max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, + dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3734,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) @@ -3963,7 +4104,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, if (ret < 0) { return ret; } else if (ret > 0) { - *size_ret = i_size_read(inode); + *size_ret = 0; } else { struct btrfs_inode_item *item; @@ -3976,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) <crash/power failure> + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4014,6 +4336,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4070,10 +4393,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) { - max_key_type = BTRFS_INODE_EXTREF_KEY; - max_key.type = max_key_type; - } + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { @@ -4098,7 +4419,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { - max_key.type = BTRFS_INODE_EXTREF_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4106,30 +4427,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); + while(1) { + ret = btrfs_truncate_inode_items(trans, + log, inode, 0, 0); + if (ret != -EAGAIN) + break; + } } - } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); + if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; - } else { - max_key.type = BTRFS_INODE_EXTREF_KEY; - } + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4152,6 +4467,28 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4219,9 +4556,26 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have completed @@ -4277,15 +4631,18 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4327,9 +4684,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) goto out; - inode = parent->d_inode; + inode = d_inode(parent); } while (1) { @@ -4355,7 +4712,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; if (IS_ROOT(parent)) @@ -4364,7 +4721,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = parent->d_inode; + inode = d_inode(parent); } dput(old_parent); @@ -4372,6 +4729,181 @@ out: return ret; } +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about the why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()). On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4394,6 +4926,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, const struct dentry * const first_parent = parent; const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; sb = inode->i_sb; @@ -4449,11 +4983,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; - inode = parent->d_inode; + inode = d_inode(parent); if (root != BTRFS_I(inode)->root) break; @@ -4485,7 +5022,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, dput(old_parent); old_parent = parent; } - ret = 0; + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; end_trans: dput(old_parent); if (ret < 0) { @@ -4515,7 +5055,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, start, end, 0, ctx); dput(parent); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 154990c26dcb..6916a781ea02 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -29,6 +29,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; int io_err; + bool log_new_dentries; struct list_head list; }; @@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; + ctx->log_new_dentries = false; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b2778a..91feb2bdefee 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604bbe..a01a2c45825f 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8222f6f74147..fbe7c104531c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -345,7 +349,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our @@ -366,8 +370,8 @@ loop_lock: btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) - cond_resched(); + + cond_resched(); /* * we made progress, there is more work to do and the bdi @@ -400,8 +404,7 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) - cond_resched(); + cond_resched(); continue; } spin_lock(&device->io_lock); @@ -442,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * Todo: This won't be enough. What if the same device + * comes back (with new uuid and) with its mapper path? + * But for now, this does help as mostly an admin will + * either use mapper or non mapper path throughout. + */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -557,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -609,8 +673,7 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -695,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1060,6 +1123,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, struct extent_map *em; struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; + u64 physical_start = *start; again: list_for_each_entry(em, search_list, list) { @@ -1068,15 +1132,31 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; - if (map->stripes[i].physical >= *start + len || + if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= - *start) + physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. + */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } if (search_list == &trans->transaction->pending_chunks) { @@ -1136,11 +1216,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: + max_hole_start = search_start; max_hole_size = 0; - hole_size = 0; +again: if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto out; @@ -1195,8 +1275,14 @@ again: */ if (contains_pending_extent(trans, device, &search_start, - hole_size)) - hole_size = 0; + hole_size)) { + if (key.offset >= search_start) { + hole_size = key.offset - search_start; + } else { + WARN_ON_ONCE(1); + hole_size = 0; + } + } if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1233,21 +1319,23 @@ next: * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. */ - if (search_end > search_start) + if (search_end > search_start) { hole_size = search_end - search_start; - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } + if (contains_pending_extent(trans, device, &search_start, + hole_size)) { + btrfs_release_path(path); + goto again; + } - if (contains_pending_extent(trans, device, &search_start, hole_size)) { - btrfs_release_path(path); - goto again; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } } /* See above. */ - if (hole_size < num_bytes) + if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; @@ -1699,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1868,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2204,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2245,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2282,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2487,8 +2579,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, + struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { int ret; @@ -2580,7 +2671,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct map_lookup *map; u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - u64 chunk_tree = root->fs_info->chunk_root->objectid; int i, ret = 0; /* Just in case */ @@ -2604,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2634,8 +2727,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } } - ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -2664,8 +2756,8 @@ out: } static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) + u64 chunk_objectid, + u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2674,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; + /* + * Prevent races with automatic removal of unused block groups. + * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. + */ + ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); + ret = btrfs_can_relocate(extent_root, chunk_offset); if (ret) return -ENOSPC; @@ -2707,7 +2813,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; - u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; bool retried = false; int failed = 0; @@ -2723,13 +2828,18 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto error; + } BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto error; if (ret > 0) @@ -2744,7 +2854,7 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); if (ret == -ENOSPC) @@ -2752,6 +2862,7 @@ again: else BUG_ON(ret); } + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (found_key.offset == 0) break; @@ -3022,7 +3133,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); + stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) @@ -3208,9 +3319,12 @@ again: goto error; } + mutex_lock(&fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } /* * this shouldn't happen, it means the last relocate @@ -3222,6 +3336,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); ret = 0; break; } @@ -3230,8 +3345,10 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != key.objectid) + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); break; + } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3244,10 +3361,13 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); - if (!ret) + if (!ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; + } if (counting) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3255,9 +3375,9 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, found_key.objectid, found_key.offset); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto error; if (ret == -ENOSPC) { @@ -3906,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3957,13 +4077,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; int ret; int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -3997,11 +4117,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto done; + } ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto done; if (ret) { @@ -4015,6 +4140,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4023,17 +4149,17 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4046,15 +4172,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4066,6 +4183,35 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. However before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not complete yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'. + */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans, device, &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4080,6 +4226,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -4131,7 +4287,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -4289,7 +4445,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); - devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -4400,8 +4556,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ if (stripe_size * data_stripes > max_chunk_size) { u64 mask = (1ULL << 24) - 1; - stripe_size = max_chunk_size; - do_div(stripe_size, data_stripes); + + stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ stripe_size = (stripe_size + mask) & ~mask; @@ -4413,10 +4569,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - do_div(stripe_size, dev_stripes); + stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, raid_stripe_len); + stripe_size = div_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4626,6 +4782,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; + ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex)); chunk_offset = find_next_chunk(extent_root->fs_info); return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } @@ -4954,7 +5111,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - int stripe_index; + u32 stripe_index; int i; int ret = 0; int num_stripes; @@ -4995,7 +5152,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, stripe_len); + stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); @@ -5011,7 +5168,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* allow a write of a full stripe, but make sure we don't * allow straddling of stripes */ - do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); raid56_full_stripe_start *= full_stripe_len; } @@ -5136,7 +5294,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_nr_orig = stripe_nr; stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - do_div(stripe_nr_end, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); @@ -5144,7 +5302,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { @@ -5170,9 +5329,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; + u32 factor = map->num_stripes / map->sub_stripes; - stripe_index = do_div(stripe_nr, factor); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) @@ -5198,8 +5357,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len * nr_data_stripes(map)); + stripe_nr = div_u64(raid56_full_stripe_start, + stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5209,32 +5368,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_offset = 0; } else { - u64 tmp; - /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. */ - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64_rem(stripe_nr, + nr_data_stripes(map), &stripe_index); if (mirror_num > 1) stripe_index = nr_data_stripes(map) + mirror_num - 2; /* We distribute the parity blocks across stripes */ - tmp = stripe_nr + stripe_index; - stripe_index = do_div(tmp, map->num_stripes); + div_u64_rem(stripe_nr + stripe_index, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this do_div call, stripe_nr is the number of stripes - * on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array */ - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); @@ -5261,7 +5420,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { u64 tmp; - int i, rot; + unsigned rot; bbio->raid_map = (u64 *)((void *)bbio->stripes + sizeof(struct btrfs_bio_stripe) * @@ -5269,8 +5428,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, sizeof(int) * tgtdev_indexes); /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); + div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ tmp = stripe_nr * nr_data_stripes(map); @@ -5285,8 +5443,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; + u32 factor = 0; + u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; @@ -5437,9 +5595,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - u64 length = map->stripe_len; - - if (physical_of_found + length <= + if (physical_of_found + map->stripe_len <= dev_replace->cursor_left) { struct btrfs_bio_stripe *tgtdev_stripe = bbio->stripes + num_stripes; @@ -5535,15 +5691,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) - do_div(length, map->num_stripes / map->sub_stripes); + length = div_u64(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - do_div(length, map->num_stripes); + length = div_u64(length, map->num_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - do_div(length, nr_data_stripes(map)); + length = div_u64(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } - buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { @@ -5554,11 +5710,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, continue; stripe_nr = physical - map->stripes[i].physical; - do_div(stripe_nr, map->stripe_len); + stripe_nr = div_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; - do_div(stripe_nr, map->sub_stripes); + stripe_nr = div_u64(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; } /* else if RAID[56], multiply by nr_data_stripes(). @@ -5587,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; - struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5605,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; + struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5634,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5817,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Shoud be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5835,8 +5987,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 length = 0; u64 map_length; int ret; - int dev_nr = 0; - int total_devs = 1; + int dev_nr; + int total_devs; struct btrfs_bio *bbio = NULL; length = bio->bi_iter.bi_size; @@ -5877,11 +6029,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - while (dev_nr < total_devs) { + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); - dev_nr++; continue; } @@ -5894,22 +6045,18 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = breakup_stripe_bio(root, bbio, first_bio, dev, dev_nr, rw, async_submit); BUG_ON(ret); - dev_nr++; continue; } if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, async_submit); - dev_nr++; } btrfs_bio_counter_dec(root->fs_info); return 0; @@ -6082,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6201,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6732,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 83069dec6898..95842a909e7f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -292,8 +298,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; @@ -421,8 +425,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); @@ -538,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 883b93623bc5..6f518c90e1c1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -261,7 +261,7 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key, found_key; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { /* * Check if the attribute is in a supported namespace. * - * This applied after the check for the synthetic attributes in the system + * This is applied after the check for the synthetic attributes in the system * namespace. */ -static bool btrfs_is_valid_xattr(const char *name) +static int btrfs_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || - !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || - !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); + int len = strlen(name); + int prefixlen = 0; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) + prefixlen = XATTR_SECURITY_PREFIX_LEN; + else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + prefixlen = XATTR_SYSTEM_PREFIX_LEN; + else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + prefixlen = XATTR_TRUSTED_PREFIX_LEN; + else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + prefixlen = XATTR_USER_PREFIX_LEN; + else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + prefixlen = XATTR_BTRFS_PREFIX_LEN; + else + return -EOPNOTSUPP; + + /* + * The name cannot consist of just prefix + */ + if (len <= prefixlen) + return -EINVAL; + + return 0; } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + int ret; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, buffer, size); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + return __btrfs_getxattr(d_inode(dentry), name, buffer, size); } int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, NULL, 0, XATTR_REPLACE); - return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index fb22fd8d8fb8..82990b8f872b 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -403,7 +403,7 @@ next: return ret; } -struct btrfs_compress_op btrfs_zlib_compress = { +const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, |