diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-12 22:15:23 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-12 22:15:23 +0300 |
commit | bdeb03cada1c305346505c48e5b1dab37e9acc4e (patch) | |
tree | ecbfda926e8b5b621f37150d509f176886ac0d82 /fs/btrfs/transaction.c | |
parent | 0349678ccd74d16c1f2bb58ecafec13ef7110e36 (diff) | |
parent | 9627aeee3e203e30679549e4962633698a6bf87f (diff) | |
download | linux-bdeb03cada1c305346505c48e5b1dab37e9acc4e.tar.xz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
"From a feature point of view, most of the code here comes from Miao
Xie and others at Fujitsu to implement scrubbing and replacing devices
on raid56. This has been in development for a while, and it's a big
improvement.
Filipe and Josef have a great assortment of fixes, many of which solve
problems corruptions either after a crash or in error conditions. I
still have a round two from Filipe for next week that solves
corruptions with discard and block group removal"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits)
Btrfs: make get_caching_control unconditionally return the ctl
Btrfs: fix unprotected deletion from pending_chunks list
Btrfs: fix fs mapping extent map leak
Btrfs: fix memory leak after block remove + trimming
Btrfs: make btrfs_abort_transaction consider existence of new block groups
Btrfs: fix race between writing free space cache and trimming
Btrfs: fix race between fs trimming and block group remove/allocation
Btrfs, replace: enable dev-replace for raid56
Btrfs: fix freeing used extents after removing empty block group
Btrfs: fix crash caused by block group removal
Btrfs: fix invalid block group rbtree access after bg is removed
Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56
Btrfs, replace: write raid56 parity into the replace target device
Btrfs, replace: write dirty pages into the replace target device
Btrfs, raid56: support parity scrub on raid56
Btrfs, raid56: use a variant to record the operation type
Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted
Btrfs, raid56: don't change bbio and raid_map
Btrfs: remove unnecessary code of stripe_index assignment in __btrfs_map_block
Btrfs: remove noused bbio_ret in __btrfs_map_block in condition
...
Diffstat (limited to 'fs/btrfs/transaction.c')
-rw-r--r-- | fs/btrfs/transaction.c | 166 |
1 files changed, 144 insertions, 22 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..a605d4e2f2bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } + spin_unlock(&tree->lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root->commit_root = btrfs_root_node(root); if (is_fstree(root->objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); } up_write(&fs_info->commit_root_sem); } @@ -220,6 +247,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -488,6 +516,7 @@ again: h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; return werr; } @@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { - clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } @@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return 0; } -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, &trans->transaction->dirty_pages, EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; } /* @@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we @@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } /* - * Since the transaction is done, we should set the inode map cache flag - * before any other comming transaction. + * Since the transaction is done, we can apply the pending changes + * before the next transaction. */ - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - else - btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + btrfs_apply_pending_changes(root->fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) return (ret < 0) ? 0 : 1; } + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = cmpxchg(&fs_info->pending_changes, 0, 0); + if (!prev) + return; + + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_COMMIT; + if (prev & bit) + btrfs_debug(fs_info, "pending commit done"); + prev &= ~bit; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} |