Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs update from Chris Mason: "From a feature point of view, most of the code here comes from Miao Xie and others at Fujitsu to implement scrubbing and replacing devices on raid56. This has been in development for a while, and it's a big improvement. Filipe and Josef have a great assortment of fixes, many of which solve problems corruptions either after a crash or in error conditions. I still have a round two from Filipe for next week that solves corruptions with discard and block group removal" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits) Btrfs: make get_caching_control unconditionally return the ctl Btrfs: fix unprotected deletion from pending_chunks list Btrfs: fix fs mapping extent map leak Btrfs: fix memory leak after block remove + trimming Btrfs: make btrfs_abort_transaction consider existence of new block groups Btrfs: fix race between writing free space cache and trimming Btrfs: fix race between fs trimming and block group remove/allocation Btrfs, replace: enable dev-replace for raid56 Btrfs: fix freeing used extents after removing empty block group Btrfs: fix crash caused by block group removal Btrfs: fix invalid block group rbtree access after bg is removed Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56 Btrfs, replace: write raid56 parity into the replace target device Btrfs, replace: write dirty pages into the replace target device Btrfs, raid56: support parity scrub on raid56 Btrfs, raid56: use a variant to record the operation type Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted Btrfs, raid56: don't change bbio and raid_map Btrfs: remove unnecessary code of stripe_index assignment in __btrfs_map_block Btrfs: remove noused bbio_ret in __btrfs_map_block in condition ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-12-12 22:15:23 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-12-12 22:15:23 +0300
commit: bdeb03cada1c305346505c48e5b1dab37e9acc4e (patch)
tree: ecbfda926e8b5b621f37150d509f176886ac0d82 /fs/btrfs/transaction.c
parent: 0349678ccd74d16c1f2bb58ecafec13ef7110e36 (diff)
parent: 9627aeee3e203e30679549e4962633698a6bf87f (diff)
download: linux-bdeb03cada1c305346505c48e5b1dab37e9acc4e.tar.xz
1 files changed, 144 insertions, 22 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	}
 }
 
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+	spin_lock(&tree->lock);
+	while (!RB_EMPTY_ROOT(&tree->state)) {
+		struct rb_node *node;
+		struct extent_state *state;
+
+		node = rb_first(&tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		rb_erase(&state->rb_node, &tree->state);
+		RB_CLEAR_NODE(&state->rb_node);
+		/*
+		 * btree io trees aren't supposed to have tasks waiting for
+		 * changes in the flags of extent states ever.
+		 */
+		ASSERT(!waitqueue_active(&state->wq));
+		free_extent_state(state);
+		if (need_resched()) {
+			spin_unlock(&tree->lock);
+			cond_resched();
+			spin_lock(&tree->lock);
+		}
+	}
+	spin_unlock(&tree->lock);
+}
+
 static noinline void switch_commit_roots(struct btrfs_transaction *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
 		root->commit_root = btrfs_root_node(root);
 		if (is_fstree(root->objectid))
 			btrfs_unpin_free_ino(root);
+		clear_btree_io_tree(&root->dirty_log_pages);
 	}
 	up_write(&fs_info->commit_root_sem);
 }
@@ -220,6 +247,7 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
+	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
 	h->sync = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
+	INIT_LIST_HEAD(&h->ordered);
 
 	smp_mb();
 	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
+	if (!list_empty(&trans->ordered)) {
+		spin_lock(&info->trans_lock);
+		list_splice(&trans->ordered, &cur_trans->pending_ordered);
+		spin_unlock(&info->trans_lock);
+	}
+
 	trans->delayed_ref_updates = 0;
 	if (!trans->sync) {
 		must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
 				      mark, &cached_state)) {
-		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-				   mark, &cached_state, GFP_NOFS);
-		cached_state = NULL;
-		err = filemap_fdatawrite_range(mapping, start, end);
+		bool wait_writeback = false;
+
+		err = convert_extent_bit(dirty_pages, start, end,
+					 EXTENT_NEED_WAIT,
+					 mark, &cached_state, GFP_NOFS);
+		/*
+		 * convert_extent_bit can return -ENOMEM, which is most of the
+		 * time a temporary error. So when it happens, ignore the error
+		 * and wait for writeback of this range to finish - because we
+		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+		 * to btrfs_wait_marked_extents() would not know that writeback
+		 * for this range started and therefore wouldn't wait for it to
+		 * finish - we don't want to commit a superblock that points to
+		 * btree nodes/leafs for which writeback hasn't finished yet
+		 * (and without errors).
+		 * We cleanup any entries left in the io tree when committing
+		 * the transaction (through clear_btree_io_tree()).
+		 */
+		if (err == -ENOMEM) {
+			err = 0;
+			wait_writeback = true;
+		}
+		if (!err)
+			err = filemap_fdatawrite_range(mapping, start, end);
 		if (err)
 			werr = err;
+		else if (wait_writeback)
+			werr = filemap_fdatawait_range(mapping, start, end);
+		free_extent_state(cached_state);
+		cached_state = NULL;
 		cond_resched();
 		start = end + 1;
 	}
-	if (err)
-		werr = err;
 	return werr;
 }
 
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
 				      EXTENT_NEED_WAIT, &cached_state)) {
-		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-				 0, 0, &cached_state, GFP_NOFS);
-		err = filemap_fdatawait_range(mapping, start, end);
+		/*
+		 * Ignore -ENOMEM errors returned by clear_extent_bit().
+		 * When committing the transaction, we'll remove any entries
+		 * left in the io tree. For a log commit, we don't remove them
+		 * after committing the log because the tree can be accessed
+		 * concurrently - we do it only at transaction commit time when
+		 * it's safe to do it (through clear_btree_io_tree()).
+		 */
+		err = clear_extent_bit(dirty_pages, start, end,
+				       EXTENT_NEED_WAIT,
+				       0, 0, &cached_state, GFP_NOFS);
+		if (err == -ENOMEM)
+			err = 0;
+		if (!err)
+			err = filemap_fdatawait_range(mapping, start, end);
 		if (err)
 			werr = err;
+		free_extent_state(cached_state);
+		cached_state = NULL;
 		cond_resched();
 		start = end + 1;
 	}
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	if (!trans || !trans->transaction) {
-		struct inode *btree_inode;
-		btree_inode = root->fs_info->btree_inode;
-		return filemap_write_and_wait(btree_inode->i_mapping);
-	}
-	return btrfs_write_and_wait_marked_extents(root,
+	int ret;
+
+	ret = btrfs_write_and_wait_marked_extents(root,
 					   &trans->transaction->dirty_pages,
 					   EXTENT_DIRTY);
+	clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+	return ret;
 }
 
 /*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 		btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+			   struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&root->fs_info->trans_lock);
+	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
 		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_wait_delalloc_flush(root->fs_info);
 
+	btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
 	btrfs_scrub_pause(root);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	/*
-	 * Since the transaction is done, we should set the inode map cache flag
-	 * before any other comming transaction.
+	 * Since the transaction is done, we can apply the pending changes
+	 * before the next transaction.
 	 */
-	if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
-		btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
-	else
-		btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+	btrfs_apply_pending_changes(root->fs_info);
 
 	/* commit_fs_roots gets rid of all the tree log roots, it is now
 	 * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 
 	return (ret < 0) ? 0 : 1;
 }
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+	unsigned long prev;
+	unsigned long bit;
+
+	prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+	if (!prev)
+		return;
+
+	bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+	if (prev & bit)
+		btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+	prev &= ~bit;
+
+	bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+	if (prev & bit)
+		btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+	prev &= ~bit;
+
+	bit = 1 << BTRFS_PENDING_COMMIT;
+	if (prev & bit)
+		btrfs_debug(fs_info, "pending commit done");
+	prev &= ~bit;
+
+	if (prev)
+		btrfs_warn(fs_info,
+			"unknown pending changes left 0x%lx, ignoring", prev);
+}
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-12 22:15:23 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-12 22:15:23 +0300
commit	bdeb03cada1c305346505c48e5b1dab37e9acc4e (patch)
tree	ecbfda926e8b5b621f37150d509f176886ac0d82 /fs/btrfs/transaction.c
parent	0349678ccd74d16c1f2bb58ecafec13ef7110e36 (diff)
parent	9627aeee3e203e30679549e4962633698a6bf87f (diff)
download	linux-bdeb03cada1c305346505c48e5b1dab37e9acc4e.tar.xz