Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/backref.c          | 17
-rw-r--r--  fs/btrfs/extent-tree.c      | 51
-rw-r--r--  fs/btrfs/extent_io.c        | 19
-rw-r--r--  fs/btrfs/free-space-cache.c | 16
-rw-r--r--  fs/btrfs/ordered-data.c     | 14
-rw-r--r--  fs/btrfs/volumes.c          |  1
6 files changed, 107 insertions, 11 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9de772ee0031..614aaa1969bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -880,6 +880,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  * indirect refs to their parent bytenr.
  * When roots are found, they're added to the roots list
  *
+ * NOTE: This can return values > 0
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -1198,6 +1200,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/**
+ * btrfs_check_shared - tell us whether an extent is shared
+ *
+ * @trans: optional trans handle
+ *
+ * btrfs_check_shared uses the backref walking code but will short
+ * circuit as soon as it finds a root or inode that doesn't match the
+ * one passed in. This provides a significant performance benefit for
+ * callers (such as fiemap) which want to know whether the extent is
+ * shared but do not need a ref count.
+ *
+ * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
+ */
 int btrfs_check_shared(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info, u64 root_objectid,
 		       u64 inum, u64 bytenr)
@@ -1226,11 +1241,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
 					roots, NULL, root_objectid, inum);
 		if (ret == BACKREF_FOUND_SHARED) {
+			/* this is the only condition under which we return 1 */
			ret = 1;
 			break;
 		}
 		if (ret < 0 && ret != -ENOENT)
 			break;
+		ret = 0;
 		node = ulist_next(tmp, &uiter);
 		if (!node)
 			break;
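The docstring added above defines a three-way return convention, and the new "ret = 0" resets any stray positive value coming back from find_parent_nodes() between iterations. A caller is expected to consume the result as in the following fragment, modeled on the fiemap case the comment mentions; ret, root, inode, bytenr and flags stand in for the caller's own state:

	ret = btrfs_check_shared(NULL, root->fs_info, root->objectid,
				 btrfs_ino(inode), bytenr);
	if (ret < 0)
		goto out;			/* real error, propagate it */
	if (ret)
		flags |= FIEMAP_EXTENT_SHARED;	/* ret == 1: extent is shared */
	ret = 0;				/* don't leak the positive value */
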
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec8e228b89f..0ec3acd14cbf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3180,8 +3180,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 fail:
 	btrfs_release_path(path);
-	if (ret)
-		btrfs_abort_transaction(trans, root, ret);
 	return ret;
 
 }
@@ -3487,8 +3485,30 @@ again:
 			ret = 0;
 		}
 	}
-	if (!ret)
+	if (!ret) {
 		ret = write_one_cache_group(trans, root, path, cache);
+		/*
+		 * Our block group might still be attached to the list
+		 * of new block groups in the transaction handle of some
+		 * other task (struct btrfs_trans_handle->new_bgs). This
+		 * means its block group item isn't yet in the extent
+		 * tree. If this happens ignore the error, as we will
+		 * try again later in the critical section of the
+		 * transaction commit.
+		 */
+		if (ret == -ENOENT) {
+			ret = 0;
+			spin_lock(&cur_trans->dirty_bgs_lock);
+			if (list_empty(&cache->dirty_list)) {
+				list_add_tail(&cache->dirty_list,
+					      &cur_trans->dirty_bgs);
+				btrfs_get_block_group(cache);
+			}
+			spin_unlock(&cur_trans->dirty_bgs_lock);
+		} else if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+		}
+	}
 
 	/* if its not on the io list, we need to put the block group */
 	if (should_put)
@@ -3597,8 +3617,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			ret = 0;
 		}
 	}
-	if (!ret)
+	if (!ret) {
 		ret = write_one_cache_group(trans, root, path, cache);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+	}
 
 	/* if its not on the io list, we need to put the block group */
 	if (should_put)
@@ -8806,6 +8829,24 @@ again:
 		goto again;
 	}
 
+	/*
+	 * if we are changing raid levels, try to allocate a corresponding
+	 * block group with the new raid level.
+	 */
+	alloc_flags = update_block_group_flags(root, cache->flags);
+	if (alloc_flags != cache->flags) {
+		ret = do_chunk_alloc(trans, root, alloc_flags,
+				     CHUNK_ALLOC_FORCE);
+		/*
+		 * ENOSPC is allowed here, we may have enough space
+		 * already allocated at the new raid level to
+		 * carry on
+		 */
+		if (ret == -ENOSPC)
+			ret = 0;
+		if (ret < 0)
+			goto out;
+	}
+
 	ret = set_block_group_ro(cache, 0);
 	if (!ret)
@@ -8819,7 +8860,9 @@ again:
 out:
 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 		alloc_flags = update_block_group_flags(root, cache->flags);
+		lock_chunks(root->fs_info->chunk_root);
 		check_system_chunk(trans, root, alloc_flags);
+		unlock_chunks(root->fs_info->chunk_root);
 	}
 	mutex_unlock(&root->fs_info->ro_block_group_mutex);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 43af5a61ad25..c32d226bfecc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4772,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 			       start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
+		/*
+		 * Lock our eb's refs_lock to avoid races with
+		 * free_extent_buffer. When we get our eb it might be flagged
+		 * with EXTENT_BUFFER_STALE and another task running
+		 * free_extent_buffer might have seen that flag set,
+		 * eb->refs == 2, that the buffer isn't under IO (dirty and
+		 * writeback flags not set) and it's still in the tree (flag
+		 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+		 * of decrementing the extent buffer's reference count twice.
+		 * So here we could race and increment the eb's reference count,
+		 * clear its stale flag, mark it as dirty and drop our reference
+		 * before the other task finishes executing free_extent_buffer,
+		 * which would later result in an attempt to free an extent
+		 * buffer that is dirty.
+		 */
+		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+			spin_lock(&eb->refs_lock);
+			spin_unlock(&eb->refs_lock);
+		}
 		mark_extent_buffer_accessed(eb, NULL);
 		return eb;
 	}
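The find_extent_buffer() change depends on an empty lock/unlock pair acting as a barrier: taking refs_lock and dropping it immediately does no work by itself, but it cannot complete until a free_extent_buffer() that already holds the lock has finished deciding the stale buffer's fate. The same idiom, reduced to a runnable userspace sketch; this is plain pthreads, not btrfs code, and every name in it is illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int decision_made;	/* written only under "lock" */

/* Stands in for free_extent_buffer(): decides under the lock. */
static void *releaser(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	/* ... inspect flags/refcount, decide whether to free ... */
	decision_made = 1;
	pthread_mutex_unlock(&lock);
	return NULL;
}

/*
 * Stands in for find_extent_buffer(): the empty lock/unlock pair
 * cannot complete while releaser() still holds the lock, so code
 * after it is ordered after any in-flight release decision.
 */
static void *reuser(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	pthread_mutex_unlock(&lock);
	/* safe to clear the stale flag and reuse the buffer here */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, releaser, NULL);
	pthread_create(&b, NULL, reuser, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("decision_made=%d\n", decision_made);
	return 0;
}

The mutex in reuser() protects no data at all; its only purpose is to order the reuse after any in-flight release decision, which is exactly what the refs_lock lock/unlock in the patch achieves.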
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41c510b7cc11..9dbe5b548fa6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -86,7 +86,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 
 	mapping_set_gfp_mask(inode->i_mapping,
 			mapping_gfp_mask(inode->i_mapping) &
-			~(GFP_NOFS & ~__GFP_HIGHMEM));
+			~(__GFP_FS | __GFP_HIGHMEM));
 
 	return inode;
 }
@@ -3466,6 +3466,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	int ret;
 	struct btrfs_io_ctl io_ctl;
+	bool release_metadata = true;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
@@ -3473,11 +3474,20 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	memset(&io_ctl, 0, sizeof(io_ctl));
 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
 				      trans, path, 0);
-	if (!ret)
+	if (!ret) {
+		/*
+		 * At this point writepages() didn't error out, so our metadata
+		 * reservation is released when the writeback finishes, at
+		 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+		 * with or without an error.
+		 */
+		release_metadata = false;
 		ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+	}
 
 	if (ret) {
-		btrfs_delalloc_release_metadata(inode, inode->i_size);
+		if (release_metadata)
+			btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
 		btrfs_err(root->fs_info,
 			"failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 157cc54fc634..760c4a5e096b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	int ret = 0;
+	int ret_wb = 0;
 	u64 end;
 	u64 orig_end;
 	struct btrfs_ordered_extent *ordered;
@@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	if (ret)
 		return ret;
 
-	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
-	if (ret)
-		return ret;
+	/*
+	 * If we have a writeback error don't return immediately. Wait first
+	 * for any ordered extents that haven't completed yet. This is to make
+	 * sure no one can dirty the same page ranges and call writepages()
+	 * before the ordered extents complete - to avoid failures (-EEXIST)
+	 * when adding the new ordered extents to the ordered tree.
+	 */
+	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 
 	end = orig_end;
 	while (1) {
@@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 			break;
 		end--;
 	}
-	return ret;
+	return ret_wb ? ret_wb : ret;
 }
 
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 96aebf3bcd5b..174f5e1e00ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4625,6 +4625,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 {
 	u64 chunk_offset;
 
+	ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
 	chunk_offset = find_next_chunk(extent_root->fs_info);
 	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
 }
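The ASSERT added to btrfs_alloc_chunk() encodes the same locking rule the extent-tree.c hunk enforces around check_system_chunk(): fs_info->chunk_mutex must be held across chunk allocation. A hypothetical caller would therefore take the shape of this fragment, where lock_chunks()/unlock_chunks() are the existing helpers wrapping chunk_mutex, and trans, extent_root and flags are placeholders for caller state:

	/* Without lock_chunks() here, the new ASSERT would fire. */
	lock_chunks(extent_root);	/* mutex_lock(&fs_info->chunk_mutex) */
	ret = btrfs_alloc_chunk(trans, extent_root, flags);
	unlock_chunks(extent_root);
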
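Finally, the btrfs_wait_ordered_range() change is an instance of a more general error-handling shape: latch the first error, keep doing the waiting that must happen regardless, and only then report, giving the earlier error precedence. A minimal runnable sketch of that shape; flush_all(), have_pending() and wait_one() are hypothetical stand-ins, not btrfs functions:

#include <stdio.h>

/* Hypothetical stand-ins: pretend flushing fails but waiting succeeds. */
static int pending = 3;
static int flush_all(void)    { return -5; /* e.g. -EIO */ }
static int have_pending(void) { return pending > 0; }
static int wait_one(void)     { pending--; return 0; }

/*
 * Shape of the btrfs_wait_ordered_range() fix: latch the writeback
 * error, keep waiting so nothing can redirty the range too early,
 * and only report the error once everything has drained.
 */
static int flush_and_wait(void)
{
	int ret = 0;
	int ret_wb;

	ret_wb = flush_all();		/* may fail; don't return yet */

	while (have_pending()) {
		int err = wait_one();
		if (err) {
			ret = err;	/* remember the wait error */
			break;
		}
	}

	/* the writeback error takes precedence, as in the patch */
	return ret_wb ? ret_wb : ret;
}

int main(void)
{
	printf("flush_and_wait() = %d\n", flush_and_wait());
	return 0;
}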