summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
authorJoel Stanley <joel@jms.id.au>2018-11-15 03:18:56 +0300
committerJoel Stanley <joel@jms.id.au>2018-11-15 03:19:24 +0300
commit62ccc3924eff37012bd0c227d8b7dc71188fc358 (patch)
tree48cf255186d6961afb2fce522101404ba3f22d00 /fs/btrfs
parentd769a1aa14d3d0e380151048b94aeb6919ace533 (diff)
parent6559b2338d96ec330887871d47e1ced520bc4f03 (diff)
downloadlinux-62ccc3924eff37012bd0c227d8b7dc71188fc358.tar.xz
Merge tag 'v4.18.19' into dev-4.18
This is the 4.18.19 stable release Signed-off-by: Joel Stanley <joel@jms.id.au>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/extent-tree.c168
-rw-r--r--fs/btrfs/file.c41
-rw-r--r--fs/btrfs/free-space-cache.c42
-rw-r--r--fs/btrfs/inode.c15
-rw-r--r--fs/btrfs/ioctl.c11
-rw-r--r--fs/btrfs/qgroup.c5
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/tree-log.c86
12 files changed, 297 insertions, 114 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc326df472e..4a7ae216977d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1054,9 +1054,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
+ /*
+ * If we are COWing a node/leaf from the extent, chunk or device trees,
+ * make sure that we do not finish block group creation of pending block
+ * groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk and device trees, therefore we could
+ * deadlock with ourselves since we are holding a lock on an extent
+ * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root)
+ trans->can_flush_pending_bgs = false;
+
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size);
+ trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d20b244623f2..e129a595f811 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -445,6 +445,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -487,6 +488,10 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_dev_replace_write_lock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -508,8 +513,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ab0bccfa281..e67de6a9805b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2490,6 +2490,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -3034,7 +3037,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -3051,7 +3053,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3082,7 +3083,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -4664,6 +4664,7 @@ again:
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4685,11 +4686,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -6581,6 +6580,7 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
@@ -7412,6 +7412,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
+ u64 max_free_space = 0;
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
@@ -7707,8 +7708,8 @@ unclustered_alloc:
spin_lock(&ctl->tree_lock);
if (ctl->free_space <
num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
+ max_free_space = max(max_free_space,
+ ctl->free_space);
spin_unlock(&ctl->tree_lock);
goto loop;
}
@@ -7877,6 +7878,8 @@ loop:
}
out:
if (ret == -ENOSPC) {
+ if (!max_extent_size)
+ max_extent_size = max_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = max_extent_size;
spin_unlock(&space_info->lock);
@@ -8158,21 +8161,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -8301,6 +8297,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
@@ -8938,15 +8947,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8954,6 +8962,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -9007,6 +9020,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -9772,6 +9787,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -10184,15 +10200,19 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ if (!trans->can_flush_pending_bgs)
+ return;
+
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10214,7 +10234,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -10869,14 +10889,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device,
u64 minlen, u64 *trimmed)
@@ -10886,6 +10908,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
@@ -10903,9 +10929,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
- return ret;
+ break;
- down_read(&fs_info->commit_root_sem);
+ ret = down_read_killable(&fs_info->commit_root_sem);
+ if (ret) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ break;
+ }
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
@@ -10913,13 +10943,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+
ret = find_free_dev_extent_start(trans, device, minlen, start,
&start, &len);
- if (trans)
+ if (trans) {
+ up_read(&fs_info->commit_root_sem);
btrfs_put_transaction(trans);
+ }
if (ret) {
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret == -ENOSPC)
ret = 0;
@@ -10927,7 +10961,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
}
ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -10947,6 +10980,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -10956,18 +10998,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
- while (cache) {
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
break;
@@ -10981,13 +11019,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
ret = wait_block_group_cache_done(cache);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
ret = btrfs_trim_block_group(cache,
@@ -10998,28 +11038,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
-
- cache = next_block_group(fs_info, cache);
}
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
- if (ret)
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
break;
+ }
trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
range->len = trimmed;
- return ret;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
}
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 51e77d72068a..22c2f38cd9b3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -534,6 +534,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
end_of_last_block = start_pos + num_bytes - 1;
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1504,18 +1512,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bit(&inode->io_tree, start_pos, last_pos,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state);
+
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
+ /*
+ * It's possible the pages are dirty right now, but we don't want
+ * to clean them yet because copy_from_user may catch a page fault
+ * and we might have to fall back to one page at a time. If that
+ * happens, we'll unlock these pages and we'd have a window where
+ * reclaim could sneak in and drop the once-dirty page on the floor
+ * without writing it.
+ *
+ * We have the pages locked and the extent range locked, so there's
+ * no way someone can start IO on any dirty pages in this range.
+ *
+ * We'll call btrfs_dirty_pages() later on, and that will flip around
+ * delalloc bits and dirty the pages as required.
+ */
for (i = 0; i < num_pages; i++) {
- if (clear_page_dirty_for_io(pages[i]))
- account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -2065,6 +2082,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2116,6 +2141,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = start_ordered_ops(inode, start, end);
}
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2171,6 +2197,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2189,6 +2216,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2210,6 +2238,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
/*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d5f80cb300be..a5f18333aa8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -10,6 +10,7 @@
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
+ unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
if (is_bad_inode(inode)) {
@@ -1686,6 +1694,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1769,6 +1779,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1790,8 +1807,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1809,8 +1826,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1822,8 +1839,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2447,6 +2466,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2455,6 +2475,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2683,8 +2704,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2721,8 +2742,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3736fbf6774..dc0f9d089b19 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -507,6 +507,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -2950,6 +2951,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3053,10 +3055,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3117,8 +3121,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -5293,11 +5302,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5310,7 +5321,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ef7159646615..c972920701a3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -496,7 +496,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -520,11 +519,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c25dc47210a3..7407f5a5d682 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2856,6 +2856,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3065,6 +3066,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
+ &BTRFS_I(inode)->root->fs_info->flags))
+ return 0;
+
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index d60dd06445ce..cad73ed7aebc 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -261,6 +261,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index be94c65bb4d2..5ee49b796815 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1321,7 +1321,7 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
- if (rc) {
+ if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
root->node->start);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ff5f6c719976..9ee0aca134fc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1930,6 +1930,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1939,9 +1942,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
@@ -2281,15 +2281,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 84b00a29d531..8b3f14a1adf0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -258,6 +258,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -2492,6 +2499,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2529,16 +2550,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2553,6 +2566,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -3209,9 +3225,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -4505,7 +4524,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
logged_start = start;
@@ -4586,7 +4604,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4784,7 +4801,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -5718,9 +5736,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5794,7 +5836,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}