diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/ctree.c | 16 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 7 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 12 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 21 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 5 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 49 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 12 |
7 files changed, 99 insertions, 23 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d92462fe66c8..f64aad613727 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1016,19 +1016,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; /* - * If we are COWing a node/leaf from the extent, chunk or device trees, - * make sure that we do not finish block group creation of pending block - * groups. We do this to avoid a deadlock. + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. * COWing can result in allocation of a new chunk, and flushing pending * block groups (btrfs_create_pending_block_groups()) can be triggered * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk and device trees, therefore we could - * deadlock with ourselves since we are holding a lock on an extent - * buffer that btrfs_create_pending_block_groups() may try to COW later. + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. */ if (root == fs_info->extent_root || root == fs_info->chunk_root || - root == fs_info->dev_root) + root == fs_info->dev_root || + root == fs_info->free_space_root) trans->can_flush_pending_bgs = false; cow = btrfs_alloc_tree_block(trans, root, parent_start, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0a68cf7032f5..7a2a2621f0d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; +struct btrfs_delayed_ref_root; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -786,6 +787,9 @@ enum { * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, }; struct btrfs_fs_info { @@ -2661,6 +2665,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8da2f380d3c0..6a2a2a951705 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1682,6 +1682,8 @@ static int cleaner_kthread(void *arg) while (1) { again = 0; + set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; @@ -1728,6 +1730,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) @@ -4201,6 +4204,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -4265,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b15afeae16df..d81035b7ea7d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2456,12 +2456,10 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; int nr_items = 1; /* Dropping this ref head update. */ if (head->total_ref_mod < 0) { @@ -2544,7 +2542,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -4954,6 +4952,15 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + mutex_lock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_run_delayed_iputs(fs_info); + mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = may_commit_transaction(fs_info, space_info); break; default: @@ -7188,7 +7195,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - cleanup_ref_head_accounting(trans, head); + btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43eb4535319d..5c349667c761 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,9 +3129,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } @@ -3254,6 +3251,8 @@ void btrfs_add_delayed_iput(struct inode *inode) ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fab9443f6a42..9c8e1734429c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3221,6 +3221,26 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } else if (inode1 == inode2 && loff2 < loff1) { + swap(loff1, loff2); + } + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { @@ -3242,11 +3262,12 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, return -EINVAL; /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; } @@ -3905,17 +3926,33 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { + const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); + ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) return ret; + /* + * We may have truncated the last block if the inode's size is + * not sector size aligned, so we need to wait for writeback to + * complete before proceeding further, otherwise we can race + * with cloning and attempt to increment a reference to an + * extent that no longer exists (writeback completed right after + * we found the previous extent covering eof and before we + * attempted to increment its reference count). + */ + ret = btrfs_wait_ordered_range(inode, wb_start, + destoff - wb_start); + if (ret) + return ret; } /* - * Lock destination range to serialize with concurrent readpages(). + * Lock destination range to serialize with concurrent readpages() and + * source range to serialize with relocation. */ - lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_lock(src, off, inode, destoff, len); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2576b1a379c9..3e4f8f88353e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7825,6 +7825,18 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + /* It's possible this device is a dummy for seed device */ + if (dev->disk_total_bytes == 0) { + dev = find_device(fs_info->fs_devices->seed, devid, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find seed devid %llu", + devid); + ret = -EUCLEAN; + goto out; + } + } + if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", |