diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/ctree.h | 3 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 74 | ||||
-rw-r--r-- | fs/btrfs/file.c | 24 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 22 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 37 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 14 | ||||
-rw-r--r-- | fs/btrfs/qgroup.c | 3 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 1 | ||||
-rw-r--r-- | fs/btrfs/send.c | 11 | ||||
-rw-r--r-- | fs/btrfs/super.c | 7 | ||||
-rw-r--r-- | fs/btrfs/tree-checker.c | 2 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 17 |
12 files changed, 144 insertions, 71 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 80953528572d..68f322f600a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3163,6 +3163,9 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new, + struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0ab41da91d1..6d776717d8b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -477,9 +477,9 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, int mirror_num = 0; int failed_mirror = 0; - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { @@ -493,15 +493,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, break; } - /* - * This buffer's crc is fine, but its contents are corrupted, so - * there is no reason to read the other copies, they won't be - * any less wrong. - */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || - ret == -EUCLEAN) - break; - num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) @@ -1664,9 +1655,8 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; int again; - struct btrfs_trans_handle *trans; - do { + while (1) { again = 0; /* Make the cleaner go to sleep early. */ @@ -1715,42 +1705,16 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) + return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop()) - schedule(); + schedule(); __set_current_state(TASK_RUNNING); } - } while (!kthread_should_stop()); - - /* - * Transaction kthread is stopped before us and wakes us up. - * However we might have started a new transaction and COWed some - * tree blocks when deleting unused block groups for example. So - * make sure we commit the transaction we started to have a clean - * shutdown when evicting the btree inode - if it has dirty pages - * when we do the final iput() on it, eviction will trigger a - * writeback for it which will fail with null pointer dereferences - * since work queues and other resources were already released and - * destroyed by the time the iput/eviction/writeback is made. - */ - trans = btrfs_attach_transaction(root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) - btrfs_err(fs_info, - "cleaner transaction attach returned %ld", - PTR_ERR(trans)); - } else { - int ret; - - ret = btrfs_commit_transaction(trans); - if (ret) - btrfs_err(fs_info, - "cleaner open transaction commit returned %d", - ret); } - - return 0; } static int transaction_kthread(void *arg) @@ -3931,6 +3895,13 @@ void close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing. We can't use kthread_stop() yet + * because that frees the task_struct, and the transaction kthread might + * still try to wake up the cleaner. + */ + kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -3958,9 +3929,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) if (!sb_rdonly(fs_info->sb)) { /* - * If the cleaner thread is stopped and there are - * block groups queued for removal, the deletion will be - * skipped when we quit the cleaner thread. + * The cleaner kthread is stopped, so do one final pass over + * unused block groups. */ btrfs_delete_unused_bgs(fs_info); @@ -4359,13 +4329,23 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + /* + * The btrfs_finish_extent_commit() may get the same range as + * ours between find_first_extent_bit and clear_extent_dirty. + * Hence, hold the unused_bg_unpin_mutex to avoid double unpin + * the same extent range. + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } clear_extent_dirty(unpin, start, end); btrfs_error_unpin_extent_range(fs_info, start, end); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a3c22e16509b..58e93bce3036 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2089,6 +2089,30 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * Before we acquired the inode's lock, someone may have dirtied more + * pages in the target range. We need to make sure that writeback for + * any such pages does not start while we are logging the inode, because + * if it does, any of the following might happen when we are not doing a + * full inode sync: + * + * 1) We log an extent after its writeback finishes but before its + * checksums are added to the csum tree, leading to -EIO errors + * when attempting to read the extent after a log replay. + * + * 2) We can end up logging an extent before its writeback finishes. + * Therefore after the log replay we will have a file extent item + * pointing to an unwritten extent (and no data checksums as well). + * + * So trigger writeback for any eventual new dirty pages and then we + * wait for all ordered extents to complete below. + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { + inode_unlock(inode); + goto out; + } + + /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaciton open. */ diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4ba0aedc878b..74aa552f4793 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget(fs_info->sb, &location, root, NULL); + inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); + btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; @@ -838,6 +839,25 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path->search_commit_root = 1; path->skip_locking = 1; + /* + * We must pass a path with search_commit_root set to btrfs_iget in + * order to avoid a deadlock when allocating extents for the tree root. + * + * When we are COWing an extent buffer from the tree root, when looking + * for a free extent, at extent-tree.c:find_free_extent(), we can find + * block group without its free space cache loaded. When we find one + * we must load its space cache which requires reading its free space + * cache's inode item from the root tree. If this inode item is located + * in the same leaf that we started COWing before, then we end up in + * deadlock on the extent buffer (trying to read lock it when we + * previously write locked it). + * + * It's safe to read the inode item using the commit root because + * block groups, once loaded, stay in memory forever (until they are + * removed) as well as their space caches once loaded. New block groups + * once created get their ->cached field set to BTRFS_CACHE_FINISHED so + * we will never try to read their inode item while the fs is mounted. + */ inode = lookup_free_space_inode(fs_info, block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3df5b52278c..9ea4c6f0352f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1531,12 +1531,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) { + if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; - cur_offset = end; - } if (cow_start != (u64)-1) { + cur_offset = end; ret = cow_file_range(inode, locked_page, cow_start, end, end, page_started, nr_written, 1, NULL); if (ret) @@ -3570,10 +3569,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static int btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode, + struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_path *path; + struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3589,15 +3589,18 @@ static int btrfs_read_locked_inode(struct inode *inode) if (!ret) filled = true; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); return ret; } @@ -3722,7 +3725,8 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -5644,8 +5648,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s, /* Get an inode object given its location and corresponding root. * Returns in *is_new if the inode was read from disk */ -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) +struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new, + struct btrfs_path *path) { struct inode *inode; @@ -5656,7 +5661,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, if (inode->i_state & I_NEW) { int ret; - ret = btrfs_read_locked_inode(inode); + ret = btrfs_read_locked_inode(inode, path); if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5678,6 +5683,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return inode; } +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + return btrfs_iget_path(s, location, root, new, NULL); +} + static struct inode *new_simple_dir(struct super_block *s, struct btrfs_key *key, struct btrfs_root *root) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3ca6943827ef..802a628e9f7d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3488,6 +3488,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; len = round_down(i_size_read(src), sz) - loff; + if (len == 0) + return 0; olen = len; } } @@ -4257,9 +4259,17 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, goto out_unlock; if (len == 0) olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) + /* + * If we extend to eof, continue to block boundary if and only if the + * destination end offset matches the destination file's size, otherwise + * we would be corrupting data by placing the eof block into the middle + * of a file. + */ + if (off + len == src->i_size) { + if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) + goto out_unlock; len = ALIGN(src->i_size, bs) - off; + } if (len == 0) { ret = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 45868fd76209..f70825af6438 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2659,7 +2659,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int i; u64 *i_qgroups; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; @@ -2669,6 +2669,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..a3f75b8926d4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3959,6 +3959,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans); + trans = NULL; continue; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 094cc1444a90..5be83b5a1b43 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) kfree(m); } -static void tail_append_pending_moves(struct pending_dir_move *moves, +static void tail_append_pending_moves(struct send_ctx *sctx, + struct pending_dir_move *moves, struct list_head *stack) { if (list_empty(&moves->list)) { @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct pending_dir_move *moves, list_add_tail(&moves->list, stack); list_splice_tail(&list, stack); } + if (!RB_EMPTY_NODE(&moves->node)) { + rb_erase(&moves->node, &sctx->pending_dir_moves); + RB_CLEAR_NODE(&moves->node); + } } static int apply_children_dir_moves(struct send_ctx *sctx) @@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) return 0; INIT_LIST_HEAD(&stack); - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { pm = list_first_entry(&stack, struct pending_dir_move, list); @@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct send_ctx *sctx) goto out; pm = get_pending_dir_moves(sctx, parent_ino); if (pm) - tail_append_pending_moves(pm, &stack); + tail_append_pending_moves(sctx, pm, &stack); } return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b362b45dd757..645fc81e2a94 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1916,7 +1916,7 @@ restore: } /* Used to sort the devices by max_avail(descending sort) */ -static int btrfs_cmp_device_free_bytes(const void *dev_info1, +static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { if (((struct btrfs_device_info *)dev_info1)->max_avail > @@ -1945,8 +1945,8 @@ static inline void btrfs_descending_sort_devices( * The helper to calc the free space on the devices that can be used to store * file data. */ -static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, - u64 *free_bytes) +static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, + u64 *free_bytes) { struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2237,6 +2237,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; switch (cmd) { case BTRFS_IOC_SCAN_DEV: diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index cab0b1f1f741..efcf89a8ba44 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)) { block_group_err(fs_info, leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_SYSTEM, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e07f3376b7df..a5ce99a6c936 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4396,6 +4396,23 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { + /* + * Skip extents outside our logging range. It's important to do + * it for correctness because if we don't ignore them, we may + * log them before their ordered extent completes, and therefore + * we could log them without logging their respective checksums + * (the checksum items are added to the csum tree at the very + * end of btrfs_finish_ordered_io()). Also leave such extents + * outside of our range in the list, since we may have another + * ranged fsync in the near future that needs them. If an extent + * outside our range corresponds to a hole, log it to avoid + * leaving gaps between extents (fsck will complain when we are + * not using the NO_HOLES feature). + */ + if ((em->start > end || em->start + em->len <= start) && + em->block_start != EXTENT_MAP_HOLE) + continue; + list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive |