diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 226 |
1 files changed, 150 insertions, 76 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 21f34ad0d411..07b5e6f7df67 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,10 +31,10 @@ #include <linux/uuid.h> #include <linux/semaphore.h> #include <linux/error-injection.h> +#include <linux/crc32c.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void) return 0; } -void btrfs_end_io_wq_exit(void) +void __cold btrfs_end_io_wq_exit(void) { kmem_cache_destroy(btrfs_end_io_wq_cache); } @@ -124,8 +124,8 @@ struct async_submit_bio { void *private_data; struct btrfs_fs_info *fs_info; struct bio *bio; - extent_submit_bio_hook_t *submit_bio_start; - extent_submit_bio_hook_t *submit_bio_done; + extent_submit_bio_start_t *submit_bio_start; + extent_submit_bio_done_t *submit_bio_done; int mirror_num; unsigned long bio_flags; /* @@ -270,7 +270,7 @@ out: u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, u8 *result) @@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; - const int csum_size = sizeof(crc); - char result[csum_size]; + char result[sizeof(crc)]; /* * The super_block structure does not span the whole @@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); - if (memcmp(raw_disk_sb, result, csum_size)) + if (memcmp(raw_disk_sb, result, sizeof(result))) ret = 1; } @@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } +static int verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key) +{ + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); +#endif + return -EIO; + } + + if (!first_key) + return 0; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + +#ifdef CONFIG_BTRFS_DEBUG + if (ret) { + WARN_ON(1); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", + eb->start, first_key->objectid, first_key->type, + first_key->offset, found_key.objectid, + found_key.type, found_key.offset); + } +#endif + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. + * + * @parent_transid: expected transid, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key of first slot, skip check if NULL */ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_io_tree *io_tree; int failed = 0; @@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (!verify_parent_transid(io_tree, eb, + if (verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; - else ret = -EIO; + else if (verify_level_key(fs_info, eb, level, + first_key)) + ret = -EUCLEAN; + else + break; } /* @@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, * there is no reason to read the other copies, they won't be * any less wrong. */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || + ret == -EUCLEAN) break; num_copies = btrfs_num_copies(fs_info, @@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && btrfs_check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(fs_info, eb)) ret = -EIO; if (!ret) @@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) -{ - unsigned long limit = min_t(unsigned long, - info->thread_pool_size, - info->fs_devices->open_devices); - return 256 * limit; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->private_data, async->bio, - async->mirror_num, async->bio_flags, async->bio_offset); if (ret) async->status = ret; @@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->private_data, async->bio, async->mirror_num, - async->bio_flags, async->bio_offset); + async->submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done) { struct async_submit_bio *async; @@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, +static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { /* @@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio return btree_csum_one_bio(bio); } -static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio, + int mirror_num) { struct inode *inode = private_data; blk_status_t ret; @@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, - __btree_submit_bio_start, - __btree_submit_bio_done); + btree_submit_bio_start, + btree_submit_bio_done); } if (ret) @@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) buf->start, buf->start + buf->len - 1); } +/* + * Read tree block at logical address @bytenr and do variant basic but critical + * verification. + * + * @parent_transid: expected transid of this tree block, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key in slot 0, skip check if NULL + */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; @@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); @@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); - atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_path *path; u64 generation; int ret; + int level; path = btrfs_alloc_path(); if (!path) @@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); + level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation); + generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); goto find_fail; @@ -1808,12 +1857,10 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))) btrfs_cleanup_transaction(fs_info); - set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); + schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } @@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { - int max_active = fs_info->thread_pool_size; + u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = @@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); + int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); @@ -2289,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1); + fs_info->generation + 1, + level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.offset = 0; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; @@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (IS_ERR(root)) { ret = PTR_ERR(root); if (ret != -ENOENT) - return ret; + goto out; } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->free_space_root = root; } return 0; +out: + btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", + location.objectid, ret); + return ret; } int open_ctree(struct super_block *sb, @@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - int max_active; int clear_free_space_tree = 0; + int level; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); + INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); + spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); @@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - max_active = fs_info->thread_pool_size; - ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb, } generation = btrfs_super_chunk_root_generation(disk_super); + level = btrfs_super_chunk_root_level(disk_super); __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { btrfs_err(fs_info, "failed to read chunk root"); @@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb, } /* - * keep the device that is marked to be the target device for the - * dev_replace procedure + * Keep the devid that is marked to be the target device for the + * device replace procedure */ - btrfs_close_extra_devices(fs_devices, 0); + btrfs_free_extra_devids(fs_devices, 0); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); @@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb, retry_root_backup: generation = btrfs_super_generation(disk_super); + level = btrfs_super_root_level(disk_super); tree_root->node = read_tree_block(fs_info, btrfs_super_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { btrfs_warn(fs_info, "failed to read tree root"); @@ -2834,7 +2897,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_devices, 1); + btrfs_free_extra_devids(fs_devices, 1); ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { @@ -2953,6 +3016,7 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", err); goto fail_qgroup; } @@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) struct buffer_head *bh; int i; int errors = 0; + bool primary_failed = false; u64 bytenr; if (max_mirrors == 0) @@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; + if (i == 0) + primary_failed = true; continue; } wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { errors++; + if (i == 0) + primary_failed = true; + } /* drop our reference */ brelse(bh); @@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) brelse(bh); } + /* log error, force error return */ + if (primary_failed) { + btrfs_err(device->fs_info, "error writing primary super block to device %llu", + device->devid); + return -1; + } + return errors < i ? 0 : -1; } @@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) * So here we should only check item pointers, not item data. */ if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(root, buf)) { + btrfs_check_leaf_relaxed(fs_info, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); } static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) @@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group dirty_bgs list"); - spin_unlock(&cur_trans->dirty_bgs_lock); - return; - } if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, io_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group on io_bgs list"); - return; - } list_del_init(&cache->io_list); spin_lock(&cache->lock); |