From 908c7f1949cb7cc6e92ba8f18f2998e87e265b8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: add @gfp to percpu_counter_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_counter_init() so that !GFP_KERNEL allocation masks can be used with percpu_counters too. We could have left percpu_counter_init() alone and added percpu_counter_init_gfp(); however, the number of users isn't that high and introducing _gfp variants to all percpu data structures would be quite ugly, so let's just do the conversion. This is the one with the most users. Other percpu data structures are a lot easier to convert. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Jan Kara Acked-by: "David S. Miller" Cc: x86@kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andrew Morton --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..61dae01788d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1180,7 +1180,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2193,13 +2193,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; 
goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; -- cgit v1.2.3 From ff9ea323816dc1c8ac7144afd4eab3ac97704430 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 08:03:56 +0900 Subject: block, bdi: an active gendisk always has a request_queue associated with it bdev_get_queue() returns the request_queue associated with the specified block_device. blk_get_backing_dev_info() makes use of bdev_get_queue() to determine the associated bdi given a block_device. All the callers of bdev_get_queue() including blk_get_backing_dev_info() assume that bdev_get_queue() may return NULL and implement NULL handling; however, bdev_get_queue() requires the passed in block_device is opened and attached to its gendisk. Because an active gendisk always has a valid request_queue associated with it, bdev_get_queue() can never return NULL and neither can blk_get_backing_dev_info(). Make it clear that neither of the two functions can return NULL and remove NULL handling from all the callers. Signed-off-by: Tejun Heo Cc: Chris Mason Cc: Dave Chinner Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++------- block/compat_ioctl.c | 4 ---- block/ioctl.c | 4 ---- fs/block_dev.c | 2 -- fs/btrfs/disk-io.c | 2 +- fs/xfs/xfs_buf.c | 2 -- include/linux/blkdev.h | 2 +- 7 files changed, 5 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/block/blk-core.c b/block/blk-core.c index 93603e6ff479..817446175489 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -83,18 +83,14 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info - * - * Will return NULL if the request queue cannot be located. + * backing_dev_info. 
This function can only be called if @bdev is opened + * and the return value is never NULL. */ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct backing_dev_info *ret = NULL; struct request_queue *q = bdev_get_queue(bdev); - if (q) - ret = &q->backing_dev_info; - return ret; + return &q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 18b282ce361e..f678c733df40 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -709,8 +709,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return compat_put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: /* compatible */ @@ -731,8 +729,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: diff --git a/block/ioctl.c b/block/ioctl.c index d6cda8147c91..6c7bf903742f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -356,8 +356,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); @@ -386,8 +384,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - return -ENOTTY; bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..d3251eca6429 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1173,8 
+1173,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e664f7d..39ff591ae1b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1694,7 +1694,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca9b064..497fcde381d7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1678,8 +1678,6 @@ xfs_alloc_buftarg( btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; btp->bt_bdi = blk_get_backing_dev_info(bdev); - if (!btp->bt_bdi) - goto error; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 518b46555b80..e267bf0db559 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -865,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; + return bdev->bd_disk->queue; /* this is never NULL */ } /* -- cgit v1.2.3 From 57cdc8db21bf9cfa6b2e45310d56e74e263e8609 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2014 02:37:48 +0100 Subject: btrfs: cleanup ino cache members of btrfs_root The naming is confusing, generic yet used for a specific cache. Add a prefix 'ino_' or rename appropriately. 
Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 10 +++---- fs/btrfs/disk-io.c | 6 ++-- fs/btrfs/free-space-cache.c | 14 +++++----- fs/btrfs/inode-map.c | 68 ++++++++++++++++++++++----------------------- fs/btrfs/ioctl.c | 6 ++-- 5 files changed, 52 insertions(+), 52 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8e29b614fe93..a835a548e47e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1776,12 +1776,12 @@ struct btrfs_root { /* free ino cache stuff */ struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type cached; - spinlock_t cache_lock; - wait_queue_head_t cache_wait; + enum btrfs_caching_type ino_cache_state; + spinlock_t ino_cache_lock; + wait_queue_head_t ino_cache_wait; struct btrfs_free_space_ctl *free_ino_pinned; - u64 cache_progress; - struct inode *cache_inode; + u64 ino_cache_progress; + struct inode *ino_cache_inode; struct mutex log_mutex; wait_queue_head_t log_writer_wait; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a1d36e62179c..354cc3f232bb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1573,8 +1573,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) root->subv_writers = writers; btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); + spin_lock_init(&root->ino_cache_lock); + init_waitqueue_head(&root->ino_cache_wait); ret = get_anon_bdev(&root->anon_dev); if (ret) @@ -3532,7 +3532,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, static void free_fs_root(struct btrfs_root *root) { - iput(root->cache_inode); + iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2b0a627cb5f9..f181c9afe5f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ 
-3033,10 +3033,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, { struct inode *inode = NULL; - spin_lock(&root->cache_lock); - if (root->cache_inode) - inode = igrab(root->cache_inode); - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_inode) + inode = igrab(root->ino_cache_inode); + spin_unlock(&root->ino_cache_lock); if (inode) return inode; @@ -3044,10 +3044,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, if (IS_ERR(inode)) return inode; - spin_lock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); if (!btrfs_fs_closing(root->fs_info)) - root->cache_inode = igrab(inode); - spin_unlock(&root->cache_lock); + root->ino_cache_inode = igrab(inode); + spin_unlock(&root->ino_cache_lock); return inode; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 888fbe19079f..83d646bd2e4b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -87,7 +87,7 @@ again: */ btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); - root->cache_progress = last; + root->ino_cache_progress = last; up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; @@ -106,7 +106,7 @@ again: if (last != (u64)-1 && last + 1 != key.objectid) { __btrfs_add_free_space(ctl, last + 1, key.objectid - last - 1); - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); } last = key.objectid; @@ -119,14 +119,14 @@ next: root->highest_objectid - last - 1); } - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); - root->cache_progress = (u64)-1; + root->ino_cache_progress = (u64)-1; btrfs_unpin_free_ino(root); out: - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root) if 
(!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_NO) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_NO) { + spin_unlock(&root->ino_cache_lock); return; } - root->cached = BTRFS_CACHE_STARTED; - spin_unlock(&root->cache_lock); + root->ino_cache_state = BTRFS_CACHE_STARTED; + spin_unlock(&root->ino_cache_lock); ret = load_free_ino_cache(root->fs_info, root); if (ret == 1) { - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); return; } @@ -196,11 +196,11 @@ again: start_caching(root); - wait_event(root->cache_wait, - root->cached == BTRFS_CACHE_FINISHED || + wait_event(root->ino_cache_wait, + root->ino_cache_state == BTRFS_CACHE_FINISHED || root->free_ino_ctl->free_space > 0); - if (root->cached == BTRFS_CACHE_FINISHED && + if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; else @@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; again: - if (root->cached == BTRFS_CACHE_FINISHED) { + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(pinned, objectid, 1); } else { down_write(&root->fs_info->commit_root_sem); - spin_lock(&root->cache_lock); - if (root->cached == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->ino_cache_lock); up_write(&root->fs_info->commit_root_sem); goto again; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); start_caching(root); @@ -235,10 +235,10 @@ again: } /* - * When a transaction is committed, we'll move those inode numbers which - * are 
smaller than root->cache_progress from pinned tree to free_ino tree, - * and others will just be dropped, because the commit root we were - * searching has changed. + * When a transaction is committed, we'll move those inode numbers which are + * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and + * others will just be dropped, because the commit root we were searching has + * changed. * * Must be called with root->fs_info->commit_root_sem held */ @@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ - if (info->offset > root->cache_progress) + if (info->offset > root->ino_cache_progress) goto free; - else if (info->offset + info->bytes > root->cache_progress) - count = root->cache_progress - info->offset + 1; + else if (info->offset + info->bytes > root->ino_cache_progress) + count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; @@ -462,13 +462,13 @@ again: } } - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_FINISHED) { + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { ret = -1; - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); goto out_put; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); spin_lock(&ctl->tree_lock); prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8a8e29878c34..091c4d35671b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2526,9 +2526,9 @@ out_unlock: ASSERT(dest->send_in_progress == 0); /* the last ref */ - if (dest->cache_inode) { - iput(dest->cache_inode); - dest->cache_inode = NULL; + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; } } out_dput: -- cgit v1.2.3 From 3abdbd780e9d75f0648b8a502c3789857b1e92ce Mon Sep 17 00:00:00 2001 From: David Sterba 
Date: Wed, 4 Jun 2014 18:10:45 +0200 Subject: btrfs: make close_ctree return void There's no user of the return value and we can get rid of the comment in put_super. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +--- fs/btrfs/disk-io.h | 2 +- fs/btrfs/super.c | 8 +------- 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 354cc3f232bb..ec32bead96a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3623,7 +3623,7 @@ int btrfs_commit_super(struct btrfs_root *root) return btrfs_commit_transaction(trans, root); } -int close_ctree(struct btrfs_root *root) +void close_ctree(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3711,8 +3711,6 @@ int close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; - - return 0; } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 23ce3ceba0a9..52a17db700fc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -56,7 +56,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -int close_ctree(struct btrfs_root *root); +void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c4124de4435b..568ddc16119f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -307,13 +307,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? 
->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ + close_ctree(btrfs_sb(sb)->tree_root); } enum { -- cgit v1.2.3 From 707e8a071528385a87b63a72a37c2322e463c7b8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 Jun 2014 19:22:26 +0200 Subject: btrfs: use nodesize everywhere, kill leafsize The nodesize and leafsize were never of different values. Unify the usage and make nodesize the one. Cleanup the redundant checks and helpers. Shaves a few bytes from .text: text data bss dec hex filename 852418 24560 23112 900090 dbbfa btrfs.ko.before 851074 24584 23112 898770 db6d2 btrfs.ko.after Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 ++--- fs/btrfs/check-integrity.c | 13 -------- fs/btrfs/ctree.c | 18 +++++------ fs/btrfs/ctree.h | 21 +++---------- fs/btrfs/disk-io.c | 74 +++++++++++++++++++++------------------------- fs/btrfs/extent-tree.c | 36 +++++++++++----------- fs/btrfs/file.c | 2 +- fs/btrfs/ioctl.c | 6 ++-- fs/btrfs/print-tree.c | 2 +- fs/btrfs/qgroup.c | 6 ++-- fs/btrfs/reada.c | 2 +- fs/btrfs/relocation.c | 21 +++++++------ fs/btrfs/scrub.c | 17 +---------- fs/btrfs/transaction.c | 2 +- fs/btrfs/tree-log.c | 2 +- 15 files changed, 89 insertions(+), 141 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index cfe8566e6e33..4de97926939e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -482,7 +482,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, continue; BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, - fs_info->tree_root->leafsize, 0); + fs_info->tree_root->nodesize, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -991,8 +991,8 @@ again: ref->level == 0) { u32 bsz; struct 
extent_buffer *eb; - bsz = btrfs_level_size(fs_info->extent_root, - ref->level); + + bsz = fs_info->extent_root->nodesize; eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { @@ -1366,7 +1366,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) - size = fs_info->extent_root->leafsize; + size = fs_info->extent_root->nodesize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce92ae30250f..d0690da3b150 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_leafsize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; @@ -3120,24 +3119,12 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize != root->leafsize) { - printk(KERN_INFO - "btrfsic: cannot handle nodesize %d != leafsize %d!\n", - root->nodesize, root->leafsize); - return -1; - } if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", root->nodesize, PAGE_CACHE_SIZE); return -1; } - if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { - printk(KERN_INFO - "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->leafsize, PAGE_CACHE_SIZE); - return -1; - } if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle sectorsize %d 
not being a multiple of PAGE_CACHE_SIZE %ld!\n", diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 44ee5d2e52a4..263145b27155 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1444,7 +1444,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - blocksize = btrfs_level_size(root, old_root->level); + blocksize = root->nodesize; old = read_tree_block(root, logical, blocksize, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); @@ -1651,7 +1651,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); + blocksize = root->nodesize; end_slot = parent_nritems; if (parent_nritems == 1) @@ -1872,7 +1872,7 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), + root->nodesize, btrfs_node_ptr_generation(parent, slot)); if (eb && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -2267,7 +2267,7 @@ static void reada_for_search(struct btrfs_root *root, node = path->nodes[level]; search = btrfs_node_blockptr(node, slot); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; eb = btrfs_find_tree_block(root, search, blocksize); if (eb) { free_extent_buffer(eb); @@ -2325,7 +2325,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); @@ -2461,7 +2461,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = 
btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { @@ -4282,13 +4282,13 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + right = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->leafsize); + root_add_used(root, root->nodesize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -5375,7 +5375,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); if (!tmp_buf) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a835a548e47e..6fc16d22d27d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -391,7 +391,7 @@ struct btrfs_header { sizeof(struct btrfs_header)) / \ sizeof(struct btrfs_key_ptr)) #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) @@ -474,7 +474,7 @@ struct btrfs_super_block { __le64 num_devices; __le32 sectorsize; __le32 nodesize; - __le32 leafsize; + __le32 __unused_leafsize; __le32 stripesize; __le32 sys_chunk_array_size; __le64 chunk_root_generation; @@ -1806,9 +1806,6 @@ struct btrfs_root { /* node allocations are done in nodesize units */ u32 nodesize; - /* leaf allocations are done in leafsize units */ - u32 leafsize; - u32 stripesize; u32 type; @@ -2995,8 +2992,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct 
btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, - leafsize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, @@ -3232,13 +3227,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) return sb->s_fs_info; } -static inline u32 btrfs_level_size(struct btrfs_root *root, int level) -{ - if (level == 0) - return root->leafsize; - return root->nodesize; -} - /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -3263,7 +3251,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2 * num_items; } @@ -3274,8 +3262,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - num_items; + return root->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ec32bead96a1..508bbee320f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,16 +1200,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) kfree(writers); } -static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, +static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, + struct 
btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; - root->leafsize = leafsize; root->stripesize = stripesize; root->state = 0; root->orphan_cleanup_state = 0; @@ -1295,7 +1293,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) root = btrfs_alloc_root(NULL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + __setup_root(4096, 4096, 4096, root, NULL, 1); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -1318,14 +1316,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + leaf = btrfs_alloc_free_block(trans, root, root->nodesize, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); @@ -1396,9 +1393,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, BTRFS_TREE_LOG_OBJECTID); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, + BTRFS_TREE_LOG_OBJECTID); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -1413,7 +1410,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, * updated (along with back refs to the log tree). 
*/ - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + leaf = btrfs_alloc_free_block(trans, root, root->nodesize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { @@ -1465,7 +1462,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); @@ -1498,9 +1495,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, goto alloc_fail; } - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, key->objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, key->objectid); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); @@ -1511,7 +1507,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + blocksize = root->nodesize; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node) { @@ -2143,7 +2139,6 @@ int open_ctree(struct super_block *sb, { u32 sectorsize; u32 nodesize; - u32 leafsize; u32 blocksize; u32 stripesize; u64 generation; @@ -2389,7 +2384,7 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - __setup_root(4096, 4096, 4096, 4096, tree_root, + __setup_root(4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); @@ -2469,19 +2464,22 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) != 
+ /* + * Leafsize and nodesize were always equal, this is only a sanity check. + */ + if (le32_to_cpu(disk_super->__unused_leafsize) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksizes don't match. node %d leaf %d\n", btrfs_super_nodesize(disk_super), - btrfs_super_leafsize(disk_super)); + le32_to_cpu(disk_super->__unused_leafsize)); err = -EINVAL; goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksize (%d) was too large\n", - btrfs_super_leafsize(disk_super)); + btrfs_super_nodesize(disk_super)); err = -EINVAL; goto fail_alloc; } @@ -2498,17 +2496,16 @@ int open_ctree(struct super_block *sb, * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ - if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); - fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* @@ -2516,7 +2513,7 @@ int open_ctree(struct super_block *sb, * extent buffers for the same range. 
It leads to corruptions */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != leafsize)) { + (sectorsize != nodesize)) { printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); @@ -2615,7 +2612,6 @@ int open_ctree(struct super_block *sb, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; - tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; tree_root->stripesize = stripesize; @@ -2642,12 +2638,11 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - blocksize = btrfs_level_size(tree_root, - btrfs_super_chunk_root_level(disk_super)); + blocksize = tree_root->nodesize; generation = btrfs_super_chunk_root_generation(disk_super); - __setup_root(nodesize, leafsize, sectorsize, stripesize, - chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + __setup_root(nodesize, sectorsize, stripesize, chunk_root, + fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), @@ -2684,8 +2679,7 @@ int open_ctree(struct super_block *sb, } retry_root_backup: - blocksize = btrfs_level_size(tree_root, - btrfs_super_root_level(disk_super)); + blocksize = tree_root->nodesize; generation = btrfs_super_generation(disk_super); tree_root->node = read_tree_block(tree_root, @@ -2859,9 +2853,7 @@ retry_root_backup: err = -EIO; goto fail_qgroup; } - blocksize = - btrfs_level_size(tree_root, - btrfs_super_log_root_level(disk_super)); + blocksize = tree_root->nodesize; log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { @@ -2869,7 +2861,7 @@ retry_root_backup: goto fail_qgroup; } - __setup_root(nodesize, leafsize, sectorsize, stripesize, + __setup_root(nodesize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(tree_root, bytenr, @@ -4008,8 +4000,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, 
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { eb = btrfs_find_tree_block(root, start, - root->leafsize); - start += root->leafsize; + root->nodesize); + start += root->nodesize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4d1b50d4dc5b..d52da9628f0a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -491,7 +491,7 @@ next: key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + - fs_info->tree_root->leafsize; + fs_info->tree_root->nodesize; else last = key.objectid + key.offset; @@ -765,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, * different */ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { - offset = root->leafsize; + offset = root->nodesize; metadata = 0; } @@ -799,13 +799,13 @@ again: path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == root->leafsize) + key.offset == root->nodesize) ret = 0; } if (ret) { key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; + key.offset = root->nodesize; btrfs_release_path(path); goto again; } @@ -2651,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); num_heads = heads_to_leaves(root, num_heads); if (num_heads > 1) - num_bytes += (num_heads - 1) * root->leafsize; + num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; global_rsv = &root->fs_info->global_block_rsv; @@ -3117,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); + num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, 1); @@ -4839,7 +4839,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info 
*fs_info) if (num_bytes * 3 > meta_used) num_bytes = div64_u64(meta_used, 3); - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4988,7 +4988,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * root->leafsize; + num_bytes = 3 * root->nodesize; ret = btrfs_qgroup_reserve(root, num_bytes); if (ret) return ret; @@ -5176,7 +5176,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5185,7 +5185,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (unlikely(ret)) { if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); goto out_fail; } @@ -5301,7 +5301,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { btrfs_qgroup_free(root, num_bytes + - dropped * root->leafsize); + dropped * root->nodesize); } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, @@ -7077,7 +7077,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); return -ENOMEM; } @@ -7086,7 +7086,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins, size); if (ret) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); btrfs_free_path(path); return ret; } @@ -7101,7 +7101,7 @@ static int alloc_reserved_tree_block(struct 
btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = root->leafsize; + num_bytes = root->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -7131,14 +7131,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->leafsize, 1); + ret = update_block_group(root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); return ret; } @@ -7417,7 +7417,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); + blocksize = root->nodesize; for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) @@ -7806,7 +7806,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; next = btrfs_find_tree_block(root, bytenr, blocksize); if (!next) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a9b56e32dd8d..033f04bac85b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1653,7 +1653,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b61801ac052a..d6e10d60f8ad 100644 --- a/fs/btrfs/ioctl.c +++ 
b/fs/btrfs/ioctl.c @@ -477,7 +477,7 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + leaf = btrfs_alloc_free_block(trans, root, root->nodesize, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); @@ -503,7 +503,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(&root_item, 0); @@ -3199,7 +3199,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); + buf = vmalloc(root->nodesize); if (!buf) return ret; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1591620bee3d..eb309855d5c8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -336,7 +336,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, btrfs_node_blockptr(c, i), - btrfs_level_size(root, level - 1), + root->nodesize, btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && level != 1) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ded5c601d916..2ce4ce7b47d8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2237,7 +2237,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; - int srcroot_level; srckey.objectid = srcid; srckey.type = BTRFS_ROOT_ITEM_KEY; @@ -2249,8 +2248,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, } rcu_read_lock(); - srcroot_level = btrfs_header_level(srcroot->node); - level_size = btrfs_level_size(srcroot, 
srcroot_level); + level_size = srcroot->nodesize; rcu_read_unlock(); } @@ -2566,7 +2564,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) - num_bytes = fs_info->extent_root->leafsize; + num_bytes = fs_info->extent_root->nodesize; else num_bytes = found.offset; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 20408c6b665a..b63ae20618fb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (!re) return NULL; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; re->logical = logical; re->blocksize = blocksize; re->top = *top; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b3329ad34522..2d221c46180c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1787,7 +1787,7 @@ again: btrfs_node_key_to_cpu(parent, next_key, slot + 1); old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); + blocksize = dest->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); if (level <= max_level) { @@ -1970,7 +1970,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = btrfs_level_size(root, i - 1); + blocksize = root->nodesize; eb = read_tree_block(root, bytenr, blocksize, ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -2544,8 +2544,7 @@ u64 calcu_metadata_size(struct reloc_control *rc, if (next->processed && (reserve || next != node)) break; - num_bytes += btrfs_level_size(rc->extent_root, - next->level); + num_bytes += rc->extent_root->nodesize; if (list_empty(&next->upper)) break; @@ -2679,7 +2678,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } - blocksize = btrfs_level_size(root, node->level); + blocksize = 
root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); if (!eb || !extent_buffer_uptodate(eb)) { @@ -2789,7 +2788,7 @@ static void __mark_block_processed(struct reloc_control *rc, u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { - blocksize = btrfs_level_size(rc->extent_root, node->level); + blocksize = rc->extent_root->nodesize; mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; @@ -2865,7 +2864,7 @@ static int reada_tree_block(struct reloc_control *rc, if (block->key.type == BTRFS_METADATA_ITEM_KEY) readahead_tree_block(rc->extent_root, block->bytenr, block->key.objectid, - rc->extent_root->leafsize); + rc->extent_root->nodesize); else readahead_tree_block(rc->extent_root, block->bytenr, block->key.objectid, block->key.offset); @@ -3313,7 +3312,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = rc->extent_root->leafsize; + block->key.objectid = rc->extent_root->nodesize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3640,7 +3639,7 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize = btrfs_level_size(rc->extent_root, 0); + u32 blocksize = rc->extent_root->nodesize; int ret = 0; int err = 0; @@ -3783,7 +3782,7 @@ next: } if (key.type == BTRFS_METADATA_ITEM_KEY && - key.objectid + rc->extent_root->leafsize <= + key.objectid + rc->extent_root->nodesize <= rc->search_start) { path->slots[0]++; goto next; @@ -3801,7 +3800,7 @@ next: rc->search_start = key.objectid + key.offset; else rc->search_start = key.objectid + - rc->extent_root->leafsize; + rc->extent_root->nodesize; memcpy(extent_key, &key, sizeof(key)); return 0; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 053dd000d4ef..4ae1c5feccbe 100644 --- 
a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -137,7 +137,6 @@ struct scrub_ctx { int pages_per_rd_bio; u32 sectorsize; u32 nodesize; - u32 leafsize; int is_dev_replace; struct scrub_wr_ctx wr_ctx; @@ -438,7 +437,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) } sctx->first_free = 0; sctx->nodesize = dev->dev_root->nodesize; - sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = dev->dev_root->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); @@ -1758,7 +1756,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) ++fail; - WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -2196,7 +2193,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -2487,7 +2483,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, btrfs_item_key_to_cpu(l, &key, slot); if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = root->leafsize; + bytes = root->nodesize; else bytes = key.offset; @@ -2910,17 +2906,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (btrfs_fs_closing(fs_info)) return -EINVAL; - /* - * check some assumptions - */ - if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - btrfs_err(fs_info, - "scrub: size assumption nodesize == leafsize (%d == %d) fails", - fs_info->chunk_root->nodesize, - fs_info->chunk_root->leafsize); - return -EINVAL; - } - if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum diff --git a/fs/btrfs/transaction.c 
b/fs/btrfs/transaction.c index 977717b45bf7..e336646508fe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (num_items > 0 && root != root->fs_info->chunk_root) { if (root->fs_info->quota_enabled && is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->leafsize; + qgroup_reserved = num_items * root->nodesize; ret = btrfs_qgroup_reserve(root, qgroup_reserved); if (ret) return ERR_PTR(ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f5000c0a87a..7b6d1428f033 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2157,7 +2157,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); + blocksize = root->nodesize; parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); -- cgit v1.2.3 From 29549aec76bd6f1fc8e1723ed5396d65073d6521 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 4 Jul 2014 17:59:06 +0800 Subject: Btrfs: print btrfs specific info for some fatal error cases Marc argued that if there are several btrfs filesystems mounted, users don't even know which filesystem hit a corruption error such as a generation verification failure. Since the @extent_buffer structure has a member @fs_info, let's output btrfs device info.
Reported-by: Marc MERLIN Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 508bbee320f6..d14847d05f31 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -348,9 +348,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited("parent transid verify failed on %llu wanted %llu " - "found %llu\n", - eb->start, parent_transid, btrfs_header_generation(eb)); + printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + eb->fs_info->sb->s_id, eb->start, + parent_transid, btrfs_header_generation(eb)); ret = 1; /* @@ -614,15 +614,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " "%llu %llu\n", - found_start, eb->start); + eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", - eb->start); + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } -- cgit v1.2.3 From 56094eecd32cbb80d098eee5a7cbd60f39f4b764 Mon Sep 17 00:00:00 2001 From: Andrey Utkin Date: Sat, 9 Aug 2014 14:51:15 +0300 Subject: btrfs: Drop stray check of fixup_workers creation The issue was introduced in a79b7d4b3e8118f265dcb4bdf9a572c392f02708, adding allocation of extent_workers, so this stray check is surely not meant to be a check of something else. 
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=82021 Reported-by: Maks Naumov Signed-off-by: Andrey Utkin Reviewed-by: Eric Sandeen Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d14847d05f31..38b295553544 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2601,7 +2601,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->fixup_workers && fs_info->extent_workers && + fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; -- cgit v1.2.3 From 82f70d62f7923cc43128e75ae85366f137055b76 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 8 Sep 2014 20:41:09 +0800 Subject: btrfs: remove the wrong comments These comments became wrong after c3c532 [bdi: add helper function for doing init and register of a bdi for a file system], so remove them. Signed-off-by: Li RongQing Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 38b295553544..dbd792754b27 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1704,10 +1704,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -/* - * If this fails, caller must call bdi_destroy() to get rid of the - * bdi again.
- */ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; -- cgit v1.2.3 From 935e5cc935bcbf9b3d0dd59fed7dbc0f2ebca6bc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:33 +0800 Subject: Btrfs: fix wrong disk size when writing super blocks total_size will be changed when resizing a device, and disk_total_size will be changed if resizing is successful. Meanwhile, the on-disk super blocks of the previous transaction might not be updated. Considering the consistency of the metadata in the previous transaction, We should use the size in the previous transaction to check if the super block is beyond the boundary of the device. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/dev-replace.c | 18 ++++++++++++++++++ fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/scrub.c | 3 ++- fs/btrfs/transaction.c | 2 ++ fs/btrfs/volumes.c | 40 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 18 ++++++++++++++++++ 7 files changed, 83 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e0033c843ce7..cb7f3fe9c9f6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; bh = __bread(superblock_bdev, dev_bytenr / 4096, BTRFS_SUPER_INFO_SIZE); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 72dc02e82945..7877b0fc6a8d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -168,6 +168,8 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->total_bytes; dev_replace->tgtdev->disk_total_bytes = 
dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; } @@ -329,6 +331,20 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + + /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, @@ -539,6 +555,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); tgt_device->total_bytes = src_device->total_bytes; tgt_device->disk_total_bytes = src_device->disk_total_bytes; + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dbd792754b27..0cd18b725554 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3127,7 +3127,8 @@ static int write_dev_supers(struct btrfs_device *device, for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) break; if (wait) { @@ -3444,7 +3445,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); 
btrfs_set_stack_device_total_bytes(dev_item, - dev->disk_total_bytes); + dev->commit_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 72c8981e7c0a..9d80e37044db 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2840,7 +2840,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e336646508fe..2f7c0bef4043 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1868,6 +1868,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); + btrfs_update_commit_device_size(root->fs_info); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1646659f2800..7b5c04259a6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->list); @@ -154,6 +155,7 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->resized_list); spin_lock_init(&dev->io_lock); @@ -2168,6 +2170,7 @@ int btrfs_init_new_device(struct btrfs_root *root, 
char *device_path) device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2364,6 +2367,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->sector_size = root->sectorsize; device->total_bytes = srcdev->total_bytes; device->disk_total_bytes = srcdev->disk_total_bytes; + ASSERT(list_empty(&srcdev->resized_list)); + device->commit_total_bytes = srcdev->commit_total_bytes; device->bytes_used = srcdev->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; @@ -2448,6 +2453,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, { struct btrfs_super_block *super_copy = device->dev_root->fs_info->super_copy; + struct btrfs_fs_devices *fs_devices; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -2457,12 +2463,17 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->is_tgtdev_for_dev_replace) return -EINVAL; + fs_devices = device->dev_root->fs_info->fs_devices; + btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &fs_devices->resized_devices); return btrfs_update_device(trans, device); } @@ -4011,8 +4022,11 @@ again: } lock_chunks(root); - device->disk_total_bytes = new_size; + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &root->fs_info->fs_devices->resized_devices); + /* Now btrfs_update_device() will change the on-disk size. 
*/ ret = btrfs_update_device(trans, device); if (ret) { @@ -5993,6 +6007,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); @@ -6520,3 +6535,26 @@ int btrfs_scratch_superblock(struct btrfs_device *device) return 0; } + +/* + * Update the size of all devices, which is used for writing out the + * super blocks. + */ +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *curr, *next; + + if (list_empty(&fs_devices->resized_devices)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + lock_chunks(fs_info->dev_root); + list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, + resized_list) { + list_del_init(&curr->resized_list); + curr->commit_total_bytes = curr->disk_total_bytes; + } + unlock_chunks(fs_info->dev_root); + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e15f2886d33e..b30d018fa359 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -87,6 +87,21 @@ struct btrfs_device { /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* + * size of the device on the current transaction + * + * This variant is update when committing the transaction, + * and protected by device_list_mutex + */ + u64 commit_total_bytes; + + /* + * used to manage the device which is resized + * + * It is protected by chunk_lock. 
+ */ + struct list_head resized_list; + /* for sending down flush barriers */ int nobarriers; struct bio *flush_bio; @@ -136,6 +151,7 @@ struct btrfs_fs_devices { struct mutex device_list_mutex; struct list_head devices; + struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; struct list_head list; @@ -402,4 +418,6 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, { btrfs_dev_stat_set(dev, index, 0); } + +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); #endif -- cgit v1.2.3 From ce7213c70c37e3a66bc0b50c45edcbfea505f62f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:34 +0800 Subject: Btrfs: fix wrong device bytes_used in the super block device->bytes_used will be changed when allocating a new chunk, and disk_total_size will be changed if resizing is successful. Meanwhile, the on-disk super blocks of the previous transaction might not be updated. Considering the consistency of the metadata in the previous transaction, we should use the size in the previous transaction to check if the super block is beyond the boundary of the device. Though it is not a big problem because we don't use it now, it is better to keep it consistent with the common metadata; maybe we will use it in the future.
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 3 +++ fs/btrfs/disk-io.c | 3 ++- fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 ++++ 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7877b0fc6a8d..1be03d85d267 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -172,6 +172,8 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; } dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; btrfs_init_dev_replace_tgtdev_for_resume(fs_info, @@ -558,6 +560,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->bytes_used = src_device->bytes_used; + tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cd18b725554..a224fb9b34a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3446,7 +3446,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, dev->commit_total_bytes); - btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c 
index 2f7c0bef4043..16d0c1b62b3e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1869,6 +1869,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, sizeof(*root->fs_info->super_copy)); btrfs_update_commit_device_size(root->fs_info); + btrfs_update_commit_device_bytes_used(root, cur_trans); spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7b5c04259a6e..f8273bb53b3f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2370,6 +2370,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->bytes_used = srcdev->bytes_used; + device->commit_bytes_used = device->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -6009,6 +6010,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->total_bytes = device->disk_total_bytes; device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); @@ -6558,3 +6560,28 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) unlock_chunks(fs_info->dev_root); mutex_unlock(&fs_devices->device_list_mutex); } + +/* Must be invoked during the transaction commit */ +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + int i; + + if (list_empty(&transaction->pending_chunks)) + return; + + /* In order to kick the device replace finish process */ + lock_chunks(root); + list_for_each_entry(em, 
&transaction->pending_chunks, list) { + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; + } + } + unlock_chunks(root); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b30d018fa359..f79d532fedb0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -95,6 +95,8 @@ struct btrfs_device { */ u64 commit_total_bytes; + /* bytes used on the current transaction */ + u64 commit_bytes_used; /* * used to manage the device which is resized * @@ -420,4 +422,6 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, } void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction); #endif -- cgit v1.2.3 From 8b110e393c5a6e72d50fcdf9fa7ed8b647cfdfc9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 12 Sep 2014 18:44:03 +0800 Subject: Btrfs: implement repair function when direct read fails This patch implements a data repair function for use when a direct read fails. The details of the implementation are: - When we find the data is not right, we try to read the data from the other mirror. - When the io on the mirror ends, we insert the endio work into a dedicated btrfs workqueue, not the common read endio workqueue, because the original endio work is still blocked in the btrfs endio workqueue; if we inserted the endio work of the io on the mirror into that workqueue, a deadlock would happen. - After we get the right data, we write it back to the corrupted mirror. - And if the data on the new mirror is still corrupted, we try the next mirror until we read the right data or all the mirrors have been traversed. - After the above work, we set the uptodate flag according to the result.
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 + fs/btrfs/async-thread.h | 1 + fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 11 +- fs/btrfs/disk-io.h | 1 + fs/btrfs/extent_io.c | 12 ++- fs/btrfs/extent_io.h | 5 +- fs/btrfs/inode.c | 276 ++++++++++++++++++++++++++++++++++++++++++++---- 9 files changed, 281 insertions(+), 29 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index fbd76ded9a34..2da0a66790ba 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper); BTRFS_WORK_HELPER(endio_meta_helper); BTRFS_WORK_HELPER(endio_meta_write_helper); BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(endio_repair_helper); BTRFS_WORK_HELPER(rmw_helper); BTRFS_WORK_HELPER(endio_write_helper); BTRFS_WORK_HELPER(freespace_write_helper); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e9e31c94758f..e386c29ef1f6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(endio_repair_helper); BTRFS_WORK_HELPER_PROTO(rmw_helper); BTRFS_WORK_HELPER_PROTO(endio_write_helper); BTRFS_WORK_HELPER_PROTO(freespace_write_helper); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4d309471294e..7a7521c87c88 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -271,7 +271,7 @@ struct btrfs_dio_private { * The original bio may be splited to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *); + int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); }; /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0f3e4f7e454a..51ff3f8dbab9 100644 --- 
a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1538,6 +1538,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a224fb9b34a3..48794f951427 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -713,7 +713,11 @@ static void end_workqueue_bio(struct bio *bio, int err) func = btrfs_endio_write_helper; } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + if (unlikely(end_io_wq->metadata == + BTRFS_WQ_ENDIO_DIO_REPAIR)) { + wq = fs_info->endio_repair_workers; + func = btrfs_endio_repair_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { wq = fs_info->endio_raid56_workers; func = btrfs_endio_raid56_helper; } else if (end_io_wq->metadata) { @@ -741,6 +745,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) { struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); if (!end_io_wq) return -ENOMEM; @@ -2055,6 +2060,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2572,6 +2578,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + 
btrfs_alloc_workqueue("endio-repair", flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue("rmw", flags, max_active, 2); fs_info->endio_write_workers = @@ -2593,6 +2601,7 @@ int open_ctree(struct super_block *sb, fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 52a17db700fc..14d06ee1e143 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -30,6 +30,7 @@ enum { BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, BTRFS_WQ_ENDIO_RAID56 = 3, + BTRFS_WQ_ENDIO_DIO_REPAIR = 4, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05533c99f89d..9e2ef27672e5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1962,7 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -static int free_io_failure(struct inode *inode, struct io_failure_record *rec) +int free_io_failure(struct inode *inode, struct io_failure_record *rec) { int ret; int err = 0; @@ -2081,8 +2081,8 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -static int clean_io_failure(struct inode *inode, u64 start, - struct page *page, unsigned int pg_offset) +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset) { u64 private; u64 private_failure; @@ -2291,7 +2291,7 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, struct bio *btrfs_create_repair_bio(struct inode 
*inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func) + bio_end_io_t *endio_func, void *data) { struct bio *bio; struct btrfs_io_bio *btrfs_failed_bio; @@ -2305,6 +2305,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_iter.bi_size = 0; + bio->bi_private = data; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2362,7 +2363,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, start - page_offset(page), - (int)phy_offset, failed_bio->bi_end_io); + (int)phy_offset, failed_bio->bi_end_io, + NULL); if (!bio) { free_io_failure(inode, failrec); return -EIO; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bf0597f3a9e7..176a4b1ed520 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -341,6 +341,8 @@ struct btrfs_fs_info; int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); @@ -371,7 +373,8 @@ int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func); + bio_end_io_t *endio_func, void *data); +int free_io_failure(struct inode *inode, struct io_failure_record *rec); #ifdef 
CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09d8c5ee8869..c3c3269a9e08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7242,30 +7242,267 @@ unlock_err: return ret; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio) +static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int rw, int mirror_num) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + BUG_ON(rw & REQ_WRITE); + + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DIO_REPAIR); + if (ret) + goto err; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); +err: + bio_put(bio); + return ret; +} + +static int btrfs_check_dio_repairable(struct inode *inode, + struct bio *failed_bio, + struct io_failure_record *failrec, + int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. 
+ */ + pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + + if (failrec->this_mirror > num_copies) { + pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + +static int dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int failed_mirror, bio_end_io_t *repair_endio, + void *repair_arg) +{ + struct io_failure_record *failrec; + struct bio *bio; + int isector; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, + failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + isector = start - btrfs_io_bio(failed_bio)->logical; + isector >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + 0, isector, repair_endio, repair_arg); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + btrfs_debug(BTRFS_I(inode)->root->fs_info, + "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = submit_dio_repair_bio(inode, bio, read_mode, + failrec->this_mirror); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +struct btrfs_retry_complete { + struct completion done; + struct inode *inode; + u64 start; + int uptodate; +}; + +static void 
btrfs_retry_endio_nocsum(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) + goto end; + + done->uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) + clean_io_failure(done->inode, done->start, bvec->bv_page, 0); +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) { struct bio_vec *bvec; + struct btrfs_retry_complete done; u64 start; int i; int ret; - int err = 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); + if (ret) + return ret; + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } + + start += bvec->bv_len; + } + + return 0; +} + +static void btrfs_retry_endio(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct bio_vec *bvec; + int uptodate; + int ret; + int i; + + if (err) + goto end; + + uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) { + ret = __readpage_endio_check(done->inode, io_bio, i, + bvec->bv_page, 0, + done->start, bvec->bv_len); + if (!ret) + clean_io_failure(done->inode, done->start, + bvec->bv_page, 0); + else + uptodate = 0; + } + + done->uptodate = uptodate; +end: + complete(&done->done); + bio_put(bio); +} +static int __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + u64 offset = 0; + int i; + int ret; + + err = 0; start = 
io_bio->logical; + done.inode = inode; + bio_for_each_segment_all(bvec, &io_bio->bio, i) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 0, start, bvec->bv_len); - if (ret) - err = -EIO; + if (likely(!ret)) + goto next; +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); + if (ret) { + err = ret; + goto next; + } + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } +next: + offset += bvec->bv_len; start += bvec->bv_len; } return err; } +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (skip_csum) { + if (unlikely(err)) + return __btrfs_correct_data_nocsum(inode, io_bio); + else + return 0; + } else { + return __btrfs_subio_endio_read(inode, io_bio, err); + } +} + static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; @@ -7273,8 +7510,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)) - err = btrfs_subio_endio_read(inode, io_bio); + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + err = btrfs_subio_endio_read(inode, io_bio, err); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -7353,19 +7590,16 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, static void btrfs_end_dio_bio(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; - int ret; - if (err) { - btrfs_err(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - 
btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); - } else if (dip->subio_endio) { - ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio)); - if (ret) - err = ret; - } + if (err) + btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + + if (dip->subio_endio) + err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); if (err) { dip->errors = 1; -- cgit v1.2.3 From 47ab2a6c689913db23ccae38349714edf8365e0a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Sep 2014 11:20:02 -0400 Subject: Btrfs: remove empty block groups automatically One problem that has plagued us is that a user will use up all of his space with data, remove a bunch of that data, and then try to create a bunch of small files and run out of space. This happens because all the chunks were allocated for data since the metadata requirements were so low. But now there's a bunch of empty data block groups and not enough metadata space to do anything. This patch solves this problem by automatically deleting empty block groups. If we notice the used count go down to 0 when deleting, or notice on mount that a block group has a used count of 0, then we will queue it to be deleted. When the cleaner thread runs we will double check to make sure the block group is still empty and then we will delete it. This patch has the side effect of no longer having a bunch of BUG_ON()s in the chunk delete code, which will be helpful for both this and relocate.
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 ++- fs/btrfs/disk-io.c | 6 ++ fs/btrfs/extent-tree.c | 141 ++++++++++++++++++++++++++++++++++++-- fs/btrfs/tests/free-space-tests.c | 2 +- fs/btrfs/volumes.c | 115 ++++++++++++++++++++----------- fs/btrfs/volumes.h | 2 + 6 files changed, 226 insertions(+), 49 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 51ff3f8dbab9..089f6da09411 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1298,8 +1298,8 @@ struct btrfs_block_group_cache { */ struct list_head cluster_list; - /* For delayed block group creation */ - struct list_head new_bg_list; + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; }; /* delayed seq elem */ @@ -1568,6 +1568,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int open; u64 total_pinned; @@ -1717,6 +1718,9 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. 
*/ struct work_struct async_reclaim_work; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; }; struct btrfs_subvolume_writers { @@ -3344,6 +3348,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48794f951427..4780e6623c7b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1769,6 +1769,7 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); + btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2230,6 +2231,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2239,6 +2241,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->unused_bgs); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); @@ -2977,6 +2980,8 @@ retry_root_backup: fs_info->update_uuid_tree_gen = 1; } + fs_info->open = 1; + return 0; fail_qgroup: @@ -3688,6 +3693,7 @@ void close_ctree(struct btrfs_root *root) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + fs_info->open = 0; free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); diff 
--git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b30ddb49cfab..28a27d5f02d3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5433,6 +5433,20 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + + /* + * No longer have used bytes in this block group, queue + * it for deletion. + */ + if (old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; cache->space_info->bytes_pinned += num_bytes; @@ -8855,6 +8869,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } up_write(&info->commit_root_sem); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -8989,7 +9013,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); return cache; @@ -9130,8 +9154,18 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) + if (btrfs_chunk_readonly(root, cache->key.objectid)) { set_block_group_ro(cache, 1); + } else if 
(btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -9172,10 +9206,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, - new_bg_list) { - list_del_init(&block_group->new_bg_list); - + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + list_del_init(&block_group->bg_list); if (ret) continue; @@ -9261,7 +9293,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - list_add_tail(&cache->new_bg_list, &trans->new_bgs); + list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -9430,6 +9462,101 @@ out: return ret; } +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. 
+ */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + space_info = block_group->space_info; + list_del_init(&block_group->bg_list); + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = set_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_set_block_group_rw(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. 
+ */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + block_group->pinned = 0; + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); + btrfs_end_transaction(trans, root); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index d78ae10d0446..2299bfde39ee 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -45,7 +45,7 @@ static struct btrfs_block_group_cache *init_test_block_group(void) spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 63e632746d8a..f27c0f7c387e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2568,58 +2568,49 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset) { struct extent_map_tree *em_tree; - struct btrfs_root *extent_root; - struct btrfs_trans_handle *trans; - struct btrfs_device *device; struct extent_map *em; + struct btrfs_root *extent_root = root->fs_info->extent_root; struct 
map_lookup *map; u64 dev_extent_len = 0; - int ret; - int i; + u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + u64 chunk_tree = root->fs_info->chunk_root->objectid; + int i, ret = 0; + /* Just in case */ root = root->fs_info->chunk_root; - extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; - ret = btrfs_can_relocate(extent_root, chunk_offset); - if (ret) - return -ENOSPC; - - /* step one, relocate all the extents inside this chunk */ - ret = btrfs_relocate_block_group(extent_root, chunk_offset); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); - return ret; - } - - /* - * step two, delete the device extents and the - * chunk tree entries - */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start > chunk_offset || - em->start + em->len < chunk_offset); + if (!em || em->start > chunk_offset || + em->start + em->len < chunk_offset) { + /* + * This is a logic error, but we don't want to just rely on the + * user having built with ASSERT enabled, so if ASSERT doesn't + * do anything we still error out.
+ */ + ASSERT(0); + if (em) + free_extent_map(em); + return -EINVAL; + } map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; + struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (device->bytes_used > 0) { lock_chunks(root); @@ -2634,23 +2625,34 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } } ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, chunk_offset); - - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } trace_btrfs_chunk_free(root, map, chunk_offset, em->len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); @@ -2658,11 +2660,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* once for the tree */ free_extent_map(em); +out: /* once for us */ free_extent_map(em); + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + int ret; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = 
btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + ret = btrfs_remove_chunk(trans, root, chunk_offset); btrfs_end_transaction(trans, root); - return 0; + return ret; } static int btrfs_relocate_sys_chunks(struct btrfs_root *root) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 91998bc0b4c4..08980fa23039 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -463,6 +463,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 chunk_offset, u64 chunk_size); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset); static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) { -- cgit v1.2.3 From 58dc4ce4325108b35425ffd30e6acfad9644d49d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 00:29:04 +0200 Subject: btrfs: remove unused parameter from readahead_tree_block The parent_transid parameter has been unused since its introduction in ca7a79ad8dbe2466 ("Pass down the expected generation number when reading tree blocks"). In reada_tree_block, it was even wrongly set to leafsize. Transid check is done in the proper read and readahead ignores errors. 
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++--- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/relocation.c | 9 ++------- 5 files changed, 8 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 39021bf2df9a..1b7e3545a596 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2298,7 +2298,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); + readahead_tree_block(root, search, blocksize); nread += blocksize; } nscan++; @@ -2350,9 +2350,9 @@ static noinline void reada_for_balance(struct btrfs_root *root, } if (block1) - readahead_tree_block(root, block1, blocksize, 0); + readahead_tree_block(root, block1, blocksize); if (block2) - readahead_tree_block(root, block2, blocksize, 0); + readahead_tree_block(root, block2, blocksize); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4780e6623c7b..ff83748d39da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1062,8 +1062,7 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid) +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 14d06ee1e143..8cd6a53db621 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,8 +46,7 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 
parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44d04979f071..058abd088f0f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7486,8 +7486,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - ret = readahead_tree_block(root, bytenr, blocksize, - generation); + ret = readahead_tree_block(root, bytenr, blocksize); if (ret) break; nread++; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2d221c46180c..16cb2b4a9620 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2861,13 +2861,8 @@ static int reada_tree_block(struct reloc_control *rc, struct tree_block *block) { BUG_ON(block->key_ready); - if (block->key.type == BTRFS_METADATA_ITEM_KEY) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, - rc->extent_root->nodesize); - else - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid); return 0; } -- cgit v1.2.3 From 6197d86eabb844c1a9c99956d4e6b0f8eb548ad3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 00:49:36 +0200 Subject: btrfs: return void from readahead_tree_block Errors in readahead are not fatal and ignored elsewhere in the code. 
Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent-tree.c | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ff83748d39da..332f63518156 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1062,19 +1062,17 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - int ret = 0; buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) - return 0; + return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); - return ret; } int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8cd6a53db621..0d9793f6b594 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,7 +46,7 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 058abd088f0f..e0468a9789a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7486,9 +7486,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - ret = 
readahead_tree_block(root, bytenr, blocksize); - if (ret) - break; + readahead_tree_block(root, bytenr, blocksize); nread++; } wc->reada_slot = slot; -- cgit v1.2.3 From ce86cd59179279a6fe673d2a105d24fb7e70aef3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 01:07:32 +0200 Subject: btrfs: remove parameter blocksize from read_tree_block We know the tree block size, no need to pass it around. Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++---- fs/btrfs/ctree.c | 10 +++------- fs/btrfs/disk-io.c | 17 +++++------------ fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent-tree.c | 8 +++----- fs/btrfs/print-tree.c | 1 - fs/btrfs/relocation.c | 11 ++++------- 7 files changed, 18 insertions(+), 37 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6829dc5aa657..2d3e32ebfd15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -490,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, continue; BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, - fs_info->tree_root->nodesize, 0); + 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -1028,12 +1028,10 @@ again: if (ref->count && ref->parent) { if (extent_item_pos && !ref->inode_list && ref->level == 0) { - u32 bsz; struct extent_buffer *eb; - bsz = fs_info->extent_root->nodesize; eb = read_tree_block(fs_info->extent_root, - ref->parent, bsz, 0); + ref->parent, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1b7e3545a596..302c3f955706 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1425,7 +1425,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; - u32 blocksize; eb_root = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); @@ 
-1444,8 +1443,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - blocksize = root->nodesize; - old = read_tree_block(root, logical, blocksize, 0); + old = read_tree_block(root, logical, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); btrfs_warn(root->fs_info, @@ -1692,8 +1690,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); + cur = read_tree_block(root, blocknr, gen); if (!cur || !extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; @@ -1872,7 +1869,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - root->nodesize, btrfs_node_ptr_generation(parent, slot)); if (eb && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -2507,7 +2503,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); + tmp = read_tree_block(root, blocknr, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 332f63518156..03c0973568ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1138,12 +1138,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid) + u64 parent_transid) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return NULL; @@ -1484,7 +1484,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root 
*tree_root, struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; u64 generation; - u32 blocksize; int ret; path = btrfs_alloc_path(); @@ -1509,9 +1508,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); - blocksize = root->nodesize; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); + generation); if (!root->node) { ret = -ENOMEM; goto find_fail; @@ -2139,7 +2137,6 @@ int open_ctree(struct super_block *sb, { u32 sectorsize; u32 nodesize; - u32 blocksize; u32 stripesize; u64 generation; u64 features; @@ -2643,7 +2640,6 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - blocksize = tree_root->nodesize; generation = btrfs_super_chunk_root_generation(disk_super); __setup_root(nodesize, sectorsize, stripesize, chunk_root, @@ -2651,7 +2647,7 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), - blocksize, generation); + generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", @@ -2684,12 +2680,11 @@ int open_ctree(struct super_block *sb, } retry_root_backup: - blocksize = tree_root->nodesize; generation = btrfs_super_generation(disk_super); tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), - blocksize, generation); + generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", @@ -2858,7 +2853,6 @@ retry_root_backup: err = -EIO; goto fail_qgroup; } - blocksize = tree_root->nodesize; log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { @@ -2870,7 +2864,6 @@ retry_root_backup: log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(tree_root, bytenr, - 
blocksize, generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0d9793f6b594..03f396144fe1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -45,7 +45,7 @@ struct btrfs_device; struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid); + u64 parent_transid); void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0468a9789a5..178f6dbf2d7c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7645,7 +7645,6 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { - int child_bsize = root->nodesize; int parent_slot; u64 child_gen; u64 child_bytenr; @@ -7657,8 +7656,7 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); - eb = read_tree_block(root, child_bytenr, child_bsize, - child_gen); + eb = read_tree_block(root, child_bytenr, child_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = -EIO; goto out; @@ -7674,7 +7672,7 @@ walk_down: ret = btrfs_qgroup_record_ref(trans, root->fs_info, root->objectid, child_bytenr, - child_bsize, + root->nodesize, BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); if (ret) @@ -7889,7 +7887,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); + next = read_tree_block(root, bytenr, generation); if (!next || !extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index eb309855d5c8..647ab12fdf5d 100644 --- 
a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, btrfs_node_blockptr(c, i), - root->nodesize, btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && level != 1) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d7506325b024..95bc40ae358d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1813,8 +1813,7 @@ again: break; } - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); + eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); @@ -1944,7 +1943,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, u64 bytenr; u64 ptr_gen = 0; u64 last_snapshot; - u32 blocksize; u32 nritems; last_snapshot = btrfs_root_last_snapshot(&root->root_item); @@ -1970,8 +1968,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = root->nodesize; - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + eb = read_tree_block(root, bytenr, ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -2680,7 +2677,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, blocksize, generation); + eb = read_tree_block(root, bytenr, generation); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; @@ -2842,7 +2839,7 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + block->key.offset); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return 
-EIO; -- cgit v1.2.3 From 0308af4465897c889e32754ef37bb465a1b2b872 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 01:43:40 +0200 Subject: btrfs: remove unused parameter blocksize from btrfs_find_tree_block Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 +++++------- fs/btrfs/disk-io.c | 5 ++--- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent-tree.c | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 302c3f955706..2fb4ab659a0f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1683,7 +1683,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr, blocksize); + cur = btrfs_find_tree_block(root, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -2264,7 +2264,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb = btrfs_find_tree_block(root, search, blocksize); + eb = btrfs_find_tree_block(root, search); if (eb) { free_extent_buffer(eb); return; @@ -2326,7 +2326,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); + eb = btrfs_find_tree_block(root, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. 
That will loop @@ -2339,7 +2339,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); + eb = btrfs_find_tree_block(root, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2450,16 +2450,14 @@ read_block_for_search(struct btrfs_trans_handle *trans, { u64 blocknr; u64 gen; - u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - blocksize = root->nodesize; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); + tmp = btrfs_find_tree_block(root, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 03c0973568ef..e0293d2fbb3a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1108,7 +1108,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, } struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { return find_extent_buffer(root->fs_info, bytenr); } @@ -4002,8 +4002,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start, - root->nodesize); + eb = btrfs_find_tree_block(root, start); start += root->nodesize; if (!eb) continue; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 03f396144fe1..ae04daef6087 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -62,7 +62,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct 
extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178f6dbf2d7c..d9a90da93302 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7825,7 +7825,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr, blocksize); + next = btrfs_find_tree_block(root, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) -- cgit v1.2.3 From 4d75f8a9c87b843c8ded15b82b8d137b9724cccc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 01:54:12 +0200 Subject: btrfs: remove blocksize from btrfs_alloc_free_block and rename Rename to btrfs_alloc_tree_block as it fits to the alloc/find/free + _tree_block family. The parameter blocksize was set to the metadata block size, directly or indirectly. 
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 26 +++++++++++--------------- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/disk-io.c | 8 +++----- fs/btrfs/extent-tree.c | 5 +++-- fs/btrfs/ioctl.c | 3 +-- 5 files changed, 21 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2fb4ab659a0f..d498982bd202 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else parent_start = 0; - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3355,9 +3354,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0); + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3495,9 +3493,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, level, c->start, 0); + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 
level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4275,9 +4272,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 089f6da09411..3073b8876bca 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3290,9 +3290,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int get_block_group_index(struct btrfs_block_group_cache *cache); -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0293d2fbb3a..2e5d460d4e0c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1324,8 +1324,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_free_block(trans, root, root->nodesize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1412,9 +1411,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, * updated (along with back refs to the log tree). 
*/ - leaf = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, + NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d9a90da93302..0ba42eb96775 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7322,8 +7322,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * * returns the tree buffer or NULL. */ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) @@ -7333,6 +7333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; + u32 blocksize = root->nodesize; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0ff212757b95..2fc48905ccf5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -480,8 +480,7 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->nodesize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; -- cgit v1.2.3 From 95ac567af212db3293af3897ccb521efdf1dd7ff Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Thu, 8 Aug 2013 22:45:48 +0100 Subject: Btrfs: set default max_inline to 8KiB instead of 8MiB 8MiB is way too large and likely set by mistake. This is not a significant issue as in practice the max amount of data added to an inline extent is also limited by the page cache and btree leaf sizes. 
Signed-off-by: Filipe David Borba Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 +- fs/btrfs/super.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 089f6da09411..dd79ba7ee3ea 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2089,6 +2089,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (8192) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4780e6623c7b..9b2a741370b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2260,7 +2260,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; - fs_info->max_inline = 8192 * 1024; + fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->free_chunk_space = 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1c6da8e00c1b..b1d2a42f379d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nodatacow"); if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); - if (info->max_inline != 8192 * 1024) + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) seq_printf(seq, ",alloc_start=%llu", info->alloc_start); -- cgit v1.2.3 From bfebd8b5441755f228ad02273682d675d3335123 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 30 Jul 2014 00:25:45 +0200 Subject: btrfs: use enum for wq endio metadata type The enum exists but is not consistently used. 
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 11 +++++++---- fs/btrfs/disk-io.c | 14 +++----------- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/inode.c | 3 ++- 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index eeee13842cd0..d3220d31d3cb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -388,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * freed before we're done setting it up */ atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -419,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -668,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_CACHE_SIZE) { bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ /* @@ -706,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b2a741370b7..d7cb58ed2946 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -82,7 +82,7 @@ struct end_io_wq { void *private; struct btrfs_fs_info *info; int error; - int metadata; + enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; }; @@ -733,16 +733,8 @@ static void 
end_workqueue_bio(struct bio *bio, int err) btrfs_queue_work(wq, &end_io_wq->work); } -/* - * For the metadata arg you want - * - * 0 - if data - * 1 - if normal metadta - * 2 - if writing to the free space cache area - * 3 - raid parity work - */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata) + enum btrfs_wq_endio_type metadata) { struct end_io_wq *end_io_wq; @@ -930,7 +922,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * can happen in the async kernel threads */ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); + bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 14d06ee1e143..84da438fd9a3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,7 +25,7 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 -enum { +enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA = 0, BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, @@ -120,7 +120,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata); + enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 344a322eb386..b1e388dea7b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7721,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, bio_get(bio); if (!write) { - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } -- cgit v1.2.3 From 
97eb6b69d1e856cb5e1cf2c3d94afab643e93128 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 30 Jul 2014 00:55:42 +0200 Subject: btrfs: use slab for end_io_wq structures The structure is frequently reused. Rename it according to the slab name. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 38 +++++++++++++++++++++++++++++--------- fs/btrfs/disk-io.h | 2 ++ fs/btrfs/super.c | 8 +++++++- 3 files changed, 38 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d7cb58ed2946..2f075ef20050 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -72,11 +72,11 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root); static void btrfs_error_commit_super(struct btrfs_root *root); /* - * end_io_wq structs are used to do processing in task context when an IO is - * complete. This is used during reads to verify checksums, and it is used + * btrfs_end_io_wq structs are used to do processing in task context when an IO + * is complete. This is used during reads to verify checksums, and it is used * by writes to insert metadata for new file extents after IO is complete. */ -struct end_io_wq { +struct btrfs_end_io_wq { struct bio *bio; bio_end_io_t *end_io; void *private; @@ -87,6 +87,26 @@ struct end_io_wq { struct btrfs_work work; }; +static struct kmem_cache *btrfs_end_io_wq_cache; + +int __init btrfs_end_io_wq_init(void) +{ + btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", + sizeof(struct btrfs_end_io_wq), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_end_io_wq_cache) + return -ENOMEM; + return 0; +} + +void btrfs_end_io_wq_exit(void) +{ + if (btrfs_end_io_wq_cache) + kmem_cache_destroy(btrfs_end_io_wq_cache); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. 
They checksum file and metadata bios @@ -690,7 +710,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) static void end_workqueue_bio(struct bio *bio, int err) { - struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; struct btrfs_workqueue *wq; btrfs_work_func_t func; @@ -736,9 +756,9 @@ static void end_workqueue_bio(struct bio *bio, int err) int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { - struct end_io_wq *end_io_wq; + struct btrfs_end_io_wq *end_io_wq; - end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) return -ENOMEM; @@ -1723,16 +1743,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; - struct end_io_wq *end_io_wq; + struct btrfs_end_io_wq *end_io_wq; int error; - end_io_wq = container_of(work, struct end_io_wq, work); + end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio_nodec(bio, error); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 84da438fd9a3..9ac233923ca3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -142,6 +142,8 @@ int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info); +int __init btrfs_end_io_wq_init(void); +void btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b915d7704f19..4685b9704f15 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2001,10 +2001,14 @@ 
static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_interface_init(); + err = btrfs_end_io_wq_init(); if (err) goto free_prelim_ref; + err = btrfs_interface_init(); + if (err) + goto free_end_io_wq; + btrfs_init_lockdep(); btrfs_print_info(); @@ -2021,6 +2025,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_end_io_wq: + btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: -- cgit v1.2.3 From 2755a0de64693501741fb3603cd8ca928b0b7e81 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 31 Jul 2014 00:43:18 +0200 Subject: btrfs: hide typecast to definition of BTRFS_SEND_TRANS_STUB Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/send.c | 2 +- fs/btrfs/transaction.c | 2 +- fs/btrfs/transaction.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f075ef20050..0abf4b0a9010 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -347,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == - (void *)BTRFS_SEND_TRANS_STUB); + bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7edfc7cebda4..8b44630f4abf 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5728,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; - current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; if (ret < 0) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 16d0c1b62b3e..f4c194b160b7 100644 --- a/fs/btrfs/transaction.c +++ 
b/fs/btrfs/transaction.c @@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, int ret; /* Send isn't supposed to start transactions. */ - ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 579be51b27e5..d8f40e1a5d2d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,7 +79,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB 1 +#define BTRFS_SEND_TRANS_STUB ((void *)1) struct btrfs_trans_handle { u64 transid; -- cgit v1.2.3 From fccb84c94a9755f48668e43d0a44d6ecc750900f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Sep 2014 23:53:21 +0200 Subject: btrfs: move checks for DUMMY_ROOT into a helper Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 ++--- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/disk-io.c | 4 +--- fs/btrfs/extent-tree.c | 16 +++++++--------- fs/btrfs/qgroup.c | 10 ++++------ 5 files changed, 23 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 533657c508e2..ce1d71d171bb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1506,10 +1506,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + /* ensure we can see the force_cow */ smp_rmb(); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bae025a20e63..557fd9520607 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4131,4 +4131,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 
u64 rfer, u64 excl); #endif +static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 1; +#endif + return 0; +} + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0abf4b0a9010..14117f85b545 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1130,11 +1130,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return alloc_test_extent_buffer(root->fs_info, bytenr, blocksize); -#endif return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44d04979f071..7895db9c6bdc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3073,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); @@ -6264,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -7336,15 +7335,14 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, bool 
skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, blocksize, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } -#endif + block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cd9717ea8c9d..48b60dbf807f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state))) + if (btrfs_test_is_dummy_root(quota_root)) return 0; -#endif + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -698,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; -- cgit v1.2.3 From 656f30dba7ab8179c9a2e04293b0c7b383fa9ce9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 26 Sep 2014 12:25:56 +0100 Subject: Btrfs: be aware of btree inode write errors to avoid fs corruption While we have a transaction ongoing, the VM might decide at any time to call btree_inode->i_mapping->a_ops->writepages(), which will start writeback of dirty pages belonging to btree nodes/leafs. This call might return an error or the writeback might finish with an error before we attempt to commit the running transaction. 
If this happens, we might have no way of knowing that such error happened when we are committing the transaction - because the pages might no longer be marked dirty nor tagged for writeback (if a subsequent modification to the extent buffer didn't happen before the transaction commit) which makes filemap_fdata[write|wait]_range unable to find such pages (even if they're marked with SetPageError). So if this happens we must abort the transaction, otherwise we commit a super block with btree roots that point to btree nodes/leafs whose content on disk is invalid - either garbage or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this later case we end up getting error messages like "parent transid verify failed on 10826481664 wanted 25748 found 29562" when reading btree nodes/leafs from disk). Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's i_mapping would not be enough because we need to distinguish between log tree extents (not fatal) vs non-log tree extents (fatal) and because the next call to filemap_fdatawait_range() will catch and clear such errors in the mapping - and that call might be from a log sync and not from a transaction commit, which means we would not know about the error at transaction commit time. Also, checking for the eb flag EXTENT_BUFFER_IOERR at transaction commit time isn't done and would not be completely reliable, as the eb might be removed from memory and read back when trying to get it, which clears that flag right before reading the eb's pages from disk, making us not know about the previous write error. 
Using the new 3 flags for the btree inode also makes us achieve the goal of AS_EIO/AS_ENOSPC when writepages() returns success, started writeback for all dirty pages and before filemap_fdatawait_range() is called, the writeback for all dirty pages had already finished with errors - because we were not using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as it could not know that writeback errors happened (the pages were no longer tagged for writeback). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 11 ++++++++ fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 4 ++- fs/btrfs/extent_io.c | 74 +++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/extent_io.h | 7 +++-- fs/btrfs/transaction.c | 26 ++++++++++++++++++ 6 files changed, 114 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a7521c87c88..8a42adb4e5ed 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,17 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* + * The following 3 bits are meant only for the btree inode. 
+ * When any of them is set, it means an error happened while writing an + * extent buffer belonging to: + * 1) a non-log btree + * 2) a log btree and first log sub-transaction + * 3) a log btree and second log sub-transaction + */ +#define BTRFS_INODE_BTREE_ERR 12 +#define BTRFS_INODE_BTREE_LOG1_ERR 13 +#define BTRFS_INODE_BTREE_LOG2_ERR 14 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4780e6623c7b..09b3c8a0c790 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -607,7 +607,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { ret = -EIO; goto err; } @@ -680,7 +680,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 44d04979f071..8ebe6bf66e78 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7235,17 +7235,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXENT bit to differentiate dirty pages. 
*/ - if (root->log_transid % 2 == 0) + if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } else { + buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4267a054b9c1..215603b911f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3601,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } +static void set_btree_ioerr(struct page *page) +{ + struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode); + + SetPageError(page); + if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's + * committing (when we call filemap_fdata[write|wait]_range against + * the btree inode), we might have + * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it + * returns an error or an error happens during writeback, when we're + * committing the transaction we wouldn't know about it, since the pages + * can be no longer dirty nor marked anymore for writeback (if a + * subsequent modification to the extent buffer didn't happen before the + * transaction commit), which makes filemap_fdata[write|wait]_range not + * able to find the pages tagged with SetPageError at transaction + * commit time. 
So if this happens we must abort the transaction, + * otherwise we commit a super block with btree roots that point to + * btree nodes/leafs whose content on disk is invalid - either garbage + * or the content of some node/leaf from a past generation that got + * cowed or deleted and is no longer valid. + * + * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would + * not be enough - we need to distinguish between log tree extents vs + * non-log tree extents, and the next filemap_fdatawait_range() call + * will catch and clear such errors in the mapping - and that call might + * be from a log sync and not from a transaction commit. Also, checking + * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is + * not done and would not be reliable - the eb might have been released + * from memory and reading it back again means that flag would not be + * set (since it's a runtime flag, not persisted on disk). + * + * Using the flags below in the btree inode also makes us achieve the + * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started + * writeback for all dirty pages and before filemap_fdatawait_range() + * is called, the writeback for all dirty pages had already finished + * with errors - because we were not using AS_EIO/AS_ENOSPC, + * filemap_fdatawait_range() would return success, as it could not know + * that writeback errors happened (the pages were no longer tagged for + * writeback). 
+ */ + switch (eb->log_index) { + case -1: + set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags); + break; + case 0: + set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + break; + case 1: + set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + break; + default: + BUG(); /* unexpected, logic error */ + } +} + static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { struct bio_vec *bvec; @@ -3614,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - SetPageError(page); + set_btree_ioerr(page); } end_page_writeback(page); @@ -3644,7 +3705,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) @@ -3661,8 +3722,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 0, epd->bio_flags, bio_flags); epd->bio_flags = bio_flags; if (ret) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - SetPageError(p); + set_btree_ioerr(p); end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) end_extent_buffer_writeback(eb); @@ -5055,7 +5115,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5e91fb9d1764..06f030c0084c 
100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -41,9 +41,10 @@ #define EXTENT_BUFFER_TREE_REF 5 #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ #define EXTENT_BUFFER_DUMMY 9 #define EXTENT_BUFFER_IN_TREE 10 +#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -141,7 +142,9 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; - int lock_nested; + short lock_nested; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ + short log_index; /* protects write locks */ rwlock_t lock; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 16d0c1b62b3e..a47b1000a6e5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -851,6 +851,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); + bool errors = false; while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -864,6 +866,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, } if (err) werr = err; + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, + &btree_ino->runtime_flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, + &btree_ino->runtime_flags)) + errors = true; + } else { + if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, + &btree_ino->runtime_flags)) + errors = true; + } + + if (errors && !werr) + werr = -EIO; + return werr; } @@ -1629,6 +1651,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction 
*prev_trans = NULL; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); int ret; /* Stop the commit early if ->aborted is set */ @@ -1871,6 +1894,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_update_commit_device_size(root->fs_info); btrfs_update_commit_device_bytes_used(root, cur_trans); + clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; -- cgit v1.2.3 From c926093ec516f5d316ecdf8c1be11f577ac71b85 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Sep 2014 19:16:47 +0200 Subject: btrfs: add more superblock checks Populate btrfs_check_super_valid() with checks that try to verify consistency of superblock by additional conditions that may arise from corrupted devices or bitflips. Some of tests are only hints and issue warnings instead of failing the mount, basically when the checks are derived from the data found in the superblock. Tested on a broken image provided by Qu. 
Reported-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 09b3c8a0c790..fc8dfaa27967 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3817,10 +3817,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + struct btrfs_super_block *sb = fs_info->super_copy; + int ret = 0; + + if (sb->root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", + sb->root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", + sb->chunk_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->log_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", + sb->log_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + /* - * Placeholder for checks + * The common minimum, we don't know if we can trust the nodesize/sectorsize + * items yet, they'll be verified later. Issue just a warning. 
*/ - return 0; + if (!IS_ALIGNED(sb->root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->root); + if (!IS_ALIGNED(sb->chunk_root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->chunk_root); + if (!IS_ALIGNED(sb->log_root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->log_root); + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (sb->num_devices > (1UL << 31)) + printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", + sb->num_devices); + + if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { + printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", + sb->bytenr, BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. 
+ */ + if (sb->generation < sb->chunk_root_generation) + printk(KERN_WARNING + "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", + sb->generation, sb->chunk_root_generation); + if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) + printk(KERN_WARNING + "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", + sb->generation, sb->cache_generation); + + return ret; } static void btrfs_error_commit_super(struct btrfs_root *root) -- cgit v1.2.3 From 21e7626b12f25770e2975bc7c7b2e1d5b1d58a57 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 27 Oct 2014 13:52:21 +0100 Subject: btrfs: use macro accessors in superblock validation checks The initial patch c926093ec516f5d316 (btrfs: add more superblock checks) did not properly use the macro accessors that wrap endianness and the code would not work correctly on big endian machines. Reported-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2409718e3f20..1ae1661ba14c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb = fs_info->super_copy; int ret = 0; - if (sb->root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", - sb->root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", - sb->chunk_root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: 
chunk_root level too big: %d >= %d\n", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (sb->log_root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", - sb->log_root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } @@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * The common minimum, we don't know if we can trust the nodesize/sectorsize * items yet, they'll be verified later. Issue just a warning. */ - if (!IS_ALIGNED(sb->root, 4096)) + if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", sb->root); - if (!IS_ALIGNED(sb->chunk_root, 4096)) + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", sb->chunk_root); - if (!IS_ALIGNED(sb->log_root, 4096)) + if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->log_root); + btrfs_super_log_root(sb)); if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", @@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ - if (sb->num_devices > (1UL << 31)) + if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", - sb->num_devices); + btrfs_super_num_devices(sb)); - if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { + if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", - sb->bytenr, BTRFS_SUPER_INFO_OFFSET); + btrfs_super_bytenr(sb), 
BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; } @@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ - if (sb->generation < sb->chunk_root_generation) + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) printk(KERN_WARNING "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", - sb->generation, sb->chunk_root_generation); - if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) + btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) printk(KERN_WARNING "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", - sb->generation, sb->cache_generation); + btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); return ret; } -- cgit v1.2.3 From 572d9ab7845ea0e043ec34cd733a75228130ad03 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2014 15:26:17 +0100 Subject: btrfs: add support for processing pending changes There are some actions that modify global filesystem state but cannot be performed at the time of request, but later at the transaction commit time when the filesystem is in a known state. For example enabling new incompat features on-the-fly or issuing transaction commit from unsafe contexts (sysfs handlers). 
Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/transaction.c | 16 ++++++++++++++++ fs/btrfs/transaction.h | 2 ++ 4 files changed, 69 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe69edda11fb..f30b061ef77d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1402,6 +1402,11 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; + /* + * Track requests for actions that need to be done during transaction + * commit (like for some mount options). + */ + unsigned long pending_changes; unsigned long compress_type:4; int commit_interval; /* @@ -2103,6 +2108,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) + #define btrfs_set_and_info(root, opt, fmt, args...) \ { \ if (!btrfs_test_opt(root, opt)) \ @@ -2117,6 +2123,45 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +/* + * Requests for changes that need to be done during transaction commit. + * + * Internal mount options that are used for special handling of the real + * mount options (eg. cannot be set during remount and have to be set during + * transaction commit) + */ + +#define btrfs_test_pending(info, opt) \ + test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_set_pending(info, opt) \ + set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_clear_pending(info, opt) \ + clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) + +/* + * Helpers for setting pending mount option changes. + * + * Expects corresponding macros + * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name + */ +#define btrfs_set_pending_and_info(info, opt, fmt, args...) 
\ +do { \ + if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), SET_##opt); \ + btrfs_clear_pending((info), CLEAR_##opt); \ + } \ +} while(0) + +#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), CLEAR_##opt); \ + btrfs_clear_pending((info), SET_##opt); \ + } \ +} while(0) + /* * Inode flags */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1bf9f897065d..fd80c0d98421 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2834,6 +2834,12 @@ retry_root_backup: if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + /* + * Mount does not set all options immediatelly, we can do it now and do + * not have to wait for transaction commit + */ + btrfs_apply_pending_changes(fs_info); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..7a4024a55e5c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1850,6 +1850,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, else btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + btrfs_apply_pending_changes(root->fs_info); + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2019,3 +2021,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) return (ret < 0) ? 
0 : 1; } + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = cmpxchg(&fs_info->pending_changes, 0, 0); + if (!prev) + return; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..75ebcfce9d57 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); + #endif -- cgit v1.2.3 From 7e1876aca815029d5c3023a66a91e249eca3e533 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2014 15:26:17 +0100 Subject: btrfs: switch inode_cache option handling to pending changes The pending mount option(s) now share namespace and bits with the normal options, and the existing one for (inode_cache) is unset unconditionally at each transaction commit. Introduce a separate namespace for pending changes and enhance the descriptions of the intended change to use separate bits for each action. 
Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 +++- fs/btrfs/disk-io.c | 4 ---- fs/btrfs/inode-map.c | 2 +- fs/btrfs/super.c | 4 ++-- fs/btrfs/transaction.c | 19 ++++++++++++------- 5 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f30b061ef77d..1c9157e4ab0c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2098,7 +2098,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2131,6 +2130,9 @@ struct btrfs_ioctl_defrag_range_args { * transaction commit) */ +#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) +#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) + #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) #define btrfs_set_pending(info, opt) \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd80c0d98421..6b406e3f3abe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2830,10 +2830,6 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } - /* Set the real inode map cache flag */ - if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) - btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); - /* * Mount does not set all options immediatelly, we can do it now and do * not have to wait for transaction commit diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 83d646bd2e4b..4ebd5ebb1ea1 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + 
btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, "disabling inode map caching"); } } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1da16d59e115..65c75d9e9750 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "disabling disk space caching"); break; case Opt_inode_cache: - btrfs_set_and_info(root, CHANGE_INODE_CACHE, + btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; case Opt_noinode_cache: - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, "disabling inode map caching"); break; case Opt_clear_cache: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7a4024a55e5c..703238ed7337 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1842,14 +1842,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } /* - * Since the transaction is done, we should set the inode map cache flag - * before any other comming transaction. + * Since the transaction is done, we can apply the pending changes + * before the next transaction. 
*/ - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - else - btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - btrfs_apply_pending_changes(root->fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now @@ -2031,6 +2026,16 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) if (!prev) return; + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + if (prev) btrfs_warn(fs_info, "unknown pending changes left 0x%lx, ignoring", prev); -- cgit v1.2.3 From cd743fac42bbc2e6125ee14aaa8741601f92fe9a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 Oct 2014 19:40:14 +0100 Subject: btrfs: fix typos in btrfs_check_super_valid Copy&paste errors in some messages and add few more missing macro accessors. 
Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1bf9f897065d..7af9a1978a2f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3839,12 +3839,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, */ if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->root); + btrfs_super_root(sb)); if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->chunk_root); + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + btrfs_super_chunk_root(sb)); if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { -- cgit v1.2.3 From 50d9aa99bd35c77200e0e3dd7a72274f8304701f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Nov 2014 14:52:38 -0500 Subject: Btrfs: make sure logged extents complete in the current transaction V3 Liu Bo pointed out that my previous fix would lose the generation update in the scenario I described. It is actually much worse than that, we could lose the entire extent if we lose power right after the transaction commits. Consider the following write extent 0-4k log extent in log tree commit transaction < power fail happens here ordered extent completes We would lose the 0-4k extent because it hasn't updated the actual fs tree, and the transaction commit will reset the log so it isn't replayed. If we lose power before the transaction commit we are safe, otherwise we are not. Fix this by keeping track of all extents we logged in this transaction.
Then when we go to commit the transaction make sure we wait for all of those ordered extents to complete before proceeding. This will make sure that if we lose power after the transaction commit we still have our data. This also fixes the problem of the improperly updated extent generation. Thanks, cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 20 ++++++++++++++++++++ fs/btrfs/ordered-data.c | 9 +++++++-- fs/btrfs/ordered-data.h | 8 +++++++- fs/btrfs/transaction.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/transaction.h | 2 ++ fs/btrfs/tree-log.c | 6 +++--- 6 files changed, 72 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7af9a1978a2f..6efaee8d7739 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4129,6 +4129,25 @@ again: return 0; } +static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4140,6 +4159,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); + btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1401b1af4f06..9c28eb4da4dd 100644 --- a/fs/btrfs/ordered-data.c +++ 
b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -443,6 +444,8 @@ void btrfs_get_logged_extents(struct inode *inode, ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); if (!list_empty(&ordered->log_list)) continue; + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; list_add_tail(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); } @@ -472,7 +475,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(&log->log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +501,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - btrfs_put_ordered_extent(ordered); + if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274d621e..0124bffc775f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -71,6 +71,8 @@ struct btrfs_ordered_sum { ordered extent */ #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent + * in the logging code. 
*/ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -121,6 +123,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -197,7 +202,8 @@ void btrfs_get_logged_extents(struct inode *inode, void btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 16c704b68704..295a135c9c24 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -247,6 +247,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -515,6 +516,7 @@ again: h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -746,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + trans->delayed_ref_updates = 0; if 
(!trans->sync) { must_run_delayed_refs = @@ -1715,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -1765,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1817,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b3f5b40aab22..fd400a3668a8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -56,6 +56,7 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head pending_chunks; + struct list_head pending_ordered; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; @@ -105,6 +106,7 @@ struct btrfs_trans_handle { */ struct btrfs_root 
*root; struct seq_list delayed_ref_elem; + struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc715ff31d26..7d96cc961663 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2600,7 +2600,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_logged_extents(log, log_transid); + btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2645,7 +2645,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages, EXTENT_NEW | EXTENT_DIRTY); - btrfs_wait_logged_extents(log, log_transid); + btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -3766,7 +3766,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) btrfs_set_token_file_extent_type(leaf, fi, -- cgit v1.2.3 From 04216820fe83d5e27322065ba989de27dbfc104d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 27 Nov 2014 21:14:15 +0000 Subject: Btrfs: fix race between fs trimming and block group remove/allocation Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. 
However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256
error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 12 ++++++++- fs/btrfs/disk-io.c | 13 ++++++++++ fs/btrfs/extent-tree.c | 60 ++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/free-space-cache.c | 38 +++++++++++++++++++++++++++- fs/btrfs/volumes.c | 26 ++++++-------------- fs/btrfs/volumes.h | 12 +++++++++ 6 files changed, 140 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 302f37c56546..d71915e04e92 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1279,6 +1279,7 @@ struct btrfs_block_group_cache { unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; + unsigned int removed:1; int disk_cache_state; @@ -1311,6 +1312,8 @@ struct btrfs_block_group_cache { /* For read-only block groups */ struct list_head ro_list; + + atomic_t trimming; }; /* delayed seq elem */ @@ -1740,6 +1743,12 @@ struct btrfs_fs_info { /* For btrfs to record security options */ struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be latter freed. Protected by fs_info->chunk_mutex. 
+ */ + struct list_head pinned_chunks; }; struct btrfs_subvolume_writers { @@ -3405,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); + struct btrfs_root *root, u64 group_start, + struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e3e414c8501..30965120772b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + INIT_LIST_HEAD(&fs_info->pinned_chunks); + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -3715,6 +3717,17 @@ void close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5e64dda2db9..dbc115a25798 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9005,6 +9005,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9306,7 +9307,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int 
btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9319,6 +9321,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9464,6 +9467,61 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + lock_chunks(root); + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). 
+ */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + unlock_chunks(root); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..0ddc114e2aed 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,6 +27,7 @@ #include "disk-io.h" #include "extent_io.h" #include "inode-map.h" +#include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) @@ -3101,11 +3102,46 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, *trimmed = 0; + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); if (ret) - return ret; + goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + 
em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + lock_chunks(block_group->fs_info->chunk_root); + list_del_init(&em->list); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + } else { + spin_unlock(&block_group->lock); + } return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 01920515f90d..588f37e0a564 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; @@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, u64 *start, u64 len) { struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; - list_for_each_entry(em, &trans->transaction->pending_chunks, list) { +again: + list_for_each_entry(em, search_list, list) { struct map_lookup *map; int i; @@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, ret = 1; } } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } return ret; } @@ -2653,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); 
if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for the tree */ - free_extent_map(em); out: /* once for us */ free_extent_map(em); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4cc00e64427e..637bcfadadb2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -515,4 +515,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + #endif -- cgit v1.2.3 From 678886bdc6378c1cbd5072da2c5a3035000214e3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 7 Dec 2014 21:31:47 +0000 Subject: Btrfs: fix fs corruption on transaction abort if device supports discard When we abort a transaction we iterate over all the ranges marked as dirty in fs_info->freed_extents[0] and fs_info->freed_extents[1], clear them from those trees, add them back (unpin) to the free space caches and, if the fs was mounted with "-o discard", perform a discard on those regions. Also, after adding the regions to the free space caches, a fitrim ioctl call can see those ranges in a block group's free space cache and perform a discard on the ranges, so the same issue can happen without "-o discard" as well. 
This causes corruption, affecting one or multiple btree nodes (in the worst case leaving the fs unmountable) because some of those ranges (the ones in the fs_info->pinned_extents tree) correspond to btree nodes/leafs that are referred to by the last committed super block - breaking the rule that anything that was committed by a transaction is untouched until the next transaction commits successfully. I ran into this while running in a loop (for several hours) the fstest that I recently submitted: [PATCH] fstests: add btrfs test to stress chunk allocation/removal and fstrim The corruption always happened when a transaction aborted and then fsck complained like this: _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent *** fsck.btrfs output *** Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 read block failed check_tree_block Couldn't open file system In this case 94945280 corresponded to the root of a tree. Using ftrace, I observed the following sequence of steps: 1) transaction N started, fs_info->pinned_extents pointed to fs_info->freed_extents[0]; 2) node/eb 94945280 is created; 3) eb is persisted to disk; 4) transaction N commit starts, fs_info->pinned_extents now points to fs_info->freed_extents[1], and transaction N completes; 5) transaction N + 1 starts; 6) eb is COWed, and btrfs_free_tree_block() is called for this eb; 7) eb range (94945280 to 94945280 + 16Kb) is added to fs_info->pinned_extents (fs_info->freed_extents[1]); 8) Something goes wrong in transaction N + 1, like hitting ENOSPC for example, and the transaction is aborted, turning the fs into readonly mode. For example, the stack trace I got: [112065.253935] [] dump_stack+0x4d/0x66 [112065.254271] [] warn_slowpath_common+0x7f/0x98 [112065.254567] [] ?
__btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.261674] [] warn_slowpath_fmt+0x48/0x50 [112065.261922] [] ? btrfs_free_path+0x26/0x29 [btrfs] [112065.262211] [] __btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.262545] [] btrfs_remove_chunk+0x537/0x58b [btrfs] [112065.262771] [] btrfs_delete_unused_bgs+0x1de/0x21b [btrfs] [112065.263105] [] cleaner_kthread+0x100/0x12f [btrfs] (...) [112065.264493] ---[ end trace dd7903a975a31a08 ]--- [112065.264673] BTRFS: error (device sdc) in btrfs_remove_chunk:2625: errno=-28 No space left [112065.264997] BTRFS info (device sdc): forced readonly 9) The clear kthread sees that the BTRFS_FS_STATE_ERROR bit is set in fs_info->fs_state and calls btrfs_cleanup_transaction(), which in turn calls btrfs_destroy_pinned_extent(); 10) Then btrfs_destroy_pinned_extent() iterates over all the ranges marked as dirty in fs_info->freed_extents[], and for each one it calls discard, if the fs was mounted with "-o discard", and adds the range to the free space cache of the respective block group; 11) btrfs_trim_block_group(), invoked from the fitrim ioctl code path, sees the free space entries and performs a discard; 12) After an umount and mount (or fsck), our eb's location on disk was full of zeroes, although it should have been untouched, because it was marked as dirty in the fs_info->pinned_extents tree, and therefore used by the trees that the last committed superblock points to. Fix this by not performing a discard and not adding the ranges to the free space caches - it's useless from this point since the fs is now in readonly mode and we won't write free space caches to disk anymore (otherwise we would leak space) nor any new superblock. By not adding the ranges to the free space caches, we prevent other code paths from allocating that space and writing to it as well, therefore being safer and simpler. This isn't a new problem, as it's been present since 2011 (git commit acce952b0263825da32cf10489413dec78053347).
Cc: stable@vger.kernel.org # any kernel released after 2011-01-06 Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/extent-tree.c | 10 ++++++---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 30965120772b..8c63419a7f70 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4121,12 +4121,6 @@ again: if (ret) break; - /* opt_discard */ - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_error_discard_extent(root, start, - end + 1 - start, - NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 974b3edf69c7..4f3c03d9a575 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5727,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; @@ -5751,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (start < cache->last_byte_to_unpin) { len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); + if (return_free_space) + btrfs_add_free_space(cache, start, len); } start += len; @@ -5815,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); + unpin_extent_range(root, start, end, true); cond_resched(); } @@ -9693,7 +9695,7 @@ out: int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - return 
unpin_extent_range(root, start, end); + return unpin_extent_range(root, start, end, false); } int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, -- cgit v1.2.3