From 63dd86fa79db737a50f47488e5249f24e5acebc1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 13 Aug 2014 14:24:21 +0800 Subject: btrfs: fix rw_devices miss match after seed replace reproducer: reproducer: mount /dev/sdb /btrfs btrfs dev add /dev/sdc /btrfs btrfs rep start -B /dev/sdb /dev/sdd /btrfs umount /btrfs WARNING: CPU: 0 PID: 3882 at fs/btrfs/volumes.c:892 __btrfs_close_devices+0x1c8/0x200 [btrfs]() which is WARN_ON(fs_devices->rw_devices); The problem here is that we did not add one to the rw_devices when we replace the seed device with a writable device. Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index eea26e1b2fda..fb0a7fa2f70c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -562,6 +562,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + if (src_device->fs_devices->seeding) + fs_info->fs_devices->rw_devices++; /* replace the sysfs entry */ btrfs_kobj_rm_device(fs_info, src_device); -- cgit v1.2.3 From de4c296f63b43794df453a3fffbb4163ccd1c6af Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 13 Aug 2014 14:24:25 +0800 Subject: btrfs: fix typo in the log message there is no matching open parenthesis for the closing parenthesis Signed-off-by: Anand Jain Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index fb0a7fa2f70c..64657b3ae97a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -542,7 +542,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, -- cgit v1.2.3 From 12b894cb288d57292b01cf158177b6d5c89a6272 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Aug 2014 16:10:15 +0800 Subject: btrfs: Fix a deadlock in btrfs_dev_replace_finishing() btrfs-transacion:5657 [stack snip] btrfs_bio_map() btrfs_bio_counter_inc_blocked() percpu_counter_inc(&fs_info->bio_counter) ###bio_counter > 0(A) __btrfs_bio_map() btrfs_dev_replace_lock() mutex_lock(dev_replace->lock) ###wait mutex(B) btrfs:32612 [stack snip] btrfs_dev_replace_start() btrfs_dev_replace_lock() mutex_lock(dev_replace->lock) ###hold mutex(B) btrfs_dev_replace_finishing() btrfs_rm_dev_replace_blocked() wait until percpu_counter_sum == 0 ###wait on bio_counter(A) This bug can be triggered quite easily by the following test script: http://pastebin.com/MQmb37Cy This patch will fix the ABBA problem by calling btrfs_dev_replace_unlock() before btrfs_rm_dev_replace_blocked(). The consistency of btrfs devices list and their superblocks is protected by device_list_mutex, not btrfs_dev_replace_lock/unlock(). So it is safe the move btrfs_dev_replace_unlock() before btrfs_rm_dev_replace_blocked(). Reported-by: Zhao Lei Signed-off-by: Qu Wenruo Cc: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 64657b3ae97a..a85b5f53856e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -569,6 +569,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_kobj_rm_device(fs_info, src_device); btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_dev_replace_unlock(dev_replace); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); @@ -582,7 +584,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * superblock is scratched out so that it is no longer marked to * belong to this filesystem. */ - btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); -- cgit v1.2.3 From c7662111c741bc04a7192f2a00aad608cbc0b205 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:31 +0800 Subject: Btrfs: cleanup double assignment of device->bytes_used when device replace finishes Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a85b5f53856e..10dfb41f4c22 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -550,7 +550,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device->is_tgtdev_for_dev_replace = 0; tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; - tgt_device->bytes_used = src_device->bytes_used; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); -- cgit v1.2.3 From 1c43366d3b3f0fa6c6e81aaf3aa18e0550245dad Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:32 +0800 Subject: Btrfs: fix unprotected assignment of the target device We didn't protect the assignment of the target device, it might cause the problem that the super block update was skipped because we might find wrong size of the target device during the assignment. Fix it by moving the assignment sentences into the initialization function of the target device. And there is another merit that we can check if the target device is suitable more early. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 32 ++++++++------------------------ fs/btrfs/volumes.c | 23 +++++++++++++++++++---- fs/btrfs/volumes.h | 1 + 3 files changed, 28 insertions(+), 28 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 10dfb41f4c22..72dc02e82945 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -330,29 +330,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, return -EINVAL; mutex_lock(&fs_info->volume_mutex); - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, - &tgt_device); - if (ret) { - btrfs_err(fs_info, "target device %s is invalid!", - args->start.tgtdev_name); - mutex_unlock(&fs_info->volume_mutex); - return -EINVAL; - } - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, &src_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) { - ret = -EINVAL; - goto leave_no_lock; + mutex_unlock(&fs_info->volume_mutex); + return ret; } - if (tgt_device->total_bytes < src_device->total_bytes) { - btrfs_err(fs_info, "target device is smaller than source device!"); - ret = -EINVAL; - goto leave_no_lock; - } + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + src_device, &tgt_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) + return ret; btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -380,10 +370,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; - /* * from now on, the writes to the srcdev are all duplicated to * go to the tgtdev as well (refer to btrfs_map_block()). @@ -426,9 +412,7 @@ leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; btrfs_dev_replace_unlock(dev_replace); -leave_no_lock: - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 483fc6d45299..1646659f2800 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2295,6 +2295,7 @@ error: } int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out) { struct request_queue *q; @@ -2307,24 +2308,37 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; + } bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, fs_info->bdev_holder); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev); + } filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { + btrfs_err(fs_info, "target device is in the filesystem!"); ret = -EEXIST; goto error; } } + + if (i_size_read(bdev->bd_inode) < srcdev->total_bytes) { + btrfs_err(fs_info, "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + device = btrfs_alloc_device(NULL, &devid, NULL); if (IS_ERR(device)) { ret = PTR_ERR(device); @@ -2348,8 +2362,9 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->io_width = root->sectorsize; device->io_align = root->sectorsize; device->sector_size = root->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); - device->disk_total_bytes = device->total_bytes; + device->total_bytes = srcdev->total_bytes; + device->disk_total_bytes = srcdev->disk_total_bytes; + device->bytes_used = srcdev->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37f8bff97df1..e15f2886d33e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -322,6 +322,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); -- cgit v1.2.3 From 935e5cc935bcbf9b3d0dd59fed7dbc0f2ebca6bc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:33 +0800 Subject: Btrfs: fix wrong disk size when writing super blocks total_size will be changed when resizing a device, and disk_total_size will be changed if resizing is successful. Meanwhile, the on-disk super blocks of the previous transaction might not be updated. Considering the consistency of the metadata in the previous transaction, We should use the size in the previous transaction to check if the super block is beyond the boundary of the device. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/dev-replace.c | 18 ++++++++++++++++++ fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/scrub.c | 3 ++- fs/btrfs/transaction.c | 2 ++ fs/btrfs/volumes.c | 40 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 18 ++++++++++++++++++ 7 files changed, 83 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e0033c843ce7..cb7f3fe9c9f6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; bh = __bread(superblock_bdev, dev_bytenr / 4096, BTRFS_SUPER_INFO_SIZE); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 72dc02e82945..7877b0fc6a8d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -168,6 +168,8 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->total_bytes; dev_replace->tgtdev->disk_total_bytes = dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; } @@ -329,6 +331,20 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + + /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, @@ -539,6 +555,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); tgt_device->total_bytes = src_device->total_bytes; tgt_device->disk_total_bytes = src_device->disk_total_bytes; + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dbd792754b27..0cd18b725554 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3127,7 +3127,8 @@ static int write_dev_supers(struct btrfs_device *device, for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) break; if (wait) { @@ -3444,7 +3445,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, - dev->disk_total_bytes); + dev->commit_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 72c8981e7c0a..9d80e37044db 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2840,7 +2840,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e336646508fe..2f7c0bef4043 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1868,6 +1868,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); + btrfs_update_commit_device_size(root->fs_info); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1646659f2800..7b5c04259a6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->list); @@ -154,6 +155,7 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->resized_list); spin_lock_init(&dev->io_lock); @@ -2168,6 +2170,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2364,6 +2367,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->sector_size = root->sectorsize; device->total_bytes = srcdev->total_bytes; device->disk_total_bytes = srcdev->disk_total_bytes; + ASSERT(list_empty(&srcdev->resized_list)); + device->commit_total_bytes = srcdev->commit_total_bytes; device->bytes_used = srcdev->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; @@ -2448,6 +2453,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, { struct btrfs_super_block *super_copy = device->dev_root->fs_info->super_copy; + struct btrfs_fs_devices *fs_devices; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -2457,12 +2463,17 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->is_tgtdev_for_dev_replace) return -EINVAL; + fs_devices = device->dev_root->fs_info->fs_devices; + btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &fs_devices->resized_devices); return btrfs_update_device(trans, device); } @@ -4011,8 +4022,11 @@ again: } lock_chunks(root); - device->disk_total_bytes = new_size; + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &root->fs_info->fs_devices->resized_devices); + /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); if (ret) { @@ -5993,6 +6007,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); @@ -6520,3 +6535,26 @@ int btrfs_scratch_superblock(struct btrfs_device *device) return 0; } + +/* + * Update the size of all devices, which is used for writing out the + * super blocks. + */ +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *curr, *next; + + if (list_empty(&fs_devices->resized_devices)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + lock_chunks(fs_info->dev_root); + list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, + resized_list) { + list_del_init(&curr->resized_list); + curr->commit_total_bytes = curr->disk_total_bytes; + } + unlock_chunks(fs_info->dev_root); + mutex_unlock(&fs_devices->device_list_mutex); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e15f2886d33e..b30d018fa359 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -87,6 +87,21 @@ struct btrfs_device { /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* + * size of the device on the current transaction + * + * This variant is update when committing the transaction, + * and protected by device_list_mutex + */ + u64 commit_total_bytes; + + /* + * used to manage the device which is resized + * + * It is protected by chunk_lock. + */ + struct list_head resized_list; + /* for sending down flush barriers */ int nobarriers; struct bio *flush_bio; @@ -136,6 +151,7 @@ struct btrfs_fs_devices { struct mutex device_list_mutex; struct list_head devices; + struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; struct list_head list; @@ -402,4 +418,6 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, { btrfs_dev_stat_set(dev, index, 0); } + +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); #endif -- cgit v1.2.3 From ce7213c70c37e3a66bc0b50c45edcbfea505f62f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:34 +0800 Subject: Btrfs: fix wrong device bytes_used in the super block device->bytes_used will be changed when allocating a new chunk, and disk_total_size will be changed if resizing is successful. Meanwhile, the on-disk super blocks of the previous transaction might not be updated. Considering the consistency of the metadata in the previous transaction, We should use the size in the previous transaction to check if the super block is beyond the boundary of the device. Though it is not big problem because we don't use it now, but anyway it is better that we make it be consistent with the common metadata, maybe we will use it in the future. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 3 +++ fs/btrfs/disk-io.c | 3 ++- fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 ++++ 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7877b0fc6a8d..1be03d85d267 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -172,6 +172,8 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; } dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; btrfs_init_dev_replace_tgtdev_for_resume(fs_info, @@ -558,6 +560,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->bytes_used = src_device->bytes_used; + tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cd18b725554..a224fb9b34a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3446,7 +3446,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, dev->commit_total_bytes); - btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2f7c0bef4043..16d0c1b62b3e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1869,6 +1869,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, sizeof(*root->fs_info->super_copy)); btrfs_update_commit_device_size(root->fs_info); + btrfs_update_commit_device_bytes_used(root, cur_trans); spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7b5c04259a6e..f8273bb53b3f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2370,6 +2370,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->bytes_used = srcdev->bytes_used; + device->commit_bytes_used = device->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -6009,6 +6010,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->total_bytes = device->disk_total_bytes; device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); @@ -6558,3 +6560,28 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) unlock_chunks(fs_info->dev_root); mutex_unlock(&fs_devices->device_list_mutex); } + +/* Must be invoked during the transaction commit */ +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + int i; + + if (list_empty(&transaction->pending_chunks)) + return; + + /* In order to kick the device replace finish process */ + lock_chunks(root); + list_for_each_entry(em, &transaction->pending_chunks, list) { + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; + } + } + unlock_chunks(root); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b30d018fa359..f79d532fedb0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -95,6 +95,8 @@ struct btrfs_device { */ u64 commit_total_bytes; + /* bytes used on the current transaction */ + u64 commit_bytes_used; /* * used to manage the device which is resized * @@ -420,4 +422,6 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, } void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction); #endif -- cgit v1.2.3 From 7cc8e58d53cd2295c3c1cee7b503bd1790ea4486 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:38 +0800 Subject: Btrfs: fix unprotected device's variants on 32bits machine ->total_bytes,->disk_total_bytes,->bytes_used is protected by chunk lock when we change them, but sometimes we read them without any lock, and we might get unexpected value. We fix this problem like inode's i_size. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 15 +++++---- fs/btrfs/ioctl.c | 6 ++-- fs/btrfs/volumes.c | 48 +++++++++++++++++------------ fs/btrfs/volumes.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 29 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1be03d85d267..da7ac1432b15 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -418,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, /* the disk copy procedure reuses the scrub code */ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, - src_device->total_bytes, + btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); @@ -555,11 +555,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; @@ -650,6 +651,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *srcdev; btrfs_dev_replace_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for @@ -672,8 +674,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + srcdev = dev_replace->srcdev; args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(dev_replace->srcdev->total_bytes, 1000)); + div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -832,7 +835,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, - dev_replace->srcdev->total_bytes, + btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f60d1ca389f0..0ff212757b95 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1553,7 +1553,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); if (mod < 0) { if (new_size > old_size) { @@ -2740,8 +2740,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) } di_args->devid = dev->devid; - di_args->bytes_used = dev->bytes_used; - di_args->total_bytes = dev->total_bytes; + di_args->bytes_used = btrfs_device_get_bytes_used(dev); + di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { struct rcu_string *name; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8e4a3d1ad89..41da102cdcc0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1308,7 +1308,7 @@ again: if (device->bytes_used > 0) { u64 len = btrfs_dev_extent_length(leaf, extent); - device->bytes_used -= len; + btrfs_device_set_bytes_used(device, device->bytes_used - len); spin_lock(&root->fs_info->free_chunk_lock); root->fs_info->free_chunk_space += len; spin_unlock(&root->fs_info->free_chunk_lock); @@ -1462,8 +1462,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); btrfs_set_device_bandwidth(leaf, dev_item, 0); @@ -2330,7 +2332,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, } - if (i_size_read(bdev->bd_inode) < srcdev->total_bytes) { + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; goto error; @@ -2360,11 +2363,11 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->io_width = root->sectorsize; device->io_align = root->sectorsize; device->sector_size = root->sectorsize; - device->total_bytes = srcdev->total_bytes; - device->disk_total_bytes = srcdev->disk_total_bytes; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; - device->bytes_used = srcdev->bytes_used; device->commit_bytes_used = device->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; @@ -2435,8 +2438,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_mark_buffer_dirty(leaf); out: @@ -2464,8 +2469,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; - device->total_bytes = new_size; - device->disk_total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); + btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->dev_root->fs_info); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -3110,11 +3115,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free || + btrfs_device_get_total_bytes(device) - + btrfs_device_get_bytes_used(device) > size_to_free || device->is_tgtdev_for_dev_replace) continue; @@ -3920,8 +3926,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); - u64 old_size = device->total_bytes; - u64 diff = device->total_bytes - new_size; + u64 old_size = btrfs_device_get_total_bytes(device); + u64 diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -3934,7 +3940,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); - device->total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -4000,7 +4006,7 @@ again: ret = -ENOSPC; lock_chunks(root); - device->total_bytes = old_size; + btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -4018,7 +4024,7 @@ again: } lock_chunks(root); - device->disk_total_bytes = new_size; + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, &root->fs_info->fs_devices->resized_devices); @@ -4429,8 +4435,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) - map->stripes[i].dev->bytes_used += stripe_size; + for (i = 0; i < map->num_stripes; i++) { + num_bytes = map->stripes[i].dev->bytes_used + stripe_size; + btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); + } spin_lock(&extent_root->fs_info->free_chunk_lock); extent_root->fs_info->free_chunk_space -= (stripe_size * diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f79d532fedb0..76600a3fedbe 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -32,6 +32,19 @@ struct btrfs_pending_bios { struct bio *tail; }; +/* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include +#define __BTRFS_NEED_DEVICE_DATA_ORDERED +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) +#else +#define btrfs_device_data_ordered_init(device) do { } while (0) +#endif + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; @@ -61,6 +74,10 @@ struct btrfs_device { int can_discard; int is_tgtdev_for_dev_replace; +#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED + seqcount_t data_seqcount; +#endif + /* the internal btrfs device id */ u64 devid; @@ -133,6 +150,73 @@ struct btrfs_device { atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; +/* + * If we read those variants at the context of their own lock, we needn't + * use the following helpers, reading them directly is safe. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + unsigned int seq; \ + \ + do { \ + seq = read_seqcount_begin(&dev->data_seqcount); \ + size = dev->name; \ + } while (read_seqcount_retry(&dev->data_seqcount, seq)); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + write_seqcount_begin(&dev->data_seqcount); \ + dev->name = size; \ + write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ +} +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + \ + preempt_disable(); \ + size = dev->name; \ + preempt_enable(); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + dev->name = size; \ + preempt_enable(); \ +} +#else +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + return dev->name; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + dev->name = size; \ +} +#endif + +BTRFS_DEVICE_GETSET_FUNCS(total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(bytes_used); + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ -- cgit v1.2.3 From 2196d6e8a71fc901e31c1d81581fc6cc6c64913e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:41 +0800 Subject: Btrfs: Fix misuse of chunk mutex There were several problems about chunk mutex usage: - Lock chunk mutex when updating metadata. It would cause the nested deadlock because updating metadata might need allocate new chunks that need acquire chunk mutex. We remove chunk mutex at this case, because b-tree lock and other lock mechanism can help us. - ABBA deadlock occured between device_list_mutex and chunk_mutex. When we update device status, we must acquire device_list_mutex at the beginning, and then we might get chunk_mutex during the device status update because we need allocate new chunks for metadata COW. But at most place, we acquire chunk_mutex at first and then acquire device list mutex. We need change the lock order. - Some place we needn't acquire chunk_mutex. For example we needn't get chunk_mutex when we free a empty seed fs_devices structure. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 6 +-- fs/btrfs/extent-tree.c | 2 - fs/btrfs/volumes.c | 129 ++++++++++++++++++++++++------------------------- 3 files changed, 65 insertions(+), 72 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index da7ac1432b15..aa4c82863c73 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -510,8 +510,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED @@ -534,8 +534,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -589,8 +589,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * superblock is scratched out so that it is no longer marked to * belong to this filesystem. */ - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2191a2c7496b..b30ddb49cfab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9415,8 +9415,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); - btrfs_clear_space_info_full(root->fs_info); - btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9f22398d465f..105c5fe004db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1264,7 +1264,7 @@ out: static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 start) + u64 start, u64 *dev_extent_len) { int ret; struct btrfs_path *path; @@ -1306,13 +1306,8 @@ again: goto out; } - if (device->bytes_used > 0) { - u64 len = btrfs_dev_extent_length(leaf, extent); - btrfs_device_set_bytes_used(device, device->bytes_used - len); - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += len; - spin_unlock(&root->fs_info->free_chunk_lock); - } + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); if (ret) { btrfs_error(root->fs_info, ret, @@ -1521,7 +1516,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; - lock_chunks(root); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1537,7 +1531,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, goto out; out: btrfs_free_path(path); - unlock_chunks(root); btrfs_commit_transaction(trans, root); return ret; } @@ -1726,9 +1719,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - lock_chunks(root); __btrfs_close_devices(cur_devices); - unlock_chunks(root); free_fs_devices(cur_devices); } @@ -1990,11 +1981,12 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); + list_for_each_entry(device, &seed_devices->devices, dev_list) + device->fs_devices = seed_devices; + lock_chunks(root); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - list_for_each_entry(device, &seed_devices->devices, dev_list) { - device->fs_devices = seed_devices; - } + unlock_chunks(root); fs_devices->seeding = 0; fs_devices->num_devices = 0; @@ -2155,8 +2147,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - lock_chunks(root); - q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; @@ -2185,6 +2175,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->fs_devices = root->fs_info->fs_devices; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + lock_chunks(root); list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -2212,15 +2203,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) /* add sysfs device entry */ btrfs_kobj_add_device(root->fs_info, device); + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + lock_chunks(root); ret = init_first_rw_device(trans, root, device); + unlock_chunks(root); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error_trans; } + } + + ret = btrfs_add_device(trans, root, device); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + ret = btrfs_finish_sprout(trans, root); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2234,21 +2244,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fsid); if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) goto error_trans; - } else { - ret = btrfs_add_device(trans, root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error_trans; - } } - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(root->fs_info); - - unlock_chunks(root); root->fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); ret = btrfs_commit_transaction(trans, root); @@ -2280,7 +2277,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return ret; error_trans: - unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); btrfs_kobj_rm_device(root->fs_info, device); @@ -2449,20 +2445,27 @@ out: return ret; } -static int __btrfs_grow_device(struct btrfs_trans_handle *trans, +int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = device->dev_root->fs_info->super_copy; struct btrfs_fs_devices *fs_devices; - u64 old_total = btrfs_super_total_bytes(super_copy); - u64 diff = new_size - device->total_bytes; + u64 old_total; + u64 diff; if (!device->writeable) return -EACCES; + + lock_chunks(device->dev_root); + old_total = btrfs_super_total_bytes(super_copy); + diff = new_size - device->total_bytes; + if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace) { + unlock_chunks(device->dev_root); return -EINVAL; + } fs_devices = device->dev_root->fs_info->fs_devices; @@ -2475,20 +2478,11 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, &fs_devices->resized_devices); + unlock_chunks(device->dev_root); return btrfs_update_device(trans, device); } -int btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size) -{ - int ret; - lock_chunks(device->dev_root); - ret = __btrfs_grow_device(trans, device, new_size); - unlock_chunks(device->dev_root); - return ret; -} - static int btrfs_free_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 chunk_tree, u64 chunk_objectid, @@ -2540,6 +2534,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 u32 cur; struct btrfs_key key; + lock_chunks(root); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; @@ -2569,6 +2564,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 cur += len; } } + unlock_chunks(root); return ret; } @@ -2579,8 +2575,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, struct extent_map_tree *em_tree; struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; + struct btrfs_device *device; struct extent_map *em; struct map_lookup *map; + u64 dev_extent_len = 0; int ret; int i; @@ -2604,8 +2602,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; } - lock_chunks(root); - /* * step two, delete the device extents and the * chunk tree entries @@ -2619,10 +2615,23 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, - map->stripes[i].physical); + device = map->stripes[i].dev; + ret = btrfs_free_dev_extent(trans, device, + map->stripes[i].physical, + &dev_extent_len); BUG_ON(ret); + if (device->bytes_used > 0) { + lock_chunks(root); + btrfs_device_set_bytes_used(device, + device->bytes_used - dev_extent_len); + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += dev_extent_len; + spin_unlock(&root->fs_info->free_chunk_lock); + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); + } + if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); BUG_ON(ret); @@ -2652,7 +2661,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* once for us */ free_extent_map(em); - unlock_chunks(root); btrfs_end_transaction(trans, root); return 0; } @@ -4029,16 +4037,12 @@ again: list_add_tail(&device->resized_list, &root->fs_info->fs_devices->resized_devices); - /* Now btrfs_update_device() will change the on-disk size. */ - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); + + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); btrfs_end_transaction(trans, root); done: btrfs_free_path(path); @@ -4612,15 +4616,6 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, alloc_profile); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - btrfs_abort_transaction(trans, root, ret); -out: return ret; } -- cgit v1.2.3 From 67a2c45ee7f4f250458279a2e1244679c5d9735c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:43 +0800 Subject: Btrfs: fix use-after-free problem of the device during device replace The problem is: Task0(device scan task) Task1(device replace task) scan_one_device() mutex_lock(&uuid_mutex) device = find_device() mutex_lock(&device_list_mutex) lock_chunk() rm_and_free_source_device unlock_chunk() mutex_unlock(&device_list_mutex) check device Destroying the target device if device replace fails also has the same problem. We fix this problem by locking uuid_mutex during destroying source device or target device, just like the device remove operation. It is a temporary solution, we can fix this problem and make the code more clear by atomic counter in the future. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 3 +++ fs/btrfs/volumes.c | 4 +++- fs/btrfs/volumes.h | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index aa4c82863c73..e9cbbdb72978 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -509,6 +509,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ret = btrfs_commit_transaction(trans, root); WARN_ON(ret); + mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&root->fs_info->fs_devices->device_list_mutex); mutex_lock(&root->fs_info->chunk_mutex); @@ -536,6 +537,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -591,6 +593,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ mutex_unlock(&root->fs_info->chunk_mutex); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d28e1761fdeb..a15c8ac5d5b3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static DEFINE_MUTEX(uuid_mutex); +DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); static void lock_chunks(struct btrfs_root *root) @@ -1867,6 +1867,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *next_device; + mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); if (tgtdev->bdev) { @@ -1886,6 +1887,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, call_rcu(&tgtdev->rcu, free_device); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 76600a3fedbe..2b37da3dd408 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,6 +24,8 @@ #include #include "async-thread.h" +extern struct mutex uuid_mutex; + #define BTRFS_STRIPE_LEN (64 * 1024) struct buffer_head; -- cgit v1.2.3 From 82372bc816d75722c24d1abadb11cd8c0a33883a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 3 Sep 2014 21:35:44 +0800 Subject: Btrfs: make the logic of source device removing more clear Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 3 +-- fs/btrfs/volumes.c | 19 +++++++------------ 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e9cbbdb72978..6f662b34ba0e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -569,8 +569,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - if (src_device->fs_devices->seeding) - fs_info->fs_devices->rw_devices++; + fs_info->fs_devices->rw_devices++; /* replace the sysfs entry */ btrfs_kobj_rm_device(fs_info, src_device); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a15c8ac5d5b3..2e078fa705a6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1819,23 +1819,18 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, list_del_rcu(&srcdev->dev_list); list_del_rcu(&srcdev->dev_alloc_list); fs_devices->num_devices--; - if (srcdev->missing) { + if (srcdev->missing) fs_devices->missing_devices--; - if (!fs_devices->seeding) - fs_devices->rw_devices++; + + if (srcdev->writeable) { + fs_devices->rw_devices--; + /* zero out the old super if it is writable */ + btrfs_scratch_superblock(srcdev); } - if (srcdev->bdev) { + if (srcdev->bdev) fs_devices->open_devices--; - /* - * zero out the old super if it is not writable - * (e.g. seed device) - */ - if (srcdev->writeable) - btrfs_scratch_superblock(srcdev); - } - call_rcu(&srcdev->rcu, free_device); /* -- cgit v1.2.3 From 2fc9f6baa24ac230166df41ed636224969523341 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 13 Oct 2014 12:42:12 +0800 Subject: Btrfs: return failure if btrfs_dev_replace_finishing() failed device replace could fail due to another running scrub process or any other errors btrfs_scrub_dev() may hit, but this failure doesn't get returned to userspace. The following steps could reproduce this issue mkfs -t btrfs -f /dev/sdb1 /dev/sdb2 mount /dev/sdb1 /mnt/btrfs while true; do btrfs scrub start -B /mnt/btrfs >/dev/null 2>&1; done & btrfs replace start -Bf /dev/sdb2 /dev/sdb3 /mnt/btrfs # if this replace succeeded, do the following and repeat until # you see this log in dmesg # BTRFS: btrfs_scrub_dev(/dev/sdb2, 2, /dev/sdb3) failed -115 #btrfs replace start -Bf /dev/sdb3 /dev/sdb2 /mnt/btrfs # once you see the error log in dmesg, check return value of # replace echo $? Introduce a new dev replace result BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to catch -EINPROGRESS explicitly and return other errors directly to userspace. Signed-off-by: Eryu Guan Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 12 +++++++++--- include/uapi/linux/btrfs.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6f662b34ba0e..971c061eb1ce 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -422,9 +422,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; @@ -542,7 +548,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } printk_in_rcu(KERN_INFO diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2f47824e7a36..611e1c5893b4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -157,6 +157,7 @@ struct btrfs_ioctl_dev_replace_status_params { #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS 3 struct btrfs_ioctl_dev_replace_args { __u64 cmd; /* in */ __u64 result; /* out */ -- cgit v1.2.3 From 084b6e7c7607bbeb28544da659c3f5981a4689b0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 30 Oct 2014 16:52:31 +0800 Subject: btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 11 ++++++----- fs/btrfs/volumes.c | 10 ++++++++-- fs/btrfs/volumes.h | 6 ++++-- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 971c061eb1ce..3fbd0628620b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -577,15 +577,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); - btrfs_dev_replace_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); - btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); btrfs_rm_dev_replace_unblocked(fs_info); @@ -600,6 +596,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); if (!IS_ERR(trans)) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d47289c715c8..01920515f90d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1800,8 +1800,8 @@ error_undo: goto error_brelse; } -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; @@ -1829,6 +1829,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; call_rcu(&srcdev->rcu, free_device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 08980fa23039..4cc00e64427e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -448,8 +448,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 4245215d6a8dba1a51c50533b6667919687c0b89 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 25 Nov 2014 16:39:28 +0800 Subject: Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56 The commit c404e0dc (Btrfs: fix use-after-free in the finishing procedure of the device replace) fixed a use-after-free problem which happened when removing the source device at the end of device replace, but at that time, btrfs didn't support device replace on raid56, so we didn't fix the problem on the raid56 profile. Currently, we implemented device replace for raid56, so we need kick that problem out before we enable that function for raid56. The fix method is very simple, we just increase the bio per-cpu counter before we submit a raid56 io, and decrease the counter when the raid56 io ends. Signed-off-by: Miao Xie --- fs/btrfs/ctree.h | 7 ++++++- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/raid56.c | 41 ++++++++++++++++++++++++++++++++--------- fs/btrfs/raid56.h | 4 ++-- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 7 ++----- 6 files changed, 45 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe69edda11fb..470e3177a7e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4097,7 +4097,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} /* reada.c */ struct reada_control { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6f662b34ba0e..fa27b4e3b6c8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -920,9 +920,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) percpu_counter_inc(&fs_info->bio_counter); } -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { - percpu_counter_dec(&fs_info->bio_counter); + percpu_counter_sub(&fs_info->bio_counter, amount); if (waitqueue_active(&fs_info->replace_wait)) wake_up(&fs_info->replace_wait); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5ece565bc5f0..8ab2a17bbba8 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -162,6 +162,8 @@ struct btrfs_raid_bio { */ int bio_list_bytes; + int generic_bio_cnt; + atomic_t refs; atomic_t stripes_pending; @@ -354,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -891,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; + + if (rbio->generic_bio_cnt) + btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + free_raid_bio(rbio); while (cur) { @@ -1775,6 +1782,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; + int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); if (IS_ERR(rbio)) { @@ -1785,12 +1793,19 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + /* * don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - return full_stripe_write(rbio); + if (rbio_is_full(rbio)) { + ret = full_stripe_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); + return ret; + } cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, sizeof(*plug)); @@ -1801,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); + ret = 0; } else { - return __raid56_parity_write(rbio); + ret = __raid56_parity_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); } - return 0; + return ret; } /* @@ -2139,19 +2157,17 @@ cleanup: */ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int hold_bbio) + u64 stripe_len, int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio); + __free_bbio_and_raid_map(bbio, raid_map, generic_io); return PTR_ERR(rbio); } - if (hold_bbio) - set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); rbio->operation = BTRFS_RBIO_READ_REBUILD; bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_iter.bi_size; @@ -2159,11 +2175,18 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio); + __free_bbio_and_raid_map(bbio, raid_map, generic_io); kfree(rbio); return -EIO; } + if (generic_io) { + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + } else { + set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); + } + /* * reconstruct from the q stripe if they are * asking for mirror 3 diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3d4ddb3d861d..31d4a157b5e3 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -43,8 +43,8 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int hold_bbio); + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 *raid_map, u64 stripe_len); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0ae837fd676d..27f2e16cd259 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1477,7 +1477,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, page->recover->raid_map, page->recover->map_length, - page->mirror_num, 1); + page->mirror_num, 0); if (ret) return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6d8a5e8d8c39..cbb766577f31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5843,12 +5843,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } else { ret = raid56_parity_recover(root, bio, bbio, raid_map, map_length, - mirror_num, 0); + mirror_num, 1); } - /* - * FIXME, replace dosen't support raid56 yet, please fix - * it in the future. - */ + btrfs_bio_counter_dec(root->fs_info); return ret; } -- cgit v1.2.3 From 5d3edd8f44aac94de7b16f4c54290e24f5e8c532 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 13 Nov 2014 11:45:38 +0800 Subject: Btrfs, replace: enable dev-replace for raid56 Signed-off-by: Zhao Lei Signed-off-by: Miao Xie --- fs/btrfs/dev-replace.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index fa27b4e3b6c8..0bf41f8b1e23 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - if (btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EOPNOTSUPP; - } - switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: -- cgit v1.2.3