diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 515 |
1 files changed, 234 insertions, 281 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 292266f6ab9c..e034ad9e23b4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid10", + .bg_flag = BTRFS_BLOCK_GROUP_RAID10, + .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, @@ -49,6 +52,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .raid_name = "raid1", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1, + .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, @@ -58,6 +64,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .raid_name = "dup", + .bg_flag = BTRFS_BLOCK_GROUP_DUP, + .mindev_error = 0, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, @@ -67,6 +76,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "raid0", + .bg_flag = BTRFS_BLOCK_GROUP_RAID0, + .mindev_error = 0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, @@ -76,6 +88,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .raid_name = "single", + .bg_flag = 0, + .mindev_error = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, @@ -85,6 +100,9 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 1, .ncopies = 2, + .raid_name = "raid5", + .bg_flag = BTRFS_BLOCK_GROUP_RAID5, + .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, @@ -94,33 +112,19 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 2, .devs_increment = 1, .ncopies = 3, + .raid_name = "raid6", + .bg_flag = BTRFS_BLOCK_GROUP_RAID6, + .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; -const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, - [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, - [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, - [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, - [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, -}; +const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; -/* - * Table to convert BTRFS_RAID_* to the error code if minimum number of devices - * condition is not met. Zero means there's no corresponding - * BTRFS_ERROR_DEV_*_NOT_MET value. - */ -const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, - [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, - [BTRFS_RAID_DUP] = 0, - [BTRFS_RAID_RAID0] = 0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, - [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -}; + return btrfs_raid_array[type].raid_name; +} static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -167,12 +171,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * - * volume_mutex - * ------------ - * coarse lock owned by a mounted filesystem; used to exclude some operations - * that cannot run in parallel and affect the higher-level properties of the - * filesystem like: device add/deleting/resize/replace, or balance - * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -197,6 +195,41 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * device_list_mutex * chunk_mutex * balance_mutex + * + * + * Exclusive operations, BTRFS_FS_EXCL_OP + * ====================================== + * + * Maintains the exclusivity of the following operations that apply to the + * whole filesystem and cannot run in parallel. + * + * - Balance (*) + * - Device add + * - Device remove + * - Device replace (*) + * - Resize + * + * The device operations (as above) can be in one of the following states: + * + * - Running state + * - Paused state + * - Completed state + * + * Only device operations marked with (*) can go into the Paused state for the + * following reasons: + * + * - ioctl (only Balance can be Paused through ioctl) + * - filesystem remounted as read-only + * - filesystem unmounted and mounted as read-only + * - system power-cycle and filesystem mounted as read-only + * - filesystem or device errors leading to forced read-only + * + * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. + * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * A device operation in Paused or Running state can be canceled or resumed + * either by ioctl (Balance only) or when remounted as read-write. + * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * completed. */ DEFINE_MUTEX(uuid_mutex); @@ -227,14 +260,14 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); - INIT_LIST_HEAD(&fs_devs->list); + INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); return fs_devs; } -static void free_device(struct btrfs_device *device) +void btrfs_free_device(struct btrfs_device *device) { rcu_string_free(device->name); bio_put(device->flush_bio); @@ -249,7 +282,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - free_device(device); + btrfs_free_device(device); } kfree(fs_devices); } @@ -273,8 +306,8 @@ void __exit btrfs_cleanup_fs_uuids(void) while (!list_empty(&fs_uuids)) { fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, list); - list_del(&fs_devices->list); + struct btrfs_fs_devices, fs_list); + list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } @@ -282,7 +315,7 @@ void __exit btrfs_cleanup_fs_uuids(void) /* * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. * Returned struct is not linked onto any lists and must be destroyed using - * free_device. + * btrfs_free_device. */ static struct btrfs_device *__alloc_device(void) { @@ -327,10 +360,9 @@ static struct btrfs_device *__alloc_device(void) static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, u64 devid, const u8 *uuid) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *dev; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (dev->devid == devid && (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { return dev; @@ -343,7 +375,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) { struct btrfs_fs_devices *fs_devices; - list_for_each_entry(fs_devices, &fs_uuids, list) { + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) return fs_devices; } @@ -607,7 +639,7 @@ static void btrfs_free_stale_devices(const char *path, struct btrfs_fs_devices *fs_devs, *tmp_fs_devs; struct btrfs_device *dev, *tmp_dev; - list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) { + list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, fs_list) { if (fs_devs->opened) continue; @@ -632,13 +664,13 @@ static void btrfs_free_stale_devices(const char *path, /* delete the stale device */ if (fs_devs->num_devices == 1) { btrfs_sysfs_remove_fsid(fs_devs); - list_del(&fs_devs->list); + list_del(&fs_devs->fs_list); free_fs_devices(fs_devs); break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); - free_device(dev); + btrfs_free_device(dev); } } } @@ -732,7 +764,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - list_add(&fs_devices->list, &fs_uuids); + list_add(&fs_devices->fs_list, &fs_uuids); device = NULL; } else { @@ -753,7 +785,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - free_device(device); + btrfs_free_device(device); return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); @@ -866,7 +898,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - free_device(device); + btrfs_free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -938,7 +970,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - free_device(device); + btrfs_free_device(device); } if (fs_devices->seed) { @@ -956,7 +988,7 @@ static void free_device_rcu(struct rcu_head *head) struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - free_device(device); + btrfs_free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) @@ -1005,7 +1037,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device) new_device->fs_devices = device->fs_devices; } -static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +static int close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; struct list_head pending_put; @@ -1050,7 +1082,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) int ret; mutex_lock(&uuid_mutex); - ret = __btrfs_close_devices(fs_devices); + ret = close_fs_devices(fs_devices); if (!fs_devices->opened) { seed_devices = fs_devices->seed; fs_devices->seed = NULL; @@ -1060,23 +1092,22 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) while (seed_devices) { fs_devices = seed_devices; seed_devices = fs_devices->seed; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } return ret; } -static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, +static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; int ret = 0; flags |= FMODE_EXCL; - list_for_each_entry(device, head, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { /* Just open everything we can; ignore failures here */ if (btrfs_open_one_device(fs_devices, device, flags, holder)) continue; @@ -1115,15 +1146,16 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, { int ret; - mutex_lock(&uuid_mutex); + mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; } else { list_sort(NULL, &fs_devices->devices, devid_cmp); - ret = __btrfs_open_devices(fs_devices, flags, holder); + ret = open_fs_devices(fs_devices, flags, holder); } - mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; } @@ -1201,31 +1233,29 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, */ bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; - mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { ret = -EINVAL; goto error_bdev_put; } + mutex_lock(&uuid_mutex); device = device_list_add(path, disk_super); if (IS_ERR(device)) ret = PTR_ERR(device); else *fs_devices_ret = device->fs_devices; + mutex_unlock(&uuid_mutex); btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); -error: - mutex_unlock(&uuid_mutex); + return ret; } @@ -1857,11 +1887,11 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - if (!(all_avail & btrfs_raid_group[i])) + if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) { - int ret = btrfs_raid_mindev_error[i]; + int ret = btrfs_raid_array[i].mindev_error; if (ret) return ret; @@ -1917,13 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); - num_devices = fs_info->fs_devices->num_devices; + num_devices = fs_devices->num_devices; btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); @@ -1986,27 +2016,32 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * (super_copy) should hold the device list mutex. */ + /* + * In normal cases the cur_devices == fs_devices. But in case + * of deleting a seed device, the cur_devices should point to + * its own fs_devices listed under the fs_devices->seed. + */ cur_devices = device->fs_devices; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); - device->fs_devices->num_devices--; - device->fs_devices->total_devices--; + cur_devices->num_devices--; + cur_devices->total_devices--; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - device->fs_devices->missing_devices--; + cur_devices->missing_devices--; btrfs_assign_next_active_device(fs_info, device, NULL); if (device->bdev) { - device->fs_devices->open_devices--; + cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_devices, device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; btrfs_set_super_num_devices(fs_info->super_copy, num_devices); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * at this point, the device is zero sized and detached from @@ -2020,8 +2055,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { - struct btrfs_fs_devices *fs_devices; - fs_devices = fs_info->fs_devices; while (fs_devices) { if (fs_devices->seed == cur_devices) { fs_devices->seed = cur_devices->seed; @@ -2030,20 +2063,19 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - __btrfs_close_devices(cur_devices); + close_fs_devices(cur_devices); free_fs_devices(cur_devices); } out: mutex_unlock(&uuid_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, - &fs_info->fs_devices->alloc_list); + &fs_devices->alloc_list); device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } @@ -2112,7 +2144,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, tmp_fs_devices = tmp_fs_devices->seed; } fs_devices->seed = NULL; - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); } } @@ -2120,23 +2152,23 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - mutex_lock(&uuid_mutex); + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + WARN_ON(!tgtdev); - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_devices, tgtdev); if (tgtdev->bdev) - fs_info->fs_devices->open_devices--; + fs_devices->open_devices--; - fs_info->fs_devices->num_devices--; + fs_devices->num_devices--; btrfs_assign_next_active_device(fs_info, tgtdev, NULL); list_del_rcu(&tgtdev->dev_list); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* * The update_dev_time() with in btrfs_scratch_superblocks() @@ -2188,10 +2220,6 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, struct btrfs_device *tmp; devices = &fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held by the caller. - */ list_for_each_entry(tmp, devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tmp->dev_state) && !tmp->bdev) { @@ -2259,7 +2287,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) return PTR_ERR(old_devices); } - list_add(&old_devices->list, &fs_uuids); + list_add(&old_devices->fs_list, &fs_uuids); memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); seed_devices->opened = 1; @@ -2570,7 +2598,7 @@ error_trans: if (trans) btrfs_end_transaction(trans); error_free_device: - free_device(device); + btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2580,99 +2608,6 @@ error: return ret; } -int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device *srcdev, - struct btrfs_device **device_out) -{ - struct btrfs_device *device; - struct block_device *bdev; - struct list_head *devices; - struct rcu_string *name; - u64 devid = BTRFS_DEV_REPLACE_DEVID; - int ret = 0; - - *device_out = NULL; - if (fs_info->fs_devices->seeding) { - btrfs_err(fs_info, "the filesystem is a seed filesystem!"); - return -EINVAL; - } - - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); - if (IS_ERR(bdev)) { - btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); - } - - filemap_write_and_wait(bdev->bd_inode->i_mapping); - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - if (device->bdev == bdev) { - btrfs_err(fs_info, - "target device is in the filesystem!"); - ret = -EEXIST; - goto error; - } - } - - - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { - btrfs_err(fs_info, - "target device is smaller than source device!"); - ret = -EINVAL; - goto error; - } - - - device = btrfs_alloc_device(NULL, &devid, NULL); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; - } - - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - - mutex_lock(&fs_info->fs_devices->device_list_mutex); - set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - device->generation = 0; - device->io_width = fs_info->sectorsize; - device->io_align = fs_info->sectorsize; - device->sector_size = fs_info->sectorsize; - device->total_bytes = btrfs_device_get_total_bytes(srcdev); - device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); - device->bytes_used = btrfs_device_get_bytes_used(srcdev); - device->commit_total_bytes = srcdev->commit_total_bytes; - device->commit_bytes_used = device->bytes_used; - device->fs_info = fs_info; - device->bdev = bdev; - set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; - device->dev_stats_valid = 1; - set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - *device_out = device; - return ret; - -error: - blkdev_put(bdev, FMODE_EXCL); - return ret; -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { @@ -3273,24 +3208,12 @@ static void update_balance_args(struct btrfs_balance_control *bctl) } /* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for unset_balance_control. + * Clear the balance status in fs_info and delete the balance item from disk. */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) +static void reset_balance_state(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; + int ret; BUG_ON(!fs_info->balance_ctl); @@ -3299,6 +3222,9 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); kfree(bctl); + ret = del_balance_item(fs_info); + if (ret) + btrfs_handle_fs_error(fs_info, ret, NULL); } /* @@ -3835,18 +3761,6 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_cancel_req) == 0); } -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info); - if (ret) - btrfs_handle_fs_error(fs_info, ret, NULL); - - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); -} - /* Non-zero return value signifies invalidity */ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, u64 allowed) @@ -3857,12 +3771,12 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* - * Should be called with both balance and volume mutexes held + * Should be called with balance mutexe held */ -int btrfs_balance(struct btrfs_balance_control *bctl, +int btrfs_balance(struct btrfs_fs_info *fs_info, + struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { - struct btrfs_fs_info *fs_info = bctl->fs_info; u64 meta_target, data_target; u64 allowed; int mixed = 0; @@ -3891,7 +3805,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->flags & BTRFS_BALANCE_METADATA) || memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { btrfs_err(fs_info, - "with mixed groups data and metadata balance options must be the same"); + "balance: mixed groups data and metadata options must be the same"); ret = -EINVAL; goto out; } @@ -3913,23 +3827,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); if (validate_convert_profile(&bctl->data, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->data.target); + btrfs_err(fs_info, - "unable to start balance with target data profile %llu", - bctl->data.target); + "balance: invalid convert data profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->meta, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); + btrfs_err(fs_info, - "unable to start balance with target metadata profile %llu", - bctl->meta.target); + "balance: invalid convert metadata profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { + int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); + btrfs_err(fs_info, - "unable to start balance with target system profile %llu", - bctl->sys.target); + "balance: invalid convert system profile %s", + get_raid_name(index)); ret = -EINVAL; goto out; } @@ -3950,10 +3870,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, !(bctl->meta.target & allowed))) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "force reducing metadata integrity"); + "balance: force reducing metadata integrity"); } else { btrfs_err(fs_info, - "balance will reduce metadata integrity, use force if you want this"); + "balance: reduces metadata integrity, use --force if you want this"); ret = -EINVAL; goto out; } @@ -3967,9 +3887,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, bctl->data.target : fs_info->avail_data_alloc_bits; if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { + int meta_index = btrfs_bg_flags_to_raid_index(meta_target); + int data_index = btrfs_bg_flags_to_raid_index(data_target); + btrfs_warn(fs_info, - "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - meta_target, data_target); + "balance: metadata profile %s has lower redundancy than data profile %s", + get_raid_name(meta_index), get_raid_name(data_index)); } ret = insert_balance_item(fs_info, bctl); @@ -3978,7 +3901,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { BUG_ON(ret == -EEXIST); - set_balance_control(bctl); + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); } else { BUG_ON(ret != -EEXIST); spin_lock(&fs_info->balance_lock); @@ -3986,22 +3912,24 @@ int btrfs_balance(struct btrfs_balance_control *bctl, spin_unlock(&fs_info->balance_lock); } - atomic_inc(&fs_info->balance_running); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); + btrfs_update_ioctl_balance_args(fs_info, bargs); } if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { - __cancel_balance(fs_info); + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } wake_up(&fs_info->balance_wait_q); @@ -4009,11 +3937,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, return ret; out: if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else { + reset_balance_state(fs_info); + else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); - } + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + return ret; } @@ -4022,16 +3950,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "continuing balance"); - ret = btrfs_balance(fs_info->balance_ctl, NULL); + btrfs_info(fs_info, "balance: resuming"); + ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); } - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; } @@ -4040,18 +3964,27 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) { struct task_struct *tsk; - spin_lock(&fs_info->balance_lock); + mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); return 0; } - spin_unlock(&fs_info->balance_lock); + mutex_unlock(&fs_info->balance_mutex); if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { - btrfs_info(fs_info, "force skipping balance"); + btrfs_info(fs_info, "balance: resume skipped"); return 0; } + /* + * A ro->rw remount sequence should continue with the paused balance + * regardless of who pauses it, system or the user as of now, so set + * the resume flag. + */ + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } @@ -4091,7 +4024,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - bctl->fs_info = fs_info; bctl->flags = btrfs_balance_flags(leaf, item); bctl->flags |= BTRFS_BALANCE_RESUME; @@ -4102,15 +4034,26 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); + /* + * This should never happen, as the paused balance state is recovered + * during mount without any chance of other exclusive ops to collide. + * + * This gives the exclusive op status to balance and keeps in paused + * state until user intervention (cancel or umount). If the ownership + * cannot be assigned, show a message but do not fail. The balance + * is in a paused state and must have fs_info::balance_ctl properly + * set up. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + btrfs_warn(fs_info, + "balance: cannot set exclusive op status, resume manually"); - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - + BUG_ON(fs_info->balance_ctl); + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); out: btrfs_free_path(path); return ret; @@ -4126,16 +4069,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return -ENOTCONN; } - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); + BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; @@ -4147,38 +4090,49 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) { - if (sb_rdonly(fs_info->sb)) - return -EROFS; - mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { mutex_unlock(&fs_info->balance_mutex); return -ENOTCONN; } + /* + * A paused balance with the item stored on disk can be resumed at + * mount time if the mount is read-write. Otherwise it's still paused + * and we must not allow cancelling as it deletes the item. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_unlock(&fs_info->balance_mutex); + return -EROFS; + } + atomic_inc(&fs_info->balance_cancel_req); /* * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { - /* __cancel_balance needs volume_mutex */ mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); + /* + * Lock released to allow other waiters to continue, we'll + * reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); + if (fs_info->balance_ctl) { + reset_balance_state(fs_info); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_info(fs_info, "balance: canceled"); + } } - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + BUG_ON(fs_info->balance_ctl || + test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; @@ -4255,8 +4209,7 @@ static int btrfs_uuid_scan_kthread(void *data) } update_tree: if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, - root_item.uuid, + ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, key.objectid); if (ret < 0) { @@ -4267,7 +4220,7 @@ update_tree: } if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info, + ret = btrfs_uuid_tree_add(trans, root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, key.objectid); @@ -4473,7 +4426,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = READA_FORWARD; + path->reada = READA_BACK; mutex_lock(&fs_info->chunk_mutex); @@ -6034,9 +5987,8 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len) +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -6068,8 +6020,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { - if (devid && map->stripes[i].dev->devid != devid) - continue; if (map->stripes[i].physical > physical || map->stripes[i].physical + length <= physical) continue; @@ -6401,7 +6351,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be - * destroyed with free_device. + * destroyed with btrfs_free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6424,7 +6374,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - free_device(dev); + btrfs_free_device(dev); return ERR_PTR(ret); } } @@ -6675,8 +6625,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = __btrfs_open_devices(fs_devices, FMODE_READ, - fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); fs_devices = ERR_PTR(ret); @@ -6684,7 +6633,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, } if (!fs_devices->seeding) { - __btrfs_close_devices(fs_devices); + close_fs_devices(fs_devices); free_fs_devices(fs_devices); fs_devices = ERR_PTR(-EINVAL); goto out; @@ -6993,6 +6942,10 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; + /* + * uuid_mutex is needed only if we are mounting a sprout FS + * otherwise we don't need it. + */ mutex_lock(&uuid_mutex); mutex_lock(&fs_info->chunk_mutex); |