diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 22:35:36 +0300 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 22:35:36 +0300 |
commit | 4ba24fef3eb3b142197135223b90ced2f319cd53 (patch) | |
tree | a20c125b27740ec7b4c761b11d801108e1b316b2 /drivers/md/dm.c | |
parent | 47c1ffb2b6b630894e9a16442611c056ab21c057 (diff) | |
parent | 98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff) | |
download | linux-4ba24fef3eb3b142197135223b90ced2f319cd53.tar.xz |
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- | drivers/md/dm.c | 431 |
1 files changed, 336 insertions, 95 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 32b958dbc499..b98cd9d84435 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/delay.h> +#include <linux/wait.h> #include <trace/events/block.h> @@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 #define DMF_DEFERRED_REMOVE 7 +#define DMF_SUSPENDED_INTERNALLY 8 /* * A dummy definition to make RCU happy. @@ -140,7 +142,10 @@ struct mapped_device { * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table *map; + struct dm_table __rcu *map; + + struct list_head table_devices; + struct mutex table_devices_lock; unsigned long flags; @@ -212,6 +217,12 @@ struct dm_md_mempools { struct bio_set *bs; }; +struct table_device { + struct list_head list; + atomic_t count; + struct dm_dev dm_dev; +}; + #define RESERVED_BIO_BASED_IOS 16 #define RESERVED_REQUEST_BASED_IOS 256 #define RESERVED_MAX_IOS 1024 @@ -516,14 +527,15 @@ retry: goto out; tgt = dm_table_get_target(map, 0); + if (!tgt->type->ioctl) + goto out; if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); + r = tgt->type->ioctl(tgt, cmd, arg); out: dm_put_live_table(md, srcu_idx); @@ -593,13 +605,10 @@ static void end_io_acct(struct dm_io *io) struct mapped_device *md = io->md; struct bio *bio = io->bio; unsigned long duration = jiffies - io->start_time; - int pending, cpu; + int pending; int rw = bio_data_dir(bio); - cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); - part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); - part_stat_unlock(); + generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, @@ -670,6 +679,120 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) } /* + * Open a table device so we can use it as a map destination. + */ +static int open_table_device(struct table_device *td, dev_t dev, + struct mapped_device *md) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + BUG_ON(td->dm_dev.bdev); + + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); + return r; + } + + td->dm_dev.bdev = bdev; + return 0; +} + +/* + * Close a table device that we've been using. + */ +static void close_table_device(struct table_device *td, struct mapped_device *md) +{ + if (!td->dm_dev.bdev) + return; + + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + td->dm_dev.bdev = NULL; +} + +static struct table_device *find_table_device(struct list_head *l, dev_t dev, + fmode_t mode) { + struct table_device *td; + + list_for_each_entry(td, l, list) + if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + return td; + + return NULL; +} + +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result) { + int r; + struct table_device *td; + + mutex_lock(&md->table_devices_lock); + td = find_table_device(&md->table_devices, dev, mode); + if (!td) { + td = kmalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + mutex_unlock(&md->table_devices_lock); + return -ENOMEM; + } + + td->dm_dev.mode = mode; + td->dm_dev.bdev = NULL; + + if ((r = open_table_device(td, dev, md))) { + mutex_unlock(&md->table_devices_lock); + kfree(td); + return r; + } + + format_dev_t(td->dm_dev.name, dev); + + atomic_set(&td->count, 0); + list_add(&td->list, &md->table_devices); + } + atomic_inc(&td->count); + mutex_unlock(&md->table_devices_lock); + + *result = &td->dm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(dm_get_table_device); + +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) +{ + struct table_device *td = container_of(d, struct table_device, dm_dev); + + mutex_lock(&md->table_devices_lock); + if (atomic_dec_and_test(&td->count)) { + close_table_device(td, md); + list_del(&td->list); + kfree(td); + } + mutex_unlock(&md->table_devices_lock); +} +EXPORT_SYMBOL(dm_put_table_device); + +static void free_table_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct table_device *td = list_entry(tmp, struct table_device, list); + + DMWARN("dm_destroy: %s still exists with %d references", + td->dm_dev.name, atomic_read(&td->count)); + kfree(td); + } +} + +/* * Get the geometry associated with a dm device */ int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) @@ -776,7 +899,7 @@ static void disable_write_same(struct mapped_device *md) static void clone_endio(struct bio *bio, int error) { - int r = 0; + int r = error; struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; @@ -1249,13 +1372,13 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio, } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, int nr_iovecs, + struct dm_target *ti, unsigned target_bio_nr) { struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; @@ -1269,17 +1392,12 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len) { - struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); + struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); struct bio *clone = &tio->clone; tio->len_ptr = len; - /* - * Discard requests require the bio's inline iovecs be initialized. - * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush - * and discard, so no need for concern about wasted bvec allocations. - */ - __bio_clone_fast(clone, ci->bio); + __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, *len); @@ -1322,7 +1440,7 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti num_target_bios = ti->num_write_bios(ti, bio); for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, 0, target_bio_nr); + tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; clone_bio(tio, bio, sector, *len); __map_bio(tio); @@ -1489,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q, * Find maximum amount of I/O that won't need splitting */ max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); + (sector_t) queue_max_sectors(q)); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) + if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ max_size = 0; /* @@ -1503,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q, max_size = ti->type->merge(ti, bvm, biovec, max_size); /* * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. So always set max_size to 0, and the code below allows - * just one page. + * provided their merge_bvec method (we know this by looking for the + * max_hw_sectors that dm_set_device_limits may set), then we can't + * allow bios with multiple vector entries. So always set max_size + * to 0, and the code below allows just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) max_size = 0; @@ -1530,16 +1648,12 @@ static void _dm_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; - int cpu; int srcu_idx; struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); - cpu = part_stat_lock(); - part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); - part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); + generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { @@ -1949,12 +2063,14 @@ static struct mapped_device *alloc_dev(int minor) md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); + mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); + INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); md->queue = blk_alloc_queue(GFP_KERNEL); @@ -2040,6 +2156,7 @@ static void free_dev(struct mapped_device *md) blk_integrity_unregister(md->disk); del_gendisk(md->disk); cleanup_srcu_struct(&md->io_barrier); + free_table_devices(&md->table_devices); free_minor(minor); spin_lock(&_minor_lock); @@ -2211,7 +2328,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - old_map = md->map; + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); @@ -2220,7 +2337,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - dm_sync_table(md); + if (old_map) + dm_sync_table(md); return old_map; } @@ -2230,7 +2348,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = md->map; + struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; @@ -2595,36 +2713,18 @@ static void unlock_fs(struct mapped_device *md) } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. - * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = md->map; + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2633,7 +2733,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2644,8 +2747,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) - goto out_unlock; + if (r) { + dm_table_presuspend_undo_targets(map); + return r; + } } /* @@ -2661,7 +2766,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * flush_workqueue(md->wq). */ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2677,11 +2783,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2691,14 +2798,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); - goto out_unlock; /* pushback list is already flushed, so skip flush */ + dm_table_presuspend_undo_targets(map); + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + if (r) + goto out_unlock; set_bit(DMF_SUSPENDED, &md->flags); @@ -2709,22 +2858,13 @@ out_unlock: return r; } -int dm_resume(struct mapped_device *md) +static int __dm_resume(struct mapped_device *md, struct dm_table *map) { - int r = -EINVAL; - struct dm_table *map = NULL; - - mutex_lock(&md->suspend_lock); - if (!dm_suspended_md(md)) - goto out; - - map = md->map; - if (!map || !dm_table_get_size(map)) - goto out; - - r = dm_table_resume_targets(map); - if (r) - goto out; + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } dm_queue_flush(md); @@ -2738,6 +2878,37 @@ int dm_resume(struct mapped_device *md) unlock_fs(md); + return 0; +} + +int dm_resume(struct mapped_device *md) +{ + int r = -EINVAL; + struct dm_table *map = NULL; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (!dm_suspended_md(md)) + goto out; + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map || !dm_table_get_size(map)) + goto out; + + r = __dm_resume(md, map); + if (r) + goto out; + clear_bit(DMF_SUSPENDED, &md->flags); r = 0; @@ -2751,15 +2922,80 @@ out: * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. - * - * Internal suspend holds md->suspend_lock, which prevents interaction with - * userspace-driven suspend. */ -void dm_internal_suspend(struct mapped_device *md) +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) { - mutex_lock(&md->suspend_lock); + struct dm_table *map = NULL; + + if (dm_suspended_internally_md(md)) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. + */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); + + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + + dm_table_postsuspend_targets(map); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + if (!dm_suspended_internally_md(md)) + return; /* resume from nested internal suspend */ + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. + */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); @@ -2768,9 +3004,9 @@ void dm_internal_suspend(struct mapped_device *md) dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } -void dm_internal_resume(struct mapped_device *md) +void dm_internal_resume_fast(struct mapped_device *md) { - if (dm_suspended_md(md)) + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); @@ -2856,6 +3092,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); @@ -2900,7 +3141,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u if (!pools->io_pool) goto out; - pools->bs = bioset_create(pool_size, front_pad); + pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) goto out; |