diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-09 08:10:03 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-09 08:10:03 +0300 |
commit | 140dfc9299c33bbfc9350fa061f5ab65cb83df13 (patch) | |
tree | 09508691964e277f4835d30f7b9c3962e8cac596 /drivers/md/dm.c | |
parent | f94784bdb114439eb3a5e62343826887bbf3f37c (diff) | |
parent | 1a71d6ffe18c0d0f03fc8531949cc8ed41d702ee (diff) | |
download | linux-140dfc9299c33bbfc9350fa061f5ab65cb83df13.tar.xz |
Merge tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Significant DM thin-provisioning performance improvements to meet
performance requirements that were requested by the Gluster
distributed filesystem.
Specifically, dm-thinp now takes care to aggregate IO that will be
issued to the same thinp block before issuing IO to the underlying
devices. This really helps improve performance on HW RAID6 devices
that have a writeback cache because it avoids RMW in the HW RAID
controller.
- Some stable fixes: fix leak in DM bufio if integrity profiles were
enabled, use memzero_explicit in DM crypt to avoid any potential for
information leak, and a DM cache fix to properly mark a cache block
dirty if it was promoted to the cache via the overwrite optimization.
- A few simple DM persistent data library fixes.
- DM cache multiqueue policy block promotion improvements.
- DM cache discard improvements that take advantage of range
(multiblock) discard support in the DM bio-prison. This allows for
much more efficient bulk discard processing (e.g. when mkfs.xfs
discards the entire device).
- Some small optimizations in DM core and RCU dereference cleanups.
- DM core changes to suspend/resume code to introduce the new internal
suspend/resume interface that the DM thin-pool target now uses to
suspend/resume active thin devices when the thin-pool must
suspend/resume.
This avoids forcing userspace to track all active thin volumes in a
thin-pool when the thin-pool is suspended for the purposes of
metadata or data space resize.
* tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (49 commits)
dm crypt: use memzero_explicit for on-stack buffer
dm space map metadata: fix sm_bootstrap_get_count()
dm space map metadata: fix sm_bootstrap_get_nr_blocks()
dm bufio: fix memleak when using a dm_buffer's inline bio
dm cache: fix spurious cell_defer when dealing with partial block at end of device
dm cache: dirty flag was mistakenly being cleared when promoting via overwrite
dm cache: only use overwrite optimisation for promotion when in writeback mode
dm cache: discard block size must be a multiple of cache block size
dm cache: fix a harmless race when working out if a block is discarded
dm cache: when reloading a discard bitset allow for a different discard block size
dm cache: fix some issues with the new discard range support
dm array: if resizing the array is a noop set the new root to the old one
dm: use rcu_dereference_protected instead of rcu_dereference
dm thin: fix pool_io_hints to avoid looking at max_hw_sectors
dm thin: suspend/resume active thin devices when reloading thin-pool
dm: enhance internal suspend and resume interface
dm thin: do not allow thin device activation while pool is suspended
dm: add presuspend_undo hook to target_type
dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl
dm thin: remove stale 'trim' message in block comment above pool_message
...
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- | drivers/md/dm.c | 273 |
1 files changed, 200 insertions, 73 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 58f3927fd7cc..8f37ed215b19 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/delay.h> +#include <linux/wait.h> #include <trace/events/block.h> @@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 #define DMF_DEFERRED_REMOVE 7 +#define DMF_SUSPENDED_INTERNALLY 8 /* * A dummy definition to make RCU happy. @@ -140,7 +142,7 @@ struct mapped_device { * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table *map; + struct dm_table __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -525,14 +527,15 @@ retry: goto out; tgt = dm_table_get_target(map, 0); + if (!tgt->type->ioctl) + goto out; if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); + r = tgt->type->ioctl(tgt, cmd, arg); out: dm_put_live_table(md, srcu_idx); @@ -1607,9 +1610,9 @@ static int dm_merge_bvec(struct request_queue *q, * Find maximum amount of I/O that won't need splitting */ max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); + (sector_t) queue_max_sectors(q)); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) + if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ max_size = 0; /* @@ -1621,10 +1624,10 @@ static int dm_merge_bvec(struct request_queue *q, max_size = ti->type->merge(ti, bvm, biovec, max_size); /* * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. So always set max_size to 0, and the code below allows - * just one page. 
+ * provided their merge_bvec method (we know this by looking for the + * max_hw_sectors that dm_set_device_limits may set), then we can't + * allow bios with multiple vector entries. So always set max_size + * to 0, and the code below allows just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) max_size = 0; @@ -2332,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - old_map = md->map; + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); @@ -2341,7 +2344,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - dm_sync_table(md); + if (old_map) + dm_sync_table(md); return old_map; } @@ -2351,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = md->map; + struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; @@ -2716,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md) } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. 
Wait for all in-flight I/Os to be completed or requeued. - * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = md->map; + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2754,7 +2740,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2765,8 +2754,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) - goto out_unlock; + if (r) { + dm_table_presuspend_undo_targets(map); + return r; + } } /* @@ -2782,7 +2773,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * flush_workqueue(md->wq). */ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2798,11 +2790,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * We call dm_wait_for_completion to wait for all existing requests * to finish. 
*/ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2812,14 +2805,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); - goto out_unlock; /* pushback list is already flushed, so skip flush */ + dm_table_presuspend_undo_targets(map); + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. 
+ */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + if (r) + goto out_unlock; set_bit(DMF_SUSPENDED, &md->flags); @@ -2830,22 +2865,13 @@ out_unlock: return r; } -int dm_resume(struct mapped_device *md) +static int __dm_resume(struct mapped_device *md, struct dm_table *map) { - int r = -EINVAL; - struct dm_table *map = NULL; - - mutex_lock(&md->suspend_lock); - if (!dm_suspended_md(md)) - goto out; - - map = md->map; - if (!map || !dm_table_get_size(map)) - goto out; - - r = dm_table_resume_targets(map); - if (r) - goto out; + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } dm_queue_flush(md); @@ -2859,6 +2885,37 @@ int dm_resume(struct mapped_device *md) unlock_fs(md); + return 0; +} + +int dm_resume(struct mapped_device *md) +{ + int r = -EINVAL; + struct dm_table *map = NULL; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (!dm_suspended_md(md)) + goto out; + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map || !dm_table_get_size(map)) + goto out; + + r = __dm_resume(md, map); + if (r) + goto out; + 
clear_bit(DMF_SUSPENDED, &md->flags); r = 0; @@ -2872,15 +2929,80 @@ out: * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. - * - * Internal suspend holds md->suspend_lock, which prevents interaction with - * userspace-driven suspend. */ -void dm_internal_suspend(struct mapped_device *md) +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) { - mutex_lock(&md->suspend_lock); + struct dm_table *map = NULL; + + if (dm_suspended_internally_md(md)) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. 
+ */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); + + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + + dm_table_postsuspend_targets(map); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + if (!dm_suspended_internally_md(md)) + return; /* resume from nested internal suspend */ + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. 
+ */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); @@ -2889,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md) dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } -void dm_internal_resume(struct mapped_device *md) +void dm_internal_resume_fast(struct mapped_device *md) { - if (dm_suspended_md(md)) + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); @@ -2977,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); |