summaryrefslogtreecommitdiff
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-09 08:10:03 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-09 08:10:03 +0300
commit140dfc9299c33bbfc9350fa061f5ab65cb83df13 (patch)
tree09508691964e277f4835d30f7b9c3962e8cac596 /drivers/md/dm.c
parentf94784bdb114439eb3a5e62343826887bbf3f37c (diff)
parent1a71d6ffe18c0d0f03fc8531949cc8ed41d702ee (diff)
downloadlinux-140dfc9299c33bbfc9350fa061f5ab65cb83df13.tar.xz
Merge tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Significant DM thin-provisioning performance improvements to meet performance requirements that were requested by the Gluster distributed filesystem. Specifically, dm-thinp now takes care to aggregate IO that will be issued to the same thinp block before issuing IO to the underlying devices. This really helps improve performance on HW RAID6 devices that have a writeback cache because it avoids RMW in the HW RAID controller. - Some stable fixes: fix leak in DM bufio if integrity profiles were enabled, use memzero_explicit in DM crypt to avoid any potential for information leak, and a DM cache fix to properly mark a cache block dirty if it was promoted to the cache via the overwrite optimization. - A few simple DM persistent data library fixes - DM cache multiqueue policy block promotion improvements. - DM cache discard improvements that take advantage of range (multiblock) discard support in the DM bio-prison. This allows for much more efficient bulk discard processing (e.g. when mkfs.xfs discards the entire device). - Some small optimizations in DM core and RCU deference cleanups - DM core changes to suspend/resume code to introduce the new internal suspend/resume interface that the DM thin-pool target now uses to suspend/resume active thin devices when the thin-pool must suspend/resume. This avoids forcing userspace to track all active thin volumes in a thin-pool when the thin-pool is suspended for the purposes of metadata or data space resize. * tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (49 commits) dm crypt: use memzero_explicit for on-stack buffer dm space map metadata: fix sm_bootstrap_get_count() dm space map metadata: fix sm_bootstrap_get_nr_blocks() dm bufio: fix memleak when using a dm_buffer's inline bio dm cache: fix spurious cell_defer when dealing with partial block at end of device dm cache: dirty flag was mistakenly being cleared when promoting via overwrite dm cache: only use overwrite optimisation for promotion when in writeback mode dm cache: discard block size must be a multiple of cache block size dm cache: fix a harmless race when working out if a block is discarded dm cache: when reloading a discard bitset allow for a different discard block size dm cache: fix some issues with the new discard range support dm array: if resizing the array is a noop set the new root to the old one dm: use rcu_dereference_protected instead of rcu_dereference dm thin: fix pool_io_hints to avoid looking at max_hw_sectors dm thin: suspend/resume active thin devices when reloading thin-pool dm: enhance internal suspend and resume interface dm thin: do not allow thin device activation while pool is suspended dm: add presuspend_undo hook to target_type dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl dm thin: remove stale 'trim' message in block comment above pool_message ...
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--drivers/md/dm.c273
1 files changed, 200 insertions, 73 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 58f3927fd7cc..8f37ed215b19 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
+#include <linux/wait.h>
#include <trace/events/block.h>
@@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
/*
* A dummy definition to make RCU happy.
@@ -140,7 +142,7 @@ struct mapped_device {
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
- struct dm_table *map;
+ struct dm_table __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
@@ -525,14 +527,15 @@ retry:
goto out;
tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->ioctl)
+ goto out;
if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
- if (tgt->type->ioctl)
- r = tgt->type->ioctl(tgt, cmd, arg);
+ r = tgt->type->ioctl(tgt, cmd, arg);
out:
dm_put_live_table(md, srcu_idx);
@@ -1607,9 +1610,9 @@ static int dm_merge_bvec(struct request_queue *q,
* Find maximum amount of I/O that won't need splitting
*/
max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) BIO_MAX_SECTORS);
+ (sector_t) queue_max_sectors(q));
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (max_size < 0)
+ if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
max_size = 0;
/*
@@ -1621,10 +1624,10 @@ static int dm_merge_bvec(struct request_queue *q,
max_size = ti->type->merge(ti, bvm, biovec, max_size);
/*
* If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking at
- * queue_max_hw_sectors), then we can't allow bios with multiple vector
- * entries. So always set max_size to 0, and the code below allows
- * just one page.
+ * provided their merge_bvec method (we know this by looking for the
+ * max_hw_sectors that dm_set_device_limits may set), then we can't
+ * allow bios with multiple vector entries. So always set max_size
+ * to 0, and the code below allows just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
@@ -2332,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
merge_is_optional = dm_table_merge_is_optional(t);
- old_map = md->map;
+ old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
@@ -2341,7 +2344,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
else
clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
- dm_sync_table(md);
+ if (old_map)
+ dm_sync_table(md);
return old_map;
}
@@ -2351,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
*/
static struct dm_table *__unbind(struct mapped_device *md)
{
- struct dm_table *map = md->map;
+ struct dm_table *map = rcu_dereference_protected(md->map, 1);
if (!map)
return NULL;
@@ -2716,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md)
}
/*
- * We need to be able to change a mapping table under a mounted
- * filesystem. For example we might want to move some data in
- * the background. Before the table can be swapped with
- * dm_bind_table, dm_suspend must be called to flush any in
- * flight bios and ensure that any further io gets deferred.
- */
-/*
- * Suspend mechanism in request-based dm.
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
*
- * 1. Flush all I/Os by lock_fs() if needed.
- * 2. Stop dispatching any I/O by stopping the request_queue.
- * 3. Wait for all in-flight I/Os to be completed or requeued.
- *
- * To abort suspend, start the request_queue.
+ * Caller must hold md->suspend_lock
*/
-int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+ unsigned suspend_flags, int interruptible)
{
- struct dm_table *map = NULL;
- int r = 0;
- int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
- int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
-
- mutex_lock(&md->suspend_lock);
-
- if (dm_suspended_md(md)) {
- r = -EINVAL;
- goto out_unlock;
- }
-
- map = md->map;
+ bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+ bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+ int r;
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2754,7 +2740,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- /* This does not get reverted if there's an error later. */
+ /*
+ * This gets reverted if there's an error later and the targets
+ * provide the .presuspend_undo hook.
+ */
dm_table_presuspend_targets(map);
/*
@@ -2765,8 +2754,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
*/
if (!noflush && do_lockfs) {
r = lock_fs(md);
- if (r)
- goto out_unlock;
+ if (r) {
+ dm_table_presuspend_undo_targets(map);
+ return r;
+ }
}
/*
@@ -2782,7 +2773,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
* flush_workqueue(md->wq).
*/
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/*
* Stop md->queue before flushing md->wq in case request-based
@@ -2798,11 +2790,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
- r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
+ r = dm_wait_for_completion(md, interruptible);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
@@ -2812,14 +2805,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
start_queue(md->queue);
unlock_fs(md);
- goto out_unlock; /* pushback list is already flushed, so skip flush */
+ dm_table_presuspend_undo_targets(map);
+ /* pushback list is already flushed, so skip flush */
}
- /*
- * If dm_wait_for_completion returned 0, the device is completely
- * quiescent now. There is no request-processing activity. All new
- * requests are being added to md->deferred list.
- */
+ return r;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem. For example we might want to move some data in
+ * the background. Before the table can be swapped with
+ * dm_bind_table, dm_suspend must be called to flush any in
+ * flight bios and ensure that any further io gets deferred.
+ */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
+ *
+ * To abort suspend, start the request_queue.
+ */
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+ struct dm_table *map = NULL;
+ int r = 0;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (dm_suspended_md(md)) {
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+ if (r)
+ goto out_unlock;
set_bit(DMF_SUSPENDED, &md->flags);
@@ -2830,22 +2865,13 @@ out_unlock:
return r;
}
-int dm_resume(struct mapped_device *md)
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
- int r = -EINVAL;
- struct dm_table *map = NULL;
-
- mutex_lock(&md->suspend_lock);
- if (!dm_suspended_md(md))
- goto out;
-
- map = md->map;
- if (!map || !dm_table_get_size(map))
- goto out;
-
- r = dm_table_resume_targets(map);
- if (r)
- goto out;
+ if (map) {
+ int r = dm_table_resume_targets(map);
+ if (r)
+ return r;
+ }
dm_queue_flush(md);
@@ -2859,6 +2885,37 @@ int dm_resume(struct mapped_device *md)
unlock_fs(md);
+ return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+ int r = -EINVAL;
+ struct dm_table *map = NULL;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (!dm_suspended_md(md))
+ goto out;
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ r = __dm_resume(md, map);
+ if (r)
+ goto out;
+
clear_bit(DMF_SUSPENDED, &md->flags);
r = 0;
@@ -2872,15 +2929,80 @@ out:
* Internal suspend/resume works like userspace-driven suspend. It waits
* until all bios finish and prevents issuing new bios to the target drivers.
* It may be used only from the kernel.
- *
- * Internal suspend holds md->suspend_lock, which prevents interaction with
- * userspace-driven suspend.
*/
-void dm_internal_suspend(struct mapped_device *md)
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
- mutex_lock(&md->suspend_lock);
+ struct dm_table *map = NULL;
+
+ if (dm_suspended_internally_md(md))
+ return; /* nested internal suspend */
+
+ if (dm_suspended_md(md)) {
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ return; /* nest suspend */
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ /*
+ * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+ * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
+ * would require changing .presuspend to return an error -- avoid this
+ * until there is a need for more elaborate variants of internal suspend.
+ */
+ (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+ dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+ if (!dm_suspended_internally_md(md))
+ return; /* resume from nested internal suspend */
+
if (dm_suspended_md(md))
+ goto done; /* resume from nested suspend */
+
+ /*
+ * NOTE: existing callers don't need to call dm_table_resume_targets
+ * (which may fail -- so best to avoid it for now by passing NULL map)
+ */
+ (void) __dm_resume(md, NULL);
+
+done:
+ clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_resume(md);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
return;
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
@@ -2889,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md)
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
-void dm_internal_resume(struct mapped_device *md)
+void dm_internal_resume_fast(struct mapped_device *md)
{
- if (dm_suspended_md(md))
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
goto done;
dm_queue_flush(md);
@@ -2977,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+}
+
int dm_test_deferred_remove_flag(struct mapped_device *md)
{
return test_bit(DMF_DEFERRED_REMOVE, &md->flags);