summary refs log tree commit diff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig 9
-rw-r--r-- drivers/md/bitmap.c 15
-rw-r--r-- drivers/md/dm-bufio.c 3
-rw-r--r-- drivers/md/dm-cache-target.c 5
-rw-r--r-- drivers/md/dm-crypt.c 392
-rw-r--r-- drivers/md/dm-io.c 6
-rw-r--r-- drivers/md/dm-ioctl.c 4
-rw-r--r-- drivers/md/dm-log-userspace-base.c 5
-rw-r--r-- drivers/md/dm-mpath.c 87
-rw-r--r-- drivers/md/dm-raid.c 24
-rw-r--r-- drivers/md/dm-raid1.c 9
-rw-r--r-- drivers/md/dm-snap-persistent.c 14
-rw-r--r-- drivers/md/dm-snap.c 4
-rw-r--r-- drivers/md/dm-table.c 72
-rw-r--r-- drivers/md/dm-target.c 15
-rw-r--r-- drivers/md/dm-thin-metadata.c 9
-rw-r--r-- drivers/md/dm-thin-metadata.h 2
-rw-r--r-- drivers/md/dm-thin.c 5
-rw-r--r-- drivers/md/dm.c 373
-rw-r--r-- drivers/md/dm.h 11
-rw-r--r-- drivers/md/faulty.c 8
-rw-r--r-- drivers/md/linear.c 67
-rw-r--r-- drivers/md/md.c 830
-rw-r--r-- drivers/md/md.h 57
-rw-r--r-- drivers/md/multipath.c 22
-rw-r--r-- drivers/md/persistent-data/Kconfig 2
-rw-r--r-- drivers/md/persistent-data/dm-space-map-disk.c 4
-rw-r--r-- drivers/md/raid0.c 29
-rw-r--r-- drivers/md/raid1.c 60
-rw-r--r-- drivers/md/raid1.h 3
-rw-r--r-- drivers/md/raid10.c 52
-rw-r--r-- drivers/md/raid10.h 3
-rw-r--r-- drivers/md/raid5.c 350
-rw-r--r-- drivers/md/raid5.h 1
34 files changed, 1549 insertions, 1003 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index c355a226a024..63e05e32b462 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -178,7 +178,7 @@ config MD_FAULTY
source "drivers/md/bcache/Kconfig"
config BLK_DEV_DM_BUILTIN
- boolean
+ bool
config BLK_DEV_DM
tristate "Device mapper support"
@@ -197,7 +197,7 @@ config BLK_DEV_DM
If unsure, say N.
config DM_DEBUG
- boolean "Device mapper debugging support"
+ bool "Device mapper debugging support"
depends on BLK_DEV_DM
---help---
Enable this for messages that may help debug device-mapper problems.
@@ -231,9 +231,8 @@ config DM_CRYPT
transparently encrypts the data on it. You'll need to activate
the ciphers you're going to use in the cryptoapi configuration.
- Information on how to use dm-crypt can be found on
-
- <http://www.saout.de/misc/dm-crypt/>
+ For further information on dm-crypt and userspace tools see:
+ <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
To compile this code as a module, choose M here: the module will
be called dm-crypt.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1695ee5f3ffc..3a5767968ba0 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1619,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev)
return;
mutex_lock(&mddev->bitmap_info.mutex);
+ spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
+ spin_unlock(&mddev->lock);
mutex_unlock(&mddev->bitmap_info.mutex);
if (mddev->thread)
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
@@ -2209,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
+ spin_lock(&mddev->lock);
if (mddev->bitmap)
len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
"false" : "true"));
else
len = sprintf(page, "\n");
+ spin_unlock(&mddev->lock);
return len;
}
@@ -2238,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
+ ssize_t ret;
+ spin_lock(&mddev->lock);
if (mddev->bitmap == NULL)
- return sprintf(page, "0\n");
- return sprintf(page, "%lu\n",
- mddev->bitmap->behind_writes_used);
+ ret = sprintf(page, "0\n");
+ else
+ ret = sprintf(page, "%lu\n",
+ mddev->bitmap->behind_writes_used);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index c33b49792b87..86dbbc737402 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
@@ -1739,7 +1740,7 @@ static unsigned get_max_age_hz(void)
static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
- return (jiffies - b->last_accessed) >= age_hz;
+ return time_after_eq(jiffies, b->last_accessed + age_hz);
}
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index e1650539cc2f..7755af351867 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -11,6 +11,7 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
@@ -1562,8 +1563,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
static int need_commit_due_to_time(struct cache *cache)
{
- return jiffies < cache->last_commit_jiffies ||
- jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, cache->last_commit_jiffies,
+ cache->last_commit_jiffies + COMMIT_PERIOD);
}
static int commit_if_needed(struct cache *cache)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 08981be7baa1..713a96237a80 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,9 +18,11 @@
#include <linux/slab.h>
#include <linux/crypto.h>
#include <linux/workqueue.h>
+#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
+#include <linux/rbtree.h>
#include <asm/page.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
@@ -58,7 +60,8 @@ struct dm_crypt_io {
atomic_t io_pending;
int error;
sector_t sector;
- struct dm_crypt_io *base_io;
+
+ struct rb_node rb_node;
} CRYPTO_MINALIGN_ATTR;
struct dm_crypt_request {
@@ -108,7 +111,8 @@ struct iv_tcw_private {
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
-enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
+ DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
/*
* The fields in here must be read only after initialization.
@@ -121,14 +125,18 @@ struct crypt_config {
* pool for per bio private data, crypto requests and
* encryption requeusts/buffer pages
*/
- mempool_t *io_pool;
mempool_t *req_pool;
mempool_t *page_pool;
struct bio_set *bs;
+ struct mutex bio_alloc_lock;
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
+ struct task_struct *write_thread;
+ wait_queue_head_t write_thread_wait;
+ struct rb_root write_tree;
+
char *cipher;
char *cipher_string;
@@ -172,9 +180,6 @@ struct crypt_config {
};
#define MIN_IOS 16
-#define MIN_POOL_PAGES 32
-
-static struct kmem_cache *_crypt_io_pool;
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
@@ -946,57 +951,70 @@ static int crypt_convert(struct crypt_config *cc,
return 0;
}
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256 pages, allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenario we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first, but on failure we fall back
+ * to blocking allocations with a mutex.
*/
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
- unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
{
struct crypt_config *cc = io->cc;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
- unsigned i, len;
+ gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+ unsigned i, len, remaining_size;
struct page *page;
+ struct bio_vec *bvec;
+
+retry:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_lock(&cc->bio_alloc_lock);
clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
- return NULL;
+ goto return_clone;
clone_init(io, clone);
- *out_of_pages = 0;
+
+ remaining_size = size;
for (i = 0; i < nr_iovecs; i++) {
page = mempool_alloc(cc->page_pool, gfp_mask);
if (!page) {
- *out_of_pages = 1;
- break;
+ crypt_free_buffer_pages(cc, clone);
+ bio_put(clone);
+ gfp_mask |= __GFP_WAIT;
+ goto retry;
}
- /*
- * If additional pages cannot be allocated without waiting,
- * return a partially-allocated bio. The caller will then try
- * to allocate more bios while submitting this partial bio.
- */
- gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+ len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
- len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+ bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+ bvec->bv_page = page;
+ bvec->bv_len = len;
+ bvec->bv_offset = 0;
- if (!bio_add_page(clone, page, len, 0)) {
- mempool_free(page, cc->page_pool);
- break;
- }
+ clone->bi_iter.bi_size += len;
- size -= len;
+ remaining_size -= len;
}
- if (!clone->bi_iter.bi_size) {
- bio_put(clone);
- return NULL;
- }
+return_clone:
+ if (unlikely(gfp_mask & __GFP_WAIT))
+ mutex_unlock(&cc->bio_alloc_lock);
return clone;
}
@@ -1020,7 +1038,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
io->base_bio = bio;
io->sector = sector;
io->error = 0;
- io->base_io = NULL;
io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
}
@@ -1033,13 +1050,11 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
/*
* One of the bios was finished. Check for completion of
* the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
*/
static void crypt_dec_pending(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
struct bio *base_bio = io->base_bio;
- struct dm_crypt_io *base_io = io->base_io;
int error = io->error;
if (!atomic_dec_and_test(&io->io_pending))
@@ -1047,16 +1062,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (io->ctx.req)
crypt_free_req(cc, io->ctx.req, base_bio);
- if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
- mempool_free(io, cc->io_pool);
-
- if (likely(!base_io))
- bio_endio(base_bio, error);
- else {
- if (error && !base_io->error)
- base_io->error = error;
- crypt_dec_pending(base_io);
- }
+
+ bio_endio(base_bio, error);
}
/*
@@ -1138,37 +1145,97 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
return 0;
}
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+ struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+ crypt_inc_pending(io);
+ if (kcryptd_io_read(io, GFP_NOIO))
+ io->error = -ENOMEM;
+ crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+ struct crypt_config *cc = io->cc;
+
+ INIT_WORK(&io->work, kcryptd_io_read_work);
+ queue_work(cc->io_queue, &io->work);
+}
+
static void kcryptd_io_write(struct dm_crypt_io *io)
{
struct bio *clone = io->ctx.bio_out;
+
generic_make_request(clone);
}
-static void kcryptd_io(struct work_struct *work)
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
+static int dmcrypt_write(void *data)
{
- struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+ struct crypt_config *cc = data;
+ struct dm_crypt_io *io;
- if (bio_data_dir(io->base_bio) == READ) {
- crypt_inc_pending(io);
- if (kcryptd_io_read(io, GFP_NOIO))
- io->error = -ENOMEM;
- crypt_dec_pending(io);
- } else
- kcryptd_io_write(io);
-}
+ while (1) {
+ struct rb_root write_tree;
+ struct blk_plug plug;
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
- struct crypt_config *cc = io->cc;
+ DECLARE_WAITQUEUE(wait, current);
- INIT_WORK(&io->work, kcryptd_io);
- queue_work(cc->io_queue, &io->work);
+ spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
+
+ if (!RB_EMPTY_ROOT(&cc->write_tree))
+ goto pop_from_list;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ __add_wait_queue(&cc->write_thread_wait, &wait);
+
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ if (unlikely(kthread_should_stop())) {
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&cc->write_thread_wait, &wait);
+ break;
+ }
+
+ schedule();
+
+ set_task_state(current, TASK_RUNNING);
+ spin_lock_irq(&cc->write_thread_wait.lock);
+ __remove_wait_queue(&cc->write_thread_wait, &wait);
+ goto continue_locked;
+
+pop_from_list:
+ write_tree = cc->write_tree;
+ cc->write_tree = RB_ROOT;
+ spin_unlock_irq(&cc->write_thread_wait.lock);
+
+ BUG_ON(rb_parent(write_tree.rb_node));
+
+ /*
+ * Note: we cannot walk the tree here with rb_next because
+ * the structures may be freed when kcryptd_io_write is called.
+ */
+ blk_start_plug(&plug);
+ do {
+ io = crypt_io_from_node(rb_first(&write_tree));
+ rb_erase(&io->rb_node, &write_tree);
+ kcryptd_io_write(io);
+ } while (!RB_EMPTY_ROOT(&write_tree));
+ blk_finish_plug(&plug);
+ }
+ return 0;
}
static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
{
struct bio *clone = io->ctx.bio_out;
struct crypt_config *cc = io->cc;
+ unsigned long flags;
+ sector_t sector;
+ struct rb_node **rbp, *parent;
if (unlikely(io->error < 0)) {
crypt_free_buffer_pages(cc, clone);
@@ -1182,20 +1249,34 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
clone->bi_iter.bi_sector = cc->start + io->sector;
- if (async)
- kcryptd_queue_io(io);
- else
+ if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
generic_make_request(clone);
+ return;
+ }
+
+ spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+ rbp = &cc->write_tree.rb_node;
+ parent = NULL;
+ sector = io->sector;
+ while (*rbp) {
+ parent = *rbp;
+ if (sector < crypt_io_from_node(parent)->sector)
+ rbp = &(*rbp)->rb_left;
+ else
+ rbp = &(*rbp)->rb_right;
+ }
+ rb_link_node(&io->rb_node, parent, rbp);
+ rb_insert_color(&io->rb_node, &cc->write_tree);
+
+ wake_up_locked(&cc->write_thread_wait);
+ spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
}
static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
struct bio *clone;
- struct dm_crypt_io *new_io;
int crypt_finished;
- unsigned out_of_pages = 0;
- unsigned remaining = io->base_bio->bi_iter.bi_size;
sector_t sector = io->sector;
int r;
@@ -1205,80 +1286,30 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
crypt_inc_pending(io);
crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
- /*
- * The allocated buffers can be smaller than the whole bio,
- * so repeat the whole process until all the data can be handled.
- */
- while (remaining) {
- clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
- if (unlikely(!clone)) {
- io->error = -ENOMEM;
- break;
- }
-
- io->ctx.bio_out = clone;
- io->ctx.iter_out = clone->bi_iter;
-
- remaining -= clone->bi_iter.bi_size;
- sector += bio_sectors(clone);
-
- crypt_inc_pending(io);
-
- r = crypt_convert(cc, &io->ctx);
- if (r < 0)
- io->error = -EIO;
-
- crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
- /* Encryption was already finished, submit io now */
- if (crypt_finished) {
- kcryptd_crypt_write_io_submit(io, 0);
-
- /*
- * If there was an error, do not try next fragments.
- * For async, error is processed in async handler.
- */
- if (unlikely(r < 0))
- break;
+ clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+ if (unlikely(!clone)) {
+ io->error = -EIO;
+ goto dec;
+ }
- io->sector = sector;
- }
+ io->ctx.bio_out = clone;
+ io->ctx.iter_out = clone->bi_iter;
- /*
- * Out of memory -> run queues
- * But don't wait if split was due to the io size restriction
- */
- if (unlikely(out_of_pages))
- congestion_wait(BLK_RW_ASYNC, HZ/100);
+ sector += bio_sectors(clone);
- /*
- * With async crypto it is unsafe to share the crypto context
- * between fragments, so switch to a new dm_crypt_io structure.
- */
- if (unlikely(!crypt_finished && remaining)) {
- new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
- crypt_io_init(new_io, io->cc, io->base_bio, sector);
- crypt_inc_pending(new_io);
- crypt_convert_init(cc, &new_io->ctx, NULL,
- io->base_bio, sector);
- new_io->ctx.iter_in = io->ctx.iter_in;
-
- /*
- * Fragments after the first use the base_io
- * pending count.
- */
- if (!io->base_io)
- new_io->base_io = io;
- else {
- new_io->base_io = io->base_io;
- crypt_inc_pending(io->base_io);
- crypt_dec_pending(io);
- }
+ crypt_inc_pending(io);
+ r = crypt_convert(cc, &io->ctx);
+ if (r)
+ io->error = -EIO;
+ crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
- io = new_io;
- }
+ /* Encryption was already finished, submit io now */
+ if (crypt_finished) {
+ kcryptd_crypt_write_io_submit(io, 0);
+ io->sector = sector;
}
+dec:
crypt_dec_pending(io);
}
@@ -1481,6 +1512,9 @@ static void crypt_dtr(struct dm_target *ti)
if (!cc)
return;
+ if (cc->write_thread)
+ kthread_stop(cc->write_thread);
+
if (cc->io_queue)
destroy_workqueue(cc->io_queue);
if (cc->crypt_queue)
@@ -1495,8 +1529,6 @@ static void crypt_dtr(struct dm_target *ti)
mempool_destroy(cc->page_pool);
if (cc->req_pool)
mempool_destroy(cc->req_pool);
- if (cc->io_pool)
- mempool_destroy(cc->io_pool);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc);
@@ -1688,7 +1720,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
char dummy;
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of feature args"},
+ {0, 3, "Invalid number of feature args"},
};
if (argc < 5) {
@@ -1710,13 +1742,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret < 0)
goto bad;
- ret = -ENOMEM;
- cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
- if (!cc->io_pool) {
- ti->error = "Cannot allocate crypt io mempool";
- goto bad;
- }
-
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
@@ -1734,6 +1759,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
}
+ ret = -ENOMEM;
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
@@ -1746,7 +1772,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
ARCH_KMALLOC_MINALIGN);
- cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+ cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
goto bad;
@@ -1758,6 +1784,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ mutex_init(&cc->bio_alloc_lock);
+
ret = -EINVAL;
if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid iv_offset sector";
@@ -1788,15 +1816,26 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
- opt_string = dm_shift_arg(&as);
+ while (opt_params--) {
+ opt_string = dm_shift_arg(&as);
+ if (!opt_string) {
+ ti->error = "Not enough feature arguments";
+ goto bad;
+ }
- if (opt_params == 1 && opt_string &&
- !strcasecmp(opt_string, "allow_discards"))
- ti->num_discard_bios = 1;
- else if (opt_params) {
- ret = -EINVAL;
- ti->error = "Invalid feature arguments";
- goto bad;
+ if (!strcasecmp(opt_string, "allow_discards"))
+ ti->num_discard_bios = 1;
+
+ else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+ set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+ else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+ set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+
+ else {
+ ti->error = "Invalid feature arguments";
+ goto bad;
+ }
}
}
@@ -1807,13 +1846,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- cc->crypt_queue = alloc_workqueue("kcryptd",
- WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ else
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
}
+ init_waitqueue_head(&cc->write_thread_wait);
+ cc->write_tree = RB_ROOT;
+
+ cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+ if (IS_ERR(cc->write_thread)) {
+ ret = PTR_ERR(cc->write_thread);
+ cc->write_thread = NULL;
+ ti->error = "Couldn't spawn write thread";
+ goto bad;
+ }
+ wake_up_process(cc->write_thread);
+
ti->num_flush_bios = 1;
ti->discard_zeroes_data_unsupported = true;
@@ -1848,7 +1902,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(io->base_bio) == READ) {
if (kcryptd_io_read(io, GFP_NOWAIT))
- kcryptd_queue_io(io);
+ kcryptd_queue_read(io);
} else
kcryptd_queue_crypt(io);
@@ -1860,6 +1914,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
{
struct crypt_config *cc = ti->private;
unsigned i, sz = 0;
+ int num_feature_args = 0;
switch (type) {
case STATUSTYPE_INFO:
@@ -1878,8 +1933,18 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
cc->dev->name, (unsigned long long)cc->start);
- if (ti->num_discard_bios)
- DMEMIT(" 1 allow_discards");
+ num_feature_args += !!ti->num_discard_bios;
+ num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+ num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+ if (num_feature_args) {
+ DMEMIT(" %d", num_feature_args);
+ if (ti->num_discard_bios)
+ DMEMIT(" allow_discards");
+ if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+ DMEMIT(" same_cpu_crypt");
+ if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
+ DMEMIT(" submit_from_crypt_cpus");
+ }
break;
}
@@ -1976,7 +2041,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 13, 0},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
@@ -1994,15 +2059,9 @@ static int __init dm_crypt_init(void)
{
int r;
- _crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
- if (!_crypt_io_pool)
- return -ENOMEM;
-
r = dm_register_target(&crypt_target);
- if (r < 0) {
+ if (r < 0)
DMERR("register failed %d", r);
- kmem_cache_destroy(_crypt_io_pool);
- }
return r;
}
@@ -2010,7 +2069,6 @@ static int __init dm_crypt_init(void)
static void __exit dm_crypt_exit(void)
{
dm_unregister_target(&crypt_target);
- kmem_cache_destroy(_crypt_io_pool);
}
module_init(dm_crypt_init);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index c09359db3a90..37de0173b6d2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -290,6 +290,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
+ /* Reject unsupported discard requests */
+ if ((rw & REQ_DISCARD) && !blk_queue_discard(q)) {
+ dec_count(io, region, -EOPNOTSUPP);
+ return;
+ }
+
/*
* where->count may be zero if rw holds a flush and we need to
* send a zero-sized flush.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 73f791bb9ea4..c8a18e4ee9dc 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -639,8 +639,8 @@ static int check_name(const char *name)
/*
* On successful return, the caller must not attempt to acquire
- * _hash_lock without first calling dm_table_put, because dm_table_destroy
- * waits for this dm_table_put and could be called under this lock.
+ * _hash_lock without first calling dm_put_live_table, because dm_table_destroy
+ * waits for this dm_put_live_table and could be called under this lock.
*/
static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx)
{
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index b953db6cc229..03177ca0b009 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -6,6 +6,7 @@
#include <linux/bio.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include <linux/dm-dirty-log.h>
#include <linux/device-mapper.h>
#include <linux/dm-log-userspace.h>
@@ -829,7 +830,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
int r;
uint64_t region64 = region;
struct log_c *lc = log->context;
- static unsigned long long limit;
+ static unsigned long limit;
struct {
int64_t is_recovering;
uint64_t in_sync_hint;
@@ -845,7 +846,7 @@ static int userspace_is_remote_recovering(struct dm_dirty_log *log,
*/
if (region < lc->in_sync_hint)
return 0;
- else if (jiffies < limit)
+ else if (time_after(limit, jiffies))
return 1;
limit = jiffies + (HZ / 4);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7b6b0f0f831a..d376dc87716e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -11,6 +11,7 @@
#include "dm-path-selector.h"
#include "dm-uevent.h"
+#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
@@ -378,18 +379,18 @@ static int __must_push_back(struct multipath *m)
/*
* Map cloned requests
*/
-static int multipath_map(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
+static int __multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context,
+ struct request *rq, struct request **__clone)
{
struct multipath *m = (struct multipath *) ti->private;
int r = DM_MAPIO_REQUEUE;
- size_t nr_bytes = blk_rq_bytes(clone);
- unsigned long flags;
+ size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
@@ -411,25 +412,61 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
/* ENOMEM, requeue */
goto out_unlock;
- bdev = pgpath->path.dev->bdev;
- clone->q = bdev_get_queue(bdev);
- clone->rq_disk = bdev->bd_disk;
- clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
+
+ bdev = pgpath->path.dev->bdev;
+
+ spin_unlock_irq(&m->lock);
+
+ if (clone) {
+ /* Old request-based interface: allocated clone is passed in */
+ clone->q = bdev_get_queue(bdev);
+ clone->rq_disk = bdev->bd_disk;
+ clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ } else {
+ /* blk-mq request-based interface */
+ *__clone = blk_get_request(bdev_get_queue(bdev),
+ rq_data_dir(rq), GFP_KERNEL);
+ if (IS_ERR(*__clone))
+ /* ENOMEM, requeue */
+ return r;
+ (*__clone)->bio = (*__clone)->biotail = NULL;
+ (*__clone)->rq_disk = bdev->bd_disk;
+ (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+ }
+
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
&pgpath->path,
nr_bytes);
- r = DM_MAPIO_REMAPPED;
+ return DM_MAPIO_REMAPPED;
out_unlock:
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
return r;
}
+static int multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ return __multipath_map(ti, clone, map_context, NULL, NULL);
+}
+
+static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return __multipath_map(ti, NULL, map_context, rq, clone);
+}
+
+static void multipath_release_clone(struct request *clone)
+{
+ blk_put_request(clone);
+}
+
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@@ -1666,11 +1703,13 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 7, 0},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
+ .clone_and_map_rq = multipath_clone_and_map,
+ .release_clone_rq = multipath_release_clone,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
@@ -1694,16 +1733,15 @@ static int __init dm_multipath_init(void)
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
- kmem_cache_destroy(_mpio_cache);
- return -EINVAL;
+ r = -EINVAL;
+ goto bad_register_target;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmultipathd;
}
/*
@@ -1716,16 +1754,23 @@ static int __init dm_multipath_init(void)
WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
- destroy_workqueue(kmultipathd);
- dm_unregister_target(&multipath_target);
- kmem_cache_destroy(_mpio_cache);
- return -ENOMEM;
+ r = -ENOMEM;
+ goto bad_alloc_kmpath_handlerd;
}
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
+ return 0;
+
+bad_alloc_kmpath_handlerd:
+ destroy_workqueue(kmultipathd);
+bad_alloc_kmultipathd:
+ dm_unregister_target(&multipath_target);
+bad_register_target:
+ kmem_cache_destroy(_mpio_cache);
+
return r;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 07c0fa0fa284..88e4c7f24986 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
- if (rs->raid_type->level == 1)
- return md_raid1_congested(&rs->md, bits);
-
- if (rs->raid_type->level == 10)
- return md_raid10_congested(&rs->md, bits);
-
- return md_raid5_congested(&rs->md, bits);
+ return mddev_congested(&rs->md, bits);
}
/*
@@ -1243,7 +1237,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
argv++;
/* Skip over RAID params for now and find out # of devices */
- if (num_raid_params + 1 > argc) {
+ if (num_raid_params >= argc) {
ti->error = "Arguments do not agree with counts given";
return -EINVAL;
}
@@ -1254,6 +1248,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
return -EINVAL;
}
+ argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+ if (argc != (num_raid_devs * 2)) {
+ ti->error = "Supplied RAID devices does not match the count given";
+ return -EINVAL;
+ }
+
rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
if (IS_ERR(rs))
return PTR_ERR(rs);
@@ -1262,16 +1262,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (ret)
goto bad;
- ret = -EINVAL;
-
- argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
argv += num_raid_params + 1;
- if (argc != (num_raid_devs * 2)) {
- ti->error = "Supplied RAID devices does not match the count given";
- goto bad;
- }
-
ret = dev_parms(rs, argv);
if (ret)
goto bad;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7dfdb5c746d6..089d62751f7f 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -604,6 +604,15 @@ static void write_callback(unsigned long error, void *context)
return;
}
+ /*
+ * If the bio is discard, return an error, but do not
+ * degrade the array.
+ */
+ if (bio->bi_rw & REQ_DISCARD) {
+ bio_endio(bio, -EOPNOTSUPP);
+ return;
+ }
+
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index d6e88178d22c..808b8419bc48 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -200,16 +200,11 @@ err_area:
static void free_area(struct pstore *ps)
{
- if (ps->area)
- vfree(ps->area);
+ vfree(ps->area);
ps->area = NULL;
-
- if (ps->zero_area)
- vfree(ps->zero_area);
+ vfree(ps->zero_area);
ps->zero_area = NULL;
-
- if (ps->header_area)
- vfree(ps->header_area);
+ vfree(ps->header_area);
ps->header_area = NULL;
}
@@ -605,8 +600,7 @@ static void persistent_dtr(struct dm_exception_store *store)
free_area(ps);
/* Allocated in persistent_read_metadata */
- if (ps->callbacks)
- vfree(ps->callbacks);
+ vfree(ps->callbacks);
kfree(ps);
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 864b03f47727..8b204ae216ab 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1432,8 +1432,6 @@ out:
full_bio->bi_private = pe->full_bio_private;
atomic_inc(&full_bio->bi_remaining);
}
- free_pending_exception(pe);
-
increment_pending_exceptions_done_count();
up_write(&s->lock);
@@ -1450,6 +1448,8 @@ out:
}
retry_origin_bios(s, origin_bios);
+
+ free_pending_exception(pe);
}
static void commit_callback(void *context, int success)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3afae9e062f8..6554d9148927 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -827,10 +827,11 @@ static int dm_table_set_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
+ bool use_blk_mq = false;
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices;
- unsigned live_md_type;
+ unsigned live_md_type = dm_get_md_type(t->md);
for (i = 0; i < t->num_targets; i++) {
tgt = t->targets + i;
@@ -854,8 +855,8 @@ static int dm_table_set_type(struct dm_table *t)
* Determine the type from the live device.
* Default to bio-based if device is new.
*/
- live_md_type = dm_get_md_type(t->md);
- if (live_md_type == DM_TYPE_REQUEST_BASED)
+ if (live_md_type == DM_TYPE_REQUEST_BASED ||
+ live_md_type == DM_TYPE_MQ_REQUEST_BASED)
request_based = 1;
else
bio_based = 1;
@@ -869,16 +870,6 @@ static int dm_table_set_type(struct dm_table *t)
BUG_ON(!request_based); /* No targets in this table */
- /* Non-request-stackable devices can't be used for request-based dm */
- devices = dm_table_get_devices(t);
- list_for_each_entry(dd, devices, list) {
- if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) {
- DMWARN("table load rejected: including"
- " non-request-stackable devices");
- return -EINVAL;
- }
- }
-
/*
* Request-based dm supports only tables that have a single target now.
* To support multiple targets, request splitting support is needed,
@@ -890,7 +881,37 @@ static int dm_table_set_type(struct dm_table *t)
return -EINVAL;
}
- t->type = DM_TYPE_REQUEST_BASED;
+ /* Non-request-stackable devices can't be used for request-based dm */
+ devices = dm_table_get_devices(t);
+ list_for_each_entry(dd, devices, list) {
+ struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
+
+ if (!blk_queue_stackable(q)) {
+ DMERR("table load rejected: including"
+ " non-request-stackable devices");
+ return -EINVAL;
+ }
+
+ if (q->mq_ops)
+ use_blk_mq = true;
+ }
+
+ if (use_blk_mq) {
+ /* verify _all_ devices in the table are blk-mq devices */
+ list_for_each_entry(dd, devices, list)
+ if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
+ DMERR("table load rejected: not all devices"
+ " are blk-mq request-stackable");
+ return -EINVAL;
+ }
+ t->type = DM_TYPE_MQ_REQUEST_BASED;
+
+ } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) {
+ /* inherit live MD type */
+ t->type = live_md_type;
+
+ } else
+ t->type = DM_TYPE_REQUEST_BASED;
return 0;
}
@@ -907,7 +928,15 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
bool dm_table_request_based(struct dm_table *t)
{
- return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+ unsigned table_type = dm_table_get_type(t);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
+bool dm_table_mq_request_based(struct dm_table *t)
+{
+ return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
}
static int dm_table_alloc_md_mempools(struct dm_table *t)
@@ -1360,6 +1389,14 @@ static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
}
+static int queue_supports_sg_gaps(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct request_queue *q = bdev_get_queue(dev->bdev);
+
+ return q && !test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags);
+}
+
static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
@@ -1480,6 +1517,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+ if (dm_table_all_devices_attribute(t, queue_supports_sg_gaps))
+ queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q);
+
dm_table_set_integrity(t);
/*
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 242e3cec397a..925ec1b15e75 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -137,13 +137,26 @@ static int io_err_map_rq(struct dm_target *ti, struct request *clone,
return -EIO;
}
+static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
+ union map_info *map_context,
+ struct request **clone)
+{
+ return -EIO;
+}
+
+static void io_err_release_clone_rq(struct request *clone)
+{
+}
+
static struct target_type error_target = {
.name = "error",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.ctr = io_err_ctr,
.dtr = io_err_dtr,
.map = io_err_map,
.map_rq = io_err_map_rq,
+ .clone_and_map_rq = io_err_clone_and_map_rq,
+ .release_clone_rq = io_err_release_clone_rq,
};
int __init dm_target_init(void)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 43adbb863f5a..79f694120ddf 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1635,15 +1635,6 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
return r;
}
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
-{
- down_read(&pmd->root_lock);
- *result = pmd->data_block_size;
- up_read(&pmd->root_lock);
-
- return 0;
-}
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 921d15ee56a0..fac01a96d303 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -182,8 +182,6 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
dm_block_t *result);
-int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
-
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 07705ee181e3..654773cb1eee 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -11,6 +11,7 @@
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
+#include <linux/jiffies.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
@@ -1700,8 +1701,8 @@ static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell
*/
static int need_commit_due_to_time(struct pool *pool)
{
- return jiffies < pool->last_commit_jiffies ||
- jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+ return !time_in_range(jiffies, pool->last_commit_jiffies,
+ pool->last_commit_jiffies + COMMIT_PERIOD);
}
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2caf5b374649..73f28802dc7a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,6 +20,7 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
+#include <linux/kthread.h>
#include <trace/events/block.h>
@@ -78,7 +79,8 @@ struct dm_io {
struct dm_rq_target_io {
struct mapped_device *md;
struct dm_target *ti;
- struct request *orig, clone;
+ struct request *orig, *clone;
+ struct kthread_work work;
int error;
union map_info info;
};
@@ -179,6 +181,7 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
@@ -210,6 +213,9 @@ struct mapped_device {
unsigned internal_suspend_count;
struct dm_stats stats;
+
+ struct kthread_worker kworker;
+ struct task_struct *kworker_task;
};
/*
@@ -217,6 +223,7 @@ struct mapped_device {
*/
struct dm_md_mempools {
mempool_t *io_pool;
+ mempool_t *rq_pool;
struct bio_set *bs;
};
@@ -231,6 +238,7 @@ struct table_device {
#define RESERVED_MAX_IOS 1024
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_cache;
/*
* Bio-based DM's mempools' reserved IOs set by the user.
@@ -288,9 +296,14 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
+ _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ __alignof__(struct request), 0, NULL);
+ if (!_rq_cache)
+ goto out_free_rq_tio_cache;
+
r = dm_uevent_init();
if (r)
- goto out_free_rq_tio_cache;
+ goto out_free_rq_cache;
deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
if (!deferred_remove_workqueue) {
@@ -312,6 +325,8 @@ out_free_workqueue:
destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
+out_free_rq_cache:
+ kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
@@ -325,6 +340,7 @@ static void local_exit(void)
flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
+ kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -577,6 +593,17 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
mempool_free(tio, tio->md->io_pool);
}
+static struct request *alloc_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
+{
+ return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_clone_request(struct mapped_device *md, struct request *rq)
+{
+ mempool_free(rq, md->rq_pool);
+}
+
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
@@ -992,7 +1019,7 @@ static void end_clone_bio(struct bio *clone, int error)
* the md may be freed in dm_put() at the end of this function.
* Or do dm_get() before calling this function and dm_put() later.
*/
-static void rq_completed(struct mapped_device *md, int rw, int run_queue)
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
atomic_dec(&md->pending[rw]);
@@ -1020,12 +1047,17 @@ static void free_rq_clone(struct request *clone)
struct dm_rq_target_io *tio = clone->end_io_data;
blk_rq_unprep_clone(clone);
+ if (clone->q && clone->q->mq_ops)
+ tio->ti->type->release_clone_rq(clone);
+ else
+ free_clone_request(tio->md, clone);
free_rq_tio(tio);
}
/*
* Complete the clone and the original request.
- * Must be called without queue lock.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
*/
static void dm_end_request(struct request *clone, int error)
{
@@ -1054,23 +1086,23 @@ static void dm_end_request(struct request *clone, int error)
static void dm_unprep_request(struct request *rq)
{
- struct request *clone = rq->special;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
rq->special = NULL;
rq->cmd_flags &= ~REQ_DONTPREP;
- free_rq_clone(clone);
+ if (clone)
+ free_rq_clone(clone);
}
/*
* Requeue the original request of a clone.
*/
-void dm_requeue_unmapped_request(struct request *clone)
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+ struct request *rq)
{
- int rw = rq_data_dir(clone);
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct mapped_device *md = tio->md;
- struct request *rq = tio->orig;
+ int rw = rq_data_dir(rq);
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1080,9 +1112,15 @@ void dm_requeue_unmapped_request(struct request *clone)
blk_requeue_request(q, rq);
spin_unlock_irqrestore(q->queue_lock, flags);
- rq_completed(md, rw, 0);
+ rq_completed(md, rw, false);
+}
+
+static void dm_requeue_unmapped_request(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ dm_requeue_unmapped_original_request(tio->md, tio->orig);
}
-EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
static void __stop_queue(struct request_queue *q)
{
@@ -1151,8 +1189,15 @@ static void dm_done(struct request *clone, int error, bool mapped)
static void dm_softirq_done(struct request *rq)
{
bool mapped = true;
- struct request *clone = rq->completion_data;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = tio->clone;
+
+ if (!clone) {
+ blk_end_request_all(rq, tio->error);
+ rq_completed(tio->md, rq_data_dir(rq), false);
+ free_rq_tio(tio);
+ return;
+ }
if (rq->cmd_flags & REQ_FAILED)
mapped = false;
@@ -1164,13 +1209,11 @@ static void dm_softirq_done(struct request *rq)
* Complete the clone and the original request with the error status
* through softirq context.
*/
-static void dm_complete_request(struct request *clone, int error)
+static void dm_complete_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
+ struct dm_rq_target_io *tio = rq->special;
tio->error = error;
- rq->completion_data = clone;
blk_complete_request(rq);
}
@@ -1178,40 +1221,40 @@ static void dm_complete_request(struct request *clone, int error)
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() function fails.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
*/
-void dm_kill_unmapped_request(struct request *clone, int error)
+static void dm_kill_unmapped_request(struct request *rq, int error)
{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *rq = tio->orig;
-
rq->cmd_flags |= REQ_FAILED;
- dm_complete_request(clone, error);
+ dm_complete_request(rq, error);
}
-EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
/*
- * Called with the queue lock held
+ * Called with the clone's queue lock held
*/
static void end_clone_request(struct request *clone, int error)
{
- /*
- * For just cleaning up the information of the queue in which
- * the clone was dispatched.
- * The clone is *NOT* freed actually here because it is alloced from
- * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
- */
- __blk_put_request(clone->q, clone);
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ if (!clone->q->mq_ops) {
+ /*
+ * For just cleaning up the information of the queue in which
+ * the clone was dispatched.
+ * The clone is *NOT* freed actually here because it is alloced
+ * from dm own mempool (REQ_ALLOCED isn't set).
+ */
+ __blk_put_request(clone->q, clone);
+ }
/*
* Actual request completion is done in a softirq context which doesn't
- * hold the queue lock. Otherwise, deadlock could occur because:
+ * hold the clone's queue lock. Otherwise, deadlock could occur because:
* - another request may be submitted by the upper level driver
* of the stacking during the completion
* - the submission which requires queue lock may be done
- * against this queue
+ * against this clone's queue
*/
- dm_complete_request(clone, error);
+ dm_complete_request(tio->orig, error);
}
/*
@@ -1689,19 +1732,19 @@ static void dm_request(struct request_queue *q, struct bio *bio)
_dm_request(q, bio);
}
-void dm_dispatch_request(struct request *rq)
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
int r;
- if (blk_queue_io_stat(rq->q))
- rq->cmd_flags |= REQ_IO_STAT;
+ if (blk_queue_io_stat(clone->q))
+ clone->cmd_flags |= REQ_IO_STAT;
- rq->start_time = jiffies;
- r = blk_insert_cloned_request(rq->q, rq);
+ clone->start_time = jiffies;
+ r = blk_insert_cloned_request(clone->q, clone);
if (r)
+ /* must complete clone in terms of original request */
dm_complete_request(rq, r);
}
-EXPORT_SYMBOL_GPL(dm_dispatch_request);
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
@@ -1718,11 +1761,11 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
}
static int setup_clone(struct request *clone, struct request *rq,
- struct dm_rq_target_io *tio)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
int r;
- r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
dm_rq_bio_constructor, tio);
if (r)
return r;
@@ -1733,14 +1776,37 @@ static int setup_clone(struct request *clone, struct request *rq,
clone->end_io = end_clone_request;
clone->end_io_data = tio;
+ tio->clone = clone;
+
return 0;
}
static struct request *clone_rq(struct request *rq, struct mapped_device *md,
- gfp_t gfp_mask)
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+ struct request *clone = alloc_clone_request(md, gfp_mask);
+
+ if (!clone)
+ return NULL;
+
+ blk_rq_init(NULL, clone);
+ if (setup_clone(clone, rq, tio, gfp_mask)) {
+ /* -ENOMEM */
+ free_clone_request(md, clone);
+ return NULL;
+ }
+
+ return clone;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+static struct dm_rq_target_io *prep_tio(struct request *rq,
+ struct mapped_device *md, gfp_t gfp_mask)
{
- struct request *clone;
struct dm_rq_target_io *tio;
+ int srcu_idx;
+ struct dm_table *table;
tio = alloc_rq_tio(md, gfp_mask);
if (!tio)
@@ -1748,18 +1814,23 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
tio->md = md;
tio->ti = NULL;
+ tio->clone = NULL;
tio->orig = rq;
tio->error = 0;
memset(&tio->info, 0, sizeof(tio->info));
-
- clone = &tio->clone;
- if (setup_clone(clone, rq, tio)) {
- /* -ENOMEM */
- free_rq_tio(tio);
- return NULL;
+ init_kthread_work(&tio->work, map_tio_request);
+
+ table = dm_get_live_table(md, &srcu_idx);
+ if (!dm_table_mq_request_based(table)) {
+ if (!clone_rq(rq, md, tio, gfp_mask)) {
+ dm_put_live_table(md, srcu_idx);
+ free_rq_tio(tio);
+ return NULL;
+ }
}
+ dm_put_live_table(md, srcu_idx);
- return clone;
+ return tio;
}
/*
@@ -1768,18 +1839,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
- struct request *clone;
+ struct dm_rq_target_io *tio;
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
}
- clone = clone_rq(rq, md, GFP_ATOMIC);
- if (!clone)
+ tio = prep_tio(rq, md, GFP_ATOMIC);
+ if (!tio)
return BLKPREP_DEFER;
- rq->special = clone;
+ rq->special = tio;
rq->cmd_flags |= REQ_DONTPREP;
return BLKPREP_OK;
@@ -1787,17 +1858,36 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
/*
* Returns:
- * 0 : the request has been processed (not requeued)
- * !0 : the request has been requeued
+ * 0 : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0 : the request was completed due to failure
*/
-static int map_request(struct dm_target *ti, struct request *clone,
+static int map_request(struct dm_target *ti, struct request *rq,
struct mapped_device *md)
{
- int r, requeued = 0;
- struct dm_rq_target_io *tio = clone->end_io_data;
+ int r;
+ struct dm_rq_target_io *tio = rq->special;
+ struct request *clone = NULL;
+
+ if (tio->clone) {
+ clone = tio->clone;
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ } else {
+ r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+ if (r < 0) {
+ /* The target wants to complete the I/O */
+ dm_kill_unmapped_request(rq, r);
+ return r;
+ }
+ if (IS_ERR(clone))
+ return DM_MAPIO_REQUEUE;
+ if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+ /* -ENOMEM */
+ ti->type->release_clone_rq(clone);
+ return DM_MAPIO_REQUEUE;
+ }
+ }
- tio->ti = ti;
- r = ti->type->map_rq(ti, clone, &tio->info);
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -1805,13 +1895,12 @@ static int map_request(struct dm_target *ti, struct request *clone,
case DM_MAPIO_REMAPPED:
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
- blk_rq_pos(tio->orig));
- dm_dispatch_request(clone);
+ blk_rq_pos(rq));
+ dm_dispatch_clone_request(clone, rq);
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
dm_requeue_unmapped_request(clone);
- requeued = 1;
break;
default:
if (r > 0) {
@@ -1820,20 +1909,27 @@ static int map_request(struct dm_target *ti, struct request *clone,
}
/* The target wants to complete the I/O */
- dm_kill_unmapped_request(clone, r);
- break;
+ dm_kill_unmapped_request(rq, r);
+ return r;
}
- return requeued;
+ return 0;
}
-static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+static void map_tio_request(struct kthread_work *work)
{
- struct request *clone;
+ struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+ struct request *rq = tio->orig;
+ struct mapped_device *md = tio->md;
+
+ if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+ dm_requeue_unmapped_original_request(md, rq);
+}
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
blk_start_request(orig);
- clone = orig->special;
- atomic_inc(&md->pending[rq_data_dir(clone)]);
+ atomic_inc(&md->pending[rq_data_dir(orig)]);
/*
* Hold the md reference here for the in-flight I/O.
@@ -1843,8 +1939,6 @@ static struct request *dm_start_request(struct mapped_device *md, struct request
* See the comment in rq_completed() too.
*/
dm_get(md);
-
- return clone;
}
/*
@@ -1857,7 +1951,8 @@ static void dm_request_fn(struct request_queue *q)
int srcu_idx;
struct dm_table *map = dm_get_live_table(md, &srcu_idx);
struct dm_target *ti;
- struct request *rq, *clone;
+ struct request *rq;
+ struct dm_rq_target_io *tio;
sector_t pos;
/*
@@ -1879,34 +1974,29 @@ static void dm_request_fn(struct request_queue *q)
ti = dm_table_find_target(map, pos);
if (!dm_target_is_valid(ti)) {
/*
- * Must perform setup, that dm_done() requires,
+ * Must perform setup, that rq_completed() requires,
* before calling dm_kill_unmapped_request
*/
DMERR_LIMIT("request attempted access beyond the end of device");
- clone = dm_start_request(md, rq);
- dm_kill_unmapped_request(clone, -EIO);
+ dm_start_request(md, rq);
+ dm_kill_unmapped_request(rq, -EIO);
continue;
}
if (ti->type->busy && ti->type->busy(ti))
goto delay_and_out;
- clone = dm_start_request(md, rq);
-
- spin_unlock(q->queue_lock);
- if (map_request(ti, clone, md))
- goto requeued;
+ dm_start_request(md, rq);
+ tio = rq->special;
+ /* Establish tio->ti before queuing work (map_tio_request) */
+ tio->ti = ti;
+ queue_kthread_work(&md->kworker, &tio->work);
BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
}
goto out;
-requeued:
- BUG_ON(!irqs_disabled());
- spin_lock(q->queue_lock);
-
delay_and_out:
blk_delay_queue(q, HZ / 10);
out:
@@ -2092,6 +2182,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
+ md->kworker_task = NULL;
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -2152,8 +2243,13 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
+
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
if (md->io_pool)
mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
@@ -2187,23 +2283,24 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
bioset_free(md->bs);
md->bs = p->bs;
p->bs = NULL;
- } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
- /*
- * There's no need to reload with request-based dm
- * because the size of front_pad doesn't change.
- * Note for future: If you are to reload bioset,
- * prep-ed requests in the queue may refer
- * to bio from the old bioset, so you must walk
- * through the queue to unprep.
- */
}
+ /*
+ * There's no need to reload with request-based dm
+ * because the size of front_pad doesn't change.
+ * Note for future: If you are to reload bioset,
+ * prep-ed requests in the queue may refer
+ * to bio from the old bioset, so you must walk
+ * through the queue to unprep.
+ */
goto out;
}
- BUG_ON(!p || md->io_pool || md->bs);
+ BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
+ md->rq_pool = p->rq_pool;
+ p->rq_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
@@ -2406,6 +2503,14 @@ unsigned dm_get_md_type(struct mapped_device *md)
return md->type;
}
+static bool dm_md_type_request_based(struct mapped_device *md)
+{
+ unsigned table_type = dm_get_md_type(md);
+
+ return (table_type == DM_TYPE_REQUEST_BASED ||
+ table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
return md->immutable_target_type;
@@ -2443,6 +2548,11 @@ static int dm_init_request_based_queue(struct mapped_device *md)
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
+ /* Also initialize the request-based DM worker thread */
+ init_kthread_worker(&md->kworker);
+ md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+ "kdmwork-%s", dm_device_name(md));
+
elv_register_queue(md->queue);
return 1;
@@ -2453,8 +2563,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
*/
int dm_setup_md_queue(struct mapped_device *md)
{
- if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
- !dm_init_request_based_queue(md)) {
+ if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
DMWARN("Cannot initialize queue for request-based mapped device");
return -EINVAL;
}
@@ -2462,7 +2571,7 @@ int dm_setup_md_queue(struct mapped_device *md)
return 0;
}
-static struct mapped_device *dm_find_md(dev_t dev)
+struct mapped_device *dm_get_md(dev_t dev)
{
struct mapped_device *md;
unsigned minor = MINOR(dev);
@@ -2473,12 +2582,15 @@ static struct mapped_device *dm_find_md(dev_t dev)
spin_lock(&_minor_lock);
md = idr_find(&_minor_idr, minor);
- if (md && (md == MINOR_ALLOCED ||
- (MINOR(disk_devt(dm_disk(md))) != minor) ||
- dm_deleting_md(md) ||
- test_bit(DMF_FREEING, &md->flags))) {
- md = NULL;
- goto out;
+ if (md) {
+ if ((md == MINOR_ALLOCED ||
+ (MINOR(disk_devt(dm_disk(md))) != minor) ||
+ dm_deleting_md(md) ||
+ test_bit(DMF_FREEING, &md->flags))) {
+ md = NULL;
+ goto out;
+ }
+ dm_get(md);
}
out:
@@ -2486,16 +2598,6 @@ out:
return md;
}
-
-struct mapped_device *dm_get_md(dev_t dev)
-{
- struct mapped_device *md = dm_find_md(dev);
-
- if (md)
- dm_get(md);
-
- return md;
-}
EXPORT_SYMBOL_GPL(dm_get_md);
void *dm_get_mdptr(struct mapped_device *md)
@@ -2533,6 +2635,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
+ if (dm_request_based(md))
+ flush_kthread_worker(&md->kworker);
+
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
@@ -2776,8 +2881,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md))
+ if (dm_request_based(md)) {
stop_queue(md->queue);
+ flush_kthread_worker(&md->kworker);
+ }
flush_workqueue(md->wq);
@@ -3123,24 +3230,35 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
{
struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
struct kmem_cache *cachep;
- unsigned int pool_size;
+ unsigned int pool_size = 0;
unsigned int front_pad;
if (!pools)
return NULL;
- if (type == DM_TYPE_BIO_BASED) {
+ switch (type) {
+ case DM_TYPE_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
- } else if (type == DM_TYPE_REQUEST_BASED) {
- cachep = _rq_tio_cache;
+ break;
+ case DM_TYPE_REQUEST_BASED:
pool_size = dm_get_reserved_rq_based_ios();
+ pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+ if (!pools->rq_pool)
+ goto out;
+ /* fall through to setup remaining rq-based pools */
+ case DM_TYPE_MQ_REQUEST_BASED:
+ cachep = _rq_tio_cache;
+ if (!pool_size)
+ pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_bio_data_size is not used. See __bind_mempools(). */
WARN_ON(per_bio_data_size != 0);
- } else
+ break;
+ default:
goto out;
+ }
pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
if (!pools->io_pool)
@@ -3169,6 +3287,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
if (pools->io_pool)
mempool_destroy(pools->io_pool);
+ if (pools->rq_pool)
+ mempool_destroy(pools->rq_pool);
+
if (pools->bs)
bioset_free(pools->bs);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 84b0f9e4ba6c..59f53e79db82 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -34,9 +34,10 @@
/*
* Type of table and mapped_device's mempool
*/
-#define DM_TYPE_NONE 0
-#define DM_TYPE_BIO_BASED 1
-#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_NONE 0
+#define DM_TYPE_BIO_BASED 1
+#define DM_TYPE_REQUEST_BASED 2
+#define DM_TYPE_MQ_REQUEST_BASED 3
/*
* List of devices that a metadevice uses and should open/close.
@@ -73,6 +74,7 @@ int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
@@ -99,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md);
/*
* To check whether the target type is request-based or not (bio-based).
*/
-#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
+#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
+ ((t)->type->clone_and_map_rq != NULL))
/*
* To check whether the target type is a hybrid (capable of being
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index e8b4574956c7..1277eb26b58a 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -332,13 +332,11 @@ static int run(struct mddev *mddev)
return 0;
}
-static int stop(struct mddev *mddev)
+static void faulty_free(struct mddev *mddev, void *priv)
{
- struct faulty_conf *conf = mddev->private;
+ struct faulty_conf *conf = priv;
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality faulty_personality =
@@ -348,7 +346,7 @@ static struct md_personality faulty_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = faulty_free,
.status = status,
.check_reshape = reshape,
.size = faulty_size,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64713b77df1c..fa7d577f3d12 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
lo = 0;
hi = mddev->raid_disks - 1;
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
/*
* Binary Search
@@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
*
* Return amount of bytes we can take at this offset
*/
-static int linear_mergeable_bvec(struct request_queue *q,
+static int linear_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int maxbytes = biovec->bv_len;
struct request_queue *subq;
- rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
subq = bdev_get_queue(dev0->rdev->bdev);
@@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
biovec));
}
- rcu_read_unlock();
if (maxsectors < bio_sectors)
maxsectors = 0;
@@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q,
return maxsectors << 9;
}
-static int linear_congested(void *data, int bits)
+static int linear_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct linear_conf *conf;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
for (i = 0; i < mddev->raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
ret |= bdi_congested(&q->backing_dev_info, bits);
}
- rcu_read_unlock();
return ret;
}
@@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk
struct linear_conf *conf;
sector_t array_sectors;
- rcu_read_lock();
- conf = rcu_dereference(mddev->private);
+ conf = mddev->private;
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
array_sectors = conf->array_sectors;
- rcu_read_unlock();
return array_sectors;
}
@@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev)
mddev->private = conf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
- blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
- mddev->queue->backing_dev_info.congested_fn = linear_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
ret = md_integrity_register(mddev);
if (ret) {
kfree(conf);
@@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
if (!newconf)
return -ENOMEM;
- oldconf = rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ mddev_suspend(mddev);
+ oldconf = mddev->private;
mddev->raid_disks++;
- rcu_assign_pointer(mddev->private, newconf);
+ mddev->private = newconf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ mddev_resume(mddev);
revalidate_disk(mddev->gendisk);
- kfree_rcu(oldconf, rcu);
+ kfree(oldconf);
return 0;
}
-static int linear_stop (struct mddev *mddev)
+static void linear_free(struct mddev *mddev, void *priv)
{
- struct linear_conf *conf =
- rcu_dereference_protected(mddev->private,
- lockdep_is_held(
- &mddev->reconfig_mutex));
+ struct linear_conf *conf = priv;
- /*
- * We do not require rcu protection here since
- * we hold reconfig_mutex for both linear_add and
- * linear_stop, so they cannot race.
- * We should make sure any old 'conf's are properly
- * freed though.
- */
- rcu_barrier();
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf);
- mddev->private = NULL;
-
- return 0;
}
static void linear_make_request(struct mddev *mddev, struct bio *bio)
@@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
}
do {
- rcu_read_lock();
-
tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
end_sector = tmp_dev->end_sector;
data_offset = tmp_dev->rdev->data_offset;
bio->bi_bdev = tmp_dev->rdev->bdev;
- rcu_read_unlock();
-
if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
bio->bi_iter.bi_sector < start_sector))
goto out_of_bounds;
@@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
static struct md_personality linear_personality =
{
.name = "linear",
@@ -362,10 +332,13 @@ static struct md_personality linear_personality =
.owner = THIS_MODULE,
.make_request = linear_make_request,
.run = linear_run,
- .stop = linear_stop,
+ .free = linear_free,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
+ .quiesce = linear_quiesce,
+ .congested = linear_congested,
+ .mergeable_bvec = linear_mergeable_bvec,
};
static int __init linear_init (void)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 709755fb6d7b..cadf9cc02b25 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
+static void mddev_detach(struct mddev *mddev);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
/* mddev_suspend makes sure no new requests are submitted
* to the device, and that any requests that have been submitted
* are completely handled.
- * Once ->stop is called and completes, the module will be completely
- * unused.
+ * Once mddev_detach() is called and completes, the module will be
+ * completely unused.
*/
void mddev_suspend(struct mddev *mddev)
{
@@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(struct mddev *mddev, int bits)
{
- return mddev->suspended;
+ struct md_personality *pers = mddev->pers;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (mddev->suspended)
+ ret = 1;
+ else if (pers && pers->congested)
+ ret = pers->congested(mddev, bits);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_congested);
+static int md_congested(void *data, int bits)
+{
+ struct mddev *mddev = data;
+ return mddev_congested(mddev, bits);
}
-EXPORT_SYMBOL(mddev_congested);
+static int md_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ int ret;
+ rcu_read_lock();
+ if (mddev->suspended) {
+ /* Must always allow one vec */
+ if (bvm->bi_size == 0)
+ ret = biovec->bv_len;
+ else
+ ret = 0;
+ } else {
+ struct md_personality *pers = mddev->pers;
+ if (pers && pers->mergeable_bvec)
+ ret = pers->mergeable_bvec(mddev, bvm, biovec);
+ else
+ ret = biovec->bv_len;
+ }
+ rcu_read_unlock();
+ return ret;
+}
/*
* Generic flush handling for md
*/
@@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws)
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
- spin_lock_irq(&mddev->write_lock);
+ spin_lock_irq(&mddev->lock);
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio,
- mddev->write_lock);
+ mddev->lock);
mddev->flush_bio = bio;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
@@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
- spin_lock_init(&mddev->write_lock);
+ spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
@@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit)
goto retry;
}
-static inline int __must_check mddev_lock(struct mddev *mddev)
-{
- return mutex_lock_interruptible(&mddev->reconfig_mutex);
-}
-
-/* Sometimes we need to take the lock in a situation where
- * failure due to interrupts is not acceptable.
- */
-static inline void mddev_lock_nointr(struct mddev *mddev)
-{
- mutex_lock(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_is_locked(struct mddev *mddev)
-{
- return mutex_is_locked(&mddev->reconfig_mutex);
-}
-
-static inline int mddev_trylock(struct mddev *mddev)
-{
- return mutex_trylock(&mddev->reconfig_mutex);
-}
-
static struct attribute_group md_redundancy_group;
-static void mddev_unlock(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
/* These cannot be removed under reconfig_mutex as
@@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev)
md_wakeup_thread(mddev->thread);
spin_unlock(&pers_lock);
}
+EXPORT_SYMBOL_GPL(mddev_unlock);
static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
@@ -2230,7 +2246,7 @@ repeat:
return;
}
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
mddev->utime = get_seconds();
@@ -2287,7 +2303,7 @@ repeat:
}
sync_sbs(mddev, nospares);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
@@ -2326,15 +2342,15 @@ repeat:
md_super_wait(mddev);
/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync != sync_req ||
test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
/* have to write it out again */
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
goto repeat;
}
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page)
{
char *sep = "";
size_t len = 0;
+ unsigned long flags = ACCESS_ONCE(rdev->flags);
- if (test_bit(Faulty, &rdev->flags) ||
+ if (test_bit(Faulty, &flags) ||
rdev->badblocks.unacked_exist) {
len+= sprintf(page+len, "%sfaulty",sep);
sep = ",";
}
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
- if (test_bit(WriteMostly, &rdev->flags)) {
+ if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
}
- if (test_bit(Blocked, &rdev->flags) ||
+ if (test_bit(Blocked, &flags) ||
(rdev->badblocks.unacked_exist
- && !test_bit(Faulty, &rdev->flags))) {
+ && !test_bit(Faulty, &flags))) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
- if (!test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags)) {
+ if (!test_bit(Faulty, &flags) &&
+ !test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
}
- if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (test_bit(WriteErrorSeen, &flags)) {
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
- if (test_bit(WantReplacement, &rdev->flags)) {
+ if (test_bit(WantReplacement, &flags)) {
len += sprintf(page+len, "%swant_replacement", sep);
sep = ",";
}
- if (test_bit(Replacement, &rdev->flags)) {
+ if (test_bit(Replacement, &flags)) {
len += sprintf(page+len, "%sreplacement", sep);
sep = ",";
}
@@ -2538,7 +2555,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
-__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
+__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
errors_show(struct md_rdev *rdev, char *page)
@@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
- struct mddev *mddev = rdev->mddev;
- ssize_t rv;
if (!entry->show)
return -EIO;
-
- rv = mddev ? mddev_lock(mddev) : -EBUSY;
- if (!rv) {
- if (rdev->mddev == NULL)
- rv = -EBUSY;
- else
- rv = entry->show(rdev, page);
- mddev_unlock(mddev);
- }
- return rv;
+ if (!rdev->mddev)
+ return -EBUSY;
+ return entry->show(rdev, page);
}
static ssize_t
@@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
mddev->safemode_delay = 0;
else {
unsigned long old_delay = mddev->safemode_delay;
- mddev->safemode_delay = (msec*HZ)/1000;
- if (mddev->safemode_delay == 0)
- mddev->safemode_delay = 1;
- if (mddev->safemode_delay < old_delay || old_delay == 0)
- md_safemode_timeout((unsigned long)mddev);
+ unsigned long new_delay = (msec*HZ)/1000;
+
+ if (new_delay == 0)
+ new_delay = 1;
+ mddev->safemode_delay = new_delay;
+ if (new_delay < old_delay || old_delay == 0)
+ mod_timer(&mddev->safemode_timer, jiffies+1);
}
return len;
}
@@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
static ssize_t
level_show(struct mddev *mddev, char *page)
{
- struct md_personality *p = mddev->pers;
+ struct md_personality *p;
+ int ret;
+ spin_lock(&mddev->lock);
+ p = mddev->pers;
if (p)
- return sprintf(page, "%s\n", p->name);
+ ret = sprintf(page, "%s\n", p->name);
else if (mddev->clevel[0])
- return sprintf(page, "%s\n", mddev->clevel);
+ ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
- return sprintf(page, "%d\n", mddev->level);
+ ret = sprintf(page, "%d\n", mddev->level);
else
- return 0;
+ ret = 0;
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
char clevel[16];
- ssize_t rv = len;
- struct md_personality *pers;
+ ssize_t rv;
+ size_t slen = len;
+ struct md_personality *pers, *oldpers;
long level;
- void *priv;
+ void *priv, *oldpriv;
struct md_rdev *rdev;
+ if (slen == 0 || slen >= sizeof(clevel))
+ return -EINVAL;
+
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
if (mddev->pers == NULL) {
- if (len == 0)
- return 0;
- if (len >= sizeof(mddev->clevel))
- return -ENOSPC;
- strncpy(mddev->clevel, buf, len);
- if (mddev->clevel[len-1] == '\n')
- len--;
- mddev->clevel[len] = 0;
+ strncpy(mddev->clevel, buf, slen);
+ if (mddev->clevel[slen-1] == '\n')
+ slen--;
+ mddev->clevel[slen] = 0;
mddev->level = LEVEL_NONE;
- return rv;
+ rv = len;
+ goto out_unlock;
}
+ rv = -EROFS;
if (mddev->ro)
- return -EROFS;
+ goto out_unlock;
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
@@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
* - new personality will access other array.
*/
+ rv = -EBUSY;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
- return -EBUSY;
+ goto out_unlock;
+ rv = -EINVAL;
if (!mddev->pers->quiesce) {
printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
mdname(mddev), mddev->pers->name);
- return -EINVAL;
+ goto out_unlock;
}
/* Now find the new personality */
- if (len == 0 || len >= sizeof(clevel))
- return -EINVAL;
- strncpy(clevel, buf, len);
- if (clevel[len-1] == '\n')
- len--;
- clevel[len] = 0;
+ strncpy(clevel, buf, slen);
+ if (clevel[slen-1] == '\n')
+ slen--;
+ clevel[slen] = 0;
if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;
@@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (!pers || !try_module_get(pers->owner)) {
spin_unlock(&pers_lock);
printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
spin_unlock(&pers_lock);
if (pers == mddev->pers) {
/* Nothing to do! */
module_put(pers->owner);
- return rv;
+ rv = len;
+ goto out_unlock;
}
if (!pers->takeover) {
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
- return -EINVAL;
+ rv = -EINVAL;
+ goto out_unlock;
}
rdev_for_each(rdev, mddev)
@@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s would not accept array\n",
mdname(mddev), clevel);
- return PTR_ERR(priv);
+ rv = PTR_ERR(priv);
+ goto out_unlock;
}
/* Looks like we have a winner */
mddev_suspend(mddev);
- mddev->pers->stop(mddev);
+ mddev_detach(mddev);
- if (mddev->pers->sync_request == NULL &&
- pers->sync_request != NULL) {
- /* need to add the md_redundancy_group */
- if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
- printk(KERN_WARNING
- "md: cannot register extra attributes for %s\n",
- mdname(mddev));
- mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
- }
- if (mddev->pers->sync_request != NULL &&
- pers->sync_request == NULL) {
- /* need to remove the md_redundancy_group */
- if (mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- }
+ spin_lock(&mddev->lock);
+ oldpers = mddev->pers;
+ oldpriv = mddev->private;
+ mddev->pers = pers;
+ mddev->private = priv;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+ mddev->level = mddev->new_level;
+ mddev->layout = mddev->new_layout;
+ mddev->chunk_sectors = mddev->new_chunk_sectors;
+ mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ mddev->degraded = 0;
+ spin_unlock(&mddev->lock);
- if (mddev->pers->sync_request == NULL &&
+ if (oldpers->sync_request == NULL &&
mddev->external) {
/* We are converting from a no-redundancy array
* to a redundancy array and metadata is managed
@@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
+ oldpers->free(mddev, oldpriv);
+
+ if (oldpers->sync_request == NULL &&
+ pers->sync_request != NULL) {
+ /* need to add the md_redundancy_group */
+ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+ printk(KERN_WARNING
+ "md: cannot register extra attributes for %s\n",
+ mdname(mddev));
+ mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+ }
+ if (oldpers->sync_request != NULL &&
+ pers->sync_request == NULL) {
+ /* need to remove the md_redundancy_group */
+ if (mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ }
+
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
@@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
}
- module_put(mddev->pers->owner);
- mddev->pers = pers;
- mddev->private = priv;
- strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- mddev->level = mddev->new_level;
- mddev->layout = mddev->new_layout;
- mddev->chunk_sectors = mddev->new_chunk_sectors;
- mddev->delta_disks = 0;
- mddev->reshape_backwards = 0;
- mddev->degraded = 0;
- if (mddev->pers->sync_request == NULL) {
+ if (pers->sync_request == NULL) {
/* this is now an array without redundancy, so
* it must always be in_sync
*/
@@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
md_update_sb(mddev, 1);
sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev);
+ rv = len;
+out_unlock:
+ mddev_unlock(mddev);
return rv;
}
@@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
+ int err;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_layout = n;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_layout = mddev->layout;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_layout = n;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_layout = mddev->layout;
}
} else {
mddev->new_layout = n;
if (mddev->reshape_position == MaxSector)
mddev->layout = n;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_layout =
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
@@ -3483,32 +3521,39 @@ static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
- int rv = 0;
+ int err;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers)
- rv = update_raid_disks(mddev, n);
+ err = update_raid_disks(mddev, n);
else if (mddev->reshape_position != MaxSector) {
struct md_rdev *rdev;
int olddisks = mddev->raid_disks - mddev->delta_disks;
+ err = -EINVAL;
rdev_for_each(rdev, mddev) {
if (olddisks < n &&
rdev->data_offset < rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
if (olddisks > n &&
rdev->data_offset > rdev->new_data_offset)
- return -EINVAL;
+ goto out_unlock;
}
+ err = 0;
mddev->delta_disks = n - olddisks;
mddev->raid_disks = n;
mddev->reshape_backwards = (mddev->delta_disks < 0);
} else
mddev->raid_disks = n;
- return rv ? rv : len;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ? err : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
@@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
- int err;
if (mddev->pers->check_reshape == NULL)
- return -EBUSY;
- if (mddev->ro)
- return -EROFS;
- mddev->new_chunk_sectors = n >> 9;
- err = mddev->pers->check_reshape(mddev);
- if (err) {
- mddev->new_chunk_sectors = mddev->chunk_sectors;
- return err;
+ err = -EBUSY;
+ else if (mddev->ro)
+ err = -EROFS;
+ else {
+ mddev->new_chunk_sectors = n >> 9;
+ err = mddev->pers->check_reshape(mddev);
+ if (err)
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
}
} else {
mddev->new_chunk_sectors = n >> 9;
if (mddev->reshape_position == MaxSector)
mddev->chunk_sectors = n >> 9;
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
@@ -3566,23 +3615,31 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
char *e;
unsigned long long n = simple_strtoull(buf, &e, 10);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
- return -EBUSY;
- if (cmd_match(buf, "none"))
+ err = -EBUSY;
+ else if (cmd_match(buf, "none"))
n = MaxSector;
else if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = -EINVAL;
- mddev->recovery_cp = n;
- if (mddev->pers)
- set_bit(MD_CHANGE_CLEAN, &mddev->flags);
- return len;
+ if (!err) {
+ mddev->recovery_cp = n;
+ if (mddev->pers)
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
-__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
+__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
+ resync_start_show, resync_start_store);
/*
* The array state can be:
@@ -3677,8 +3734,39 @@ static int restart_array(struct mddev *mddev);
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
- int err = -EINVAL;
+ int err;
enum array_state st = match_word(buf, array_states);
+
+ if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
+ /* don't take reconfig_mutex when toggling between
+ * clean and active
+ */
+ spin_lock(&mddev->lock);
+ if (st == active) {
+ restart_array(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else /* st == clean */ {
+ restart_array(mddev);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ }
+ spin_unlock(&mddev->lock);
+ return err;
+ }
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
switch(st) {
case bad_word:
break;
@@ -3722,7 +3810,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case clean:
if (mddev->pers) {
restart_array(mddev);
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
mddev->in_sync = 1;
@@ -3733,7 +3821,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = 0;
} else
err = -EBUSY;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
} else
err = -EINVAL;
break;
@@ -3754,17 +3842,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
/* these cannot be set */
break;
}
- if (err)
- return err;
- else {
+
+ if (!err) {
if (mddev->hold_active == UNTIL_IOCTL)
mddev->hold_active = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
- return len;
}
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_state =
-__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
+__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
@@ -3822,6 +3910,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
+ flush_workqueue(md_misc_wq);
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->persistent) {
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version);
@@ -3845,6 +3938,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
out:
if (err)
export_rdev(rdev);
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3856,7 +3950,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
char *end;
unsigned long chunk, end_chunk;
+ int err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (!mddev->bitmap)
goto out;
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
@@ -3874,6 +3972,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
}
bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
+ mddev_unlock(mddev);
return len;
}
@@ -3901,6 +4000,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err < 0)
return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (mddev->pers) {
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
@@ -3911,6 +4013,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
else
err = -ENOSPC;
}
+ mddev_unlock(mddev);
return err ? err : len;
}
@@ -3940,21 +4043,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
int major, minor;
char *e;
+ int err;
/* Changing the details of 'external' metadata is
* always permitted. Otherwise there must be
* no devices attached to the array.
*/
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
if (mddev->external && strncmp(buf, "external:", 9) == 0)
;
else if (!list_empty(&mddev->disks))
- return -EBUSY;
+ goto out_unlock;
+ err = 0;
if (cmd_match(buf, "none")) {
mddev->persistent = 0;
mddev->external = 0;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
if (strncmp(buf, "external:", 9) == 0) {
size_t namelen = len-9;
@@ -3968,45 +4078,51 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external = 1;
mddev->major_version = 0;
mddev->minor_version = 90;
- return len;
+ goto out_unlock;
}
major = simple_strtoul(buf, &e, 10);
+ err = -EINVAL;
if (e==buf || *e != '.')
- return -EINVAL;
+ goto out_unlock;
buf = e+1;
minor = simple_strtoul(buf, &e, 10);
if (e==buf || (*e && *e != '\n') )
- return -EINVAL;
+ goto out_unlock;
+ err = -ENOENT;
if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
- return -ENOENT;
+ goto out_unlock;
mddev->major_version = major;
mddev->minor_version = minor;
mddev->persistent = 1;
mddev->external = 0;
- return len;
+ err = 0;
+out_unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t
action_show(struct mddev *mddev, char *page)
{
char *type = "idle";
- if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ unsigned long recovery = mddev->recovery;
+ if (test_bit(MD_RECOVERY_FROZEN, &recovery))
type = "frozen";
- else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
type = "reshape";
- else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
- if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
type = "resync";
- else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ else if (test_bit(MD_RECOVERY_CHECK, &recovery))
type = "check";
else
type = "repair";
- } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
}
return sprintf(page, "%s\n", type);
@@ -4027,7 +4143,10 @@ action_store(struct mddev *mddev, const char *page, size_t len)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
+ if (mddev_lock(mddev) == 0) {
+ md_reap_sync_thread(mddev);
+ mddev_unlock(mddev);
+ }
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -4041,7 +4160,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
- err = mddev->pers->start_reshape(mddev);
+ err = mddev_lock(mddev);
+ if (!err) {
+ err = mddev->pers->start_reshape(mddev);
+ mddev_unlock(mddev);
+ }
if (err)
return err;
sysfs_notify(&mddev->kobj, NULL, "degraded");
@@ -4067,7 +4190,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
static struct md_sysfs_entry md_scan_mode =
-__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
@@ -4213,7 +4336,8 @@ sync_completed_show(struct mddev *mddev, char *page)
return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
-static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed =
+ __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
static ssize_t
min_sync_show(struct mddev *mddev, char *page)
@@ -4225,22 +4349,36 @@ static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long long min;
+ int err;
+ int chunk;
+
if (kstrtoull(buf, 10, &min))
return -EINVAL;
+
+ spin_lock(&mddev->lock);
+ err = -EINVAL;
if (min > mddev->resync_max)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = min;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_min = min;
+ err = 0;
- return len;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_min_sync =
@@ -4258,29 +4396,42 @@ max_sync_show(struct mddev *mddev, char *page)
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
+ int err;
+ spin_lock(&mddev->lock);
if (strncmp(buf, "max", 3) == 0)
mddev->resync_max = MaxSector;
else {
unsigned long long max;
+ int chunk;
+
+ err = -EINVAL;
if (kstrtoull(buf, 10, &max))
- return -EINVAL;
+ goto out_unlock;
if (max < mddev->resync_min)
- return -EINVAL;
+ goto out_unlock;
+
+ err = -EBUSY;
if (max < mddev->resync_max &&
mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- return -EBUSY;
+ goto out_unlock;
/* Must be a multiple of chunk_size */
- if (mddev->chunk_sectors) {
+ chunk = mddev->chunk_sectors;
+ if (chunk) {
sector_t temp = max;
- if (sector_div(temp, mddev->chunk_sectors))
- return -EINVAL;
+
+ err = -EINVAL;
+ if (sector_div(temp, chunk))
+ goto out_unlock;
}
mddev->resync_max = max;
}
wake_up(&mddev->recovery_wait);
- return len;
+ err = 0;
+out_unlock:
+ spin_unlock(&mddev->lock);
+ return err ?: len;
}
static struct md_sysfs_entry md_max_sync =
@@ -4297,14 +4448,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_lo;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_lo;
mddev->suspend_lo = new;
if (new >= old)
/* Shrinking suspended region */
@@ -4314,7 +4471,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4330,14 +4490,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old = mddev->suspend_hi;
+ unsigned long long old;
+ int err;
- if (mddev->pers == NULL ||
- mddev->pers->quiesce == NULL)
- return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EINVAL;
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
+ goto unlock;
+ old = mddev->suspend_hi;
mddev->suspend_hi = new;
if (new <= old)
/* Shrinking suspended region */
@@ -4347,7 +4513,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4367,11 +4536,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
char *e;
+ int err;
unsigned long long new = simple_strtoull(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
+
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ err = -EBUSY;
+ if (mddev->pers)
+ goto unlock;
mddev->reshape_position = new;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
@@ -4380,7 +4555,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
mddev->new_chunk_sectors = mddev->chunk_sectors;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
- return len;
+ err = 0;
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_position =
@@ -4398,6 +4576,8 @@ static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
int backwards = 0;
+ int err;
+
if (cmd_match(buf, "forwards"))
backwards = 0;
else if (cmd_match(buf, "backwards"))
@@ -4407,16 +4587,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->reshape_backwards == backwards)
return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
/* check if we are allowed to change */
if (mddev->delta_disks)
- return -EBUSY;
-
- if (mddev->persistent &&
+ err = -EBUSY;
+ else if (mddev->persistent &&
mddev->major_version == 0)
- return -EINVAL;
-
- mddev->reshape_backwards = backwards;
- return len;
+ err = -EINVAL;
+ else
+ mddev->reshape_backwards = backwards;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_reshape_direction =
@@ -4437,6 +4620,11 @@ static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
sector_t sectors;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
if (strncmp(buf, "default", 7) == 0) {
if (mddev->pers)
@@ -4447,19 +4635,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
mddev->external_size = 0;
} else {
if (strict_blocks_to_sectors(buf, &sectors) < 0)
- return -EINVAL;
- if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
- return -E2BIG;
-
- mddev->external_size = 1;
+ err = -EINVAL;
+ else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+ err = -E2BIG;
+ else
+ mddev->external_size = 1;
}
- mddev->array_sectors = sectors;
- if (mddev->pers) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!err) {
+ mddev->array_sectors = sectors;
+ if (mddev->pers) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
- return len;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry md_array_size =
@@ -4523,11 +4714,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->show(mddev, page);
- mddev_unlock(mddev);
- }
+ rv = entry->show(mddev, page);
mddev_put(mddev);
return rv;
}
@@ -4551,13 +4738,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
mddev_get(mddev);
spin_unlock(&all_mddevs_lock);
- if (entry->store == new_dev_store)
- flush_workqueue(md_misc_wq);
- rv = mddev_lock(mddev);
- if (!rv) {
- rv = entry->store(mddev, page, length);
- mddev_unlock(mddev);
- }
+ rv = entry->store(mddev, page, length);
mddev_put(mddev);
return rv;
}
@@ -4825,7 +5006,6 @@ int md_run(struct mddev *mddev)
mddev->clevel);
return -EINVAL;
}
- mddev->pers = pers;
spin_unlock(&pers_lock);
if (mddev->level != pers->level) {
mddev->level = pers->level;
@@ -4836,7 +5016,6 @@ int md_run(struct mddev *mddev)
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
- mddev->pers = NULL;
module_put(pers->owner);
return -EINVAL;
}
@@ -4880,35 +5059,38 @@ int md_run(struct mddev *mddev)
if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
- err = mddev->pers->run(mddev);
+ err = pers->run(mddev);
if (err)
printk(KERN_ERR "md: pers->run() failed ...\n");
- else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+ else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
WARN_ONCE(!mddev->external_size, "%s: default size too small,"
" but 'external_size' not in effect?\n", __func__);
printk(KERN_ERR
"md: invalid array_size %llu > default size %llu\n",
(unsigned long long)mddev->array_sectors / 2,
- (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+ (unsigned long long)pers->size(mddev, 0, 0) / 2);
err = -EINVAL;
- mddev->pers->stop(mddev);
}
- if (err == 0 && mddev->pers->sync_request &&
+ if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = bitmap_create(mddev);
- if (err) {
+ if (err)
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
- mddev->pers->stop(mddev);
- }
}
if (err) {
- module_put(mddev->pers->owner);
- mddev->pers = NULL;
+ mddev_detach(mddev);
+ pers->free(mddev, mddev->private);
+ module_put(pers->owner);
bitmap_destroy(mddev);
return err;
}
- if (mddev->pers->sync_request) {
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ mddev->queue->backing_dev_info.congested_fn = md_congested;
+ blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
+ }
+ if (pers->sync_request) {
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_redundancy_group))
printk(KERN_WARNING
@@ -4927,7 +5109,10 @@ int md_run(struct mddev *mddev)
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
+ spin_lock(&mddev->lock);
+ mddev->pers = pers;
mddev->ready = 1;
+ spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
@@ -5070,14 +5255,38 @@ void md_stop_writes(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(md_stop_writes);
+static void mddev_detach(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ if (mddev->pers && mddev->pers->quiesce) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ md_unregister_thread(&mddev->thread);
+ if (mddev->queue)
+ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+}
+
static void __md_stop(struct mddev *mddev)
{
+ struct md_personality *pers = mddev->pers;
+ mddev_detach(mddev);
+ spin_lock(&mddev->lock);
mddev->ready = 0;
- mddev->pers->stop(mddev);
- if (mddev->pers->sync_request && mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
- module_put(mddev->pers->owner);
mddev->pers = NULL;
+ spin_unlock(&mddev->lock);
+ pers->free(mddev, mddev->private);
+ if (pers->sync_request && mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+ module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
@@ -5226,8 +5435,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
bitmap_destroy(mddev);
if (mddev->bitmap_info.file) {
- fput(mddev->bitmap_info.file);
+ struct file *f = mddev->bitmap_info.file;
+ spin_lock(&mddev->lock);
mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
}
mddev->bitmap_info.offset = 0;
@@ -5436,37 +5648,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
- char *ptr, *buf = NULL;
- int err = -ENOMEM;
+ char *ptr;
+ int err;
file = kmalloc(sizeof(*file), GFP_NOIO);
-
if (!file)
- goto out;
+ return -ENOMEM;
+ err = 0;
+ spin_lock(&mddev->lock);
/* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap || !mddev->bitmap->storage.file) {
+ if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- goto copy_out;
- }
-
- buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
- if (!buf)
- goto out;
-
- ptr = d_path(&mddev->bitmap->storage.file->f_path,
- buf, sizeof(file->pathname));
- if (IS_ERR(ptr))
- goto out;
-
- strcpy(file->pathname, ptr);
+ else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ file->pathname, sizeof(file->pathname))),
+ IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ spin_unlock(&mddev->lock);
-copy_out:
- err = 0;
- if (copy_to_user(arg, file, sizeof(*file)))
+ if (err == 0 &&
+ copy_to_user(arg, file, sizeof(*file)))
err = -EFAULT;
-out:
- kfree(buf);
+
kfree(file);
return err;
}
@@ -5789,22 +5995,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (fd >= 0) {
struct inode *inode;
- if (mddev->bitmap)
+ struct file *f;
+
+ if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */
- mddev->bitmap_info.file = fget(fd);
+ f = fget(fd);
- if (mddev->bitmap_info.file == NULL) {
+ if (f == NULL) {
printk(KERN_ERR "%s: error: failed to get bitmap file\n",
mdname(mddev));
return -EBADF;
}
- inode = mddev->bitmap_info.file->f_mapping->host;
+ inode = f->f_mapping->host;
if (!S_ISREG(inode->i_mode)) {
printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
mdname(mddev));
err = -EBADF;
- } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
+ } else if (!(f->f_mode & FMODE_WRITE)) {
printk(KERN_ERR "%s: error: bitmap file must open for write\n",
mdname(mddev));
err = -EBADF;
@@ -5814,10 +6022,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = -EBUSY;
}
if (err) {
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ fput(f);
return err;
}
+ mddev->bitmap_info.file = f;
mddev->bitmap_info.offset = 0; /* file overrides offset */
} else if (mddev->bitmap == NULL)
return -ENOENT; /* cannot remove what isn't there */
@@ -5836,9 +6044,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
mddev->pers->quiesce(mddev, 0);
}
if (fd < 0) {
- if (mddev->bitmap_info.file)
- fput(mddev->bitmap_info.file);
- mddev->bitmap_info.file = NULL;
+ struct file *f = mddev->bitmap_info.file;
+ if (f) {
+ spin_lock(&mddev->lock);
+ mddev->bitmap_info.file = NULL;
+ spin_unlock(&mddev->lock);
+ fput(f);
+ }
}
return err;
@@ -6251,6 +6463,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, new_decode_dev(arg));
goto out;
+
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto out;
+
}
if (cmd == ADD_NEW_DISK)
@@ -6342,10 +6559,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* Commands even a read-only array can execute:
*/
switch (cmd) {
- case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, argp);
- goto unlock;
-
case RESTART_ARRAY_RW:
err = restart_array(mddev);
goto unlock;
@@ -6873,9 +7086,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (mddev_lock(mddev) < 0)
- return -EINTR;
-
+ spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
@@ -6888,7 +7099,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- rdev_for_each(rdev, mddev) {
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6904,6 +7116,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
+ rcu_read_unlock();
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
@@ -6946,7 +7159,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
- mddev_unlock(mddev);
+ spin_unlock(&mddev->lock);
return 0;
}
@@ -7102,7 +7315,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
if (mddev->safemode == 1)
mddev->safemode = 0;
if (mddev->in_sync) {
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7110,7 +7323,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
md_wakeup_thread(mddev->thread);
did_change = 1;
}
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
}
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -7148,7 +7361,7 @@ int md_allow_write(struct mddev *mddev)
if (!mddev->pers->sync_request)
return 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
@@ -7156,11 +7369,11 @@ int md_allow_write(struct mddev *mddev)
if (mddev->safemode_delay &&
mddev->safemode == 0)
mddev->safemode = 1;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
return -EAGAIN;
@@ -7513,6 +7726,7 @@ void md_do_sync(struct md_thread *thread)
skip:
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7521,6 +7735,8 @@ void md_do_sync(struct md_thread *thread)
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
mddev->curr_resync = 0;
+ spin_unlock(&mddev->lock);
+
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -7688,7 +7904,7 @@ void md_check_recovery(struct mddev *mddev)
if (!mddev->external) {
int did_change = 0;
- spin_lock_irq(&mddev->write_lock);
+ spin_lock(&mddev->lock);
if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) &&
!mddev->in_sync &&
@@ -7699,7 +7915,7 @@ void md_check_recovery(struct mddev *mddev)
}
if (mddev->safemode == 1)
mddev->safemode = 0;
- spin_unlock_irq(&mddev->write_lock);
+ spin_unlock(&mddev->lock);
if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
@@ -7721,7 +7937,9 @@ void md_check_recovery(struct mddev *mddev)
* any transients in the value of "sync_action".
*/
mddev->curr_resync_completed = 0;
+ spin_lock(&mddev->lock);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ spin_unlock(&mddev->lock);
/* Clear some bits that don't mean anything, but
* might be left set
*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03cec5bdcaae..318ca8fd430f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -386,7 +386,18 @@ struct mddev {
struct work_struct del_work; /* used for delayed sysfs removal */
- spinlock_t write_lock;
+ /* "lock" protects:
+ * flush_bio transition from NULL to !NULL
+ * rdev superblocks, events
+ * clearing MD_CHANGE_*
+ * in_sync - and related safemode and MD_CHANGE changes
+ * pers (also protected by reconfig_mutex and pending IO).
+ * clearing ->bitmap
+ * clearing ->bitmap_info.file
+ * changing ->resync_{min,max}
+ * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
+ */
+ spinlock_t lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */
@@ -439,13 +450,30 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
};
-static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+static inline int __must_check mddev_lock(struct mddev *mddev)
{
- int faulty = test_bit(Faulty, &rdev->flags);
- if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev *mddev)
+{
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_is_locked(struct mddev *mddev)
+{
+ return mutex_is_locked(&mddev->reconfig_mutex);
}
+static inline int mddev_trylock(struct mddev *mddev)
+{
+ return mutex_trylock(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
@@ -459,7 +487,7 @@ struct md_personality
struct module *owner;
void (*make_request)(struct mddev *mddev, struct bio *bio);
int (*run)(struct mddev *mddev);
- int (*stop)(struct mddev *mddev);
+ void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev);
/* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
@@ -490,6 +518,13 @@ struct md_personality
* array.
*/
void *(*takeover) (struct mddev *mddev);
+ /* congested implements bdi.congested_fn().
+ * Will not be called while array is 'suspended' */
+ int (*congested)(struct mddev *mddev, int bits);
+ /* mergeable_bvec is used to implement ->merge_bvec_fn */
+ int (*mergeable_bvec)(struct mddev *mddev,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec);
};
struct md_sysfs_entry {
@@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev)
return !!blk_check_plugged(md_unplug, mddev,
sizeof(struct blk_plug_cb));
}
+
+static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
+{
+ int faulty = test_bit(Faulty, &rdev->flags);
+ if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+}
+
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 399272f9c042..ac3ede2bd00e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "]");
}
-static int multipath_congested(void *data, int bits)
+static int multipath_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct mpconf *conf = mddev->private;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
rcu_read_lock();
for (i = 0; i < mddev->raid_disks ; i++) {
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev)
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run(),
- * should be freed in multipath_stop()]
+ * should be freed in multipath_free()]
*/
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
@@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev)
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
- mddev->queue->backing_dev_info.congested_fn = multipath_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
-
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -507,17 +500,13 @@ out:
return -EIO;
}
-static int multipath_stop (struct mddev *mddev)
+static void multipath_free(struct mddev *mddev, void *priv)
{
- struct mpconf *conf = mddev->private;
+ struct mpconf *conf = priv;
- md_unregister_thread(&mddev->thread);
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
mempool_destroy(conf->pool);
kfree(conf->multipaths);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static struct md_personality multipath_personality =
@@ -527,12 +516,13 @@ static struct md_personality multipath_personality =
.owner = THIS_MODULE,
.make_request = multipath_make_request,
.run = multipath_run,
- .stop = multipath_stop,
+ .free = multipath_free,
.status = multipath_status,
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
+ .congested = multipath_congested,
};
static int __init multipath_init (void)
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 0c2dec7aec20..78c74bb71ba4 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -8,7 +8,7 @@ config DM_PERSISTENT_DATA
device-mapper targets such as the thin provisioning target.
config DM_DEBUG_BLOCK_STACK_TRACING
- boolean "Keep stack trace of persistent data block lock holders"
+ bool "Keep stack trace of persistent data block lock holders"
depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
select STACKTRACE
---help---
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index cfbf9617e465..ebb280a14325 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -78,7 +78,9 @@ static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
if (r)
return r;
- return count > 1;
+ *result = count > 1;
+
+ return 0;
}
static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ba6b85de96d2..a13f738a7b39 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,17 +25,13 @@
#include "raid0.h"
#include "raid5.h"
-static int raid0_congested(void *data, int bits)
+static int raid0_congested(struct mddev *mddev, int bits)
{
- struct mddev *mddev = data;
struct r0conf *conf = mddev->private;
struct md_rdev **devlist = conf->devlist;
int raid_disks = conf->strip_zone[0].nb_dev;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
- return 1;
-
for (i = 0; i < raid_disks && !ret ; i++) {
struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
@@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
mdname(mddev),
(unsigned long long)smallest->sectors);
}
- mddev->queue->backing_dev_info.congested_fn = raid0_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/*
* now since we have the hard sector sizes, we can make sure
@@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
/**
* raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
*/
-static int raid0_mergeable_bvec(struct request_queue *q,
+static int raid0_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
sector_t sector_offset = sector;
@@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
return array_sectors;
}
-static int raid0_stop(struct mddev *mddev);
+static void raid0_free(struct mddev *mddev, void *priv);
static int raid0_run(struct mddev *mddev)
{
@@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
ret = md_integrity_register(mddev);
if (ret)
- raid0_stop(mddev);
+ raid0_free(mddev, conf);
return ret;
}
-static int raid0_stop(struct mddev *mddev)
+static void raid0_free(struct mddev *mddev, void *priv)
{
- struct r0conf *conf = mddev->private;
+ struct r0conf *conf = priv;
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
/*
@@ -724,11 +713,13 @@ static struct md_personality raid0_personality=
.owner = THIS_MODULE,
.make_request = raid0_make_request,
.run = raid0_run,
- .stop = raid0_stop,
+ .free = raid0_free,
.status = raid0_status,
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
+ .congested = raid0_congested,
+ .mergeable_bvec = raid0_mergeable_bvec,
};
static int __init raid0_init (void)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 40b35be34f8d..d34e238afa54 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -560,7 +560,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0) {
+ if (best_dist_disk < 0) {
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad < this_sector)
@@ -569,7 +569,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
best_good_sectors = first_bad - this_sector;
} else
best_good_sectors = sectors;
- best_disk = disk;
+ best_dist_disk = disk;
+ best_pending_disk = disk;
}
continue;
}
@@ -701,11 +702,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
-static int raid1_mergeable_bvec(struct request_queue *q,
+static int raid1_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r1conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max = biovec->bv_len;
@@ -734,7 +734,7 @@ static int raid1_mergeable_bvec(struct request_queue *q,
}
-int md_raid1_congested(struct mddev *mddev, int bits)
+static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
int i, ret = 0;
@@ -763,15 +763,6 @@ int md_raid1_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid1_congested);
-
-static int raid1_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid1_congested(mddev, bits);
-}
static void flush_pending_writes(struct r1conf *conf)
{
@@ -2206,7 +2197,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
if (rdev->badblocks.shift < 0)
return 0;
- block_sectors = 1 << rdev->badblocks.shift;
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ bdev_logical_block_size(rdev->bdev) >> 9);
sector = r1_bio->sector;
sectors = ((sector + block_sectors)
& ~(sector_t)(block_sectors - 1))
@@ -2882,7 +2874,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
-static int stop(struct mddev *mddev);
+static void raid1_free(struct mddev *mddev, void *priv);
static int run(struct mddev *mddev)
{
struct r1conf *conf;
@@ -2904,7 +2896,7 @@ static int run(struct mddev *mddev)
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
- * should be freed in stop()]
+ * should be freed in raid1_free()]
*/
if (mddev->private == NULL)
conf = setup_conf(mddev);
@@ -2955,10 +2947,6 @@ static int run(struct mddev *mddev)
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
if (mddev->queue) {
- mddev->queue->backing_dev_info.congested_fn = raid1_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
- blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
-
if (discard_supported)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
mddev->queue);
@@ -2968,37 +2956,23 @@ static int run(struct mddev *mddev)
}
ret = md_integrity_register(mddev);
- if (ret)
- stop(mddev);
+ if (ret) {
+ md_unregister_thread(&mddev->thread);
+ raid1_free(mddev, conf);
+ }
return ret;
}
-static int stop(struct mddev *mddev)
+static void raid1_free(struct mddev *mddev, void *priv)
{
- struct r1conf *conf = mddev->private;
- struct bitmap *bitmap = mddev->bitmap;
+ struct r1conf *conf = priv;
- /* wait for behind writes to complete */
- if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
- mdname(mddev));
- /* need to kick something here to make sure I/O goes? */
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
- }
-
- freeze_array(conf, 0);
- unfreeze_array(conf);
-
- md_unregister_thread(&mddev->thread);
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static int raid1_resize(struct mddev *mddev, sector_t sectors)
@@ -3181,7 +3155,7 @@ static struct md_personality raid1_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid1_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid1_add_disk,
@@ -3193,6 +3167,8 @@ static struct md_personality raid1_personality =
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
+ .congested = raid1_congested,
+ .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 33bda55ef9f7..14ebb288c1ef 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -170,7 +170,4 @@ struct r1bio {
*/
#define R1BIO_MadeGood 7
#define R1BIO_WriteError 8
-
-extern int md_raid1_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 32e282f4c83c..a7196c49d15d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
/**
* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @q: request queue
+ * @mddev: the md device
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
*
@@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* This requires checking for end-of-chunk if near_copies != raid_disks,
* and for subordinate merge_bvec_fns if merge_check_needed.
*/
-static int raid10_mergeable_bvec(struct request_queue *q,
+static int raid10_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
@@ -910,7 +909,7 @@ retry:
return rdev;
}
-int md_raid10_congested(struct mddev *mddev, int bits)
+static int raid10_congested(struct mddev *mddev, int bits)
{
struct r10conf *conf = mddev->private;
int i, ret = 0;
@@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits)
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(md_raid10_congested);
-
-static int raid10_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid10_congested(mddev, bits);
-}
static void flush_pending_writes(struct r10conf *conf)
{
@@ -2582,7 +2572,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
if (rdev->badblocks.shift < 0)
return 0;
- block_sectors = 1 << rdev->badblocks.shift;
+ block_sectors = roundup(1 << rdev->badblocks.shift,
+ bdev_logical_block_size(rdev->bdev) >> 9);
sector = r10_bio->sector;
sectors = ((r10_bio->sector + block_sectors)
& ~(sector_t)(block_sectors - 1))
@@ -3757,8 +3748,6 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- mddev->queue->backing_dev_info.congested_fn = raid10_congested;
- mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
@@ -3767,7 +3756,6 @@ static int run(struct mddev *mddev)
stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
}
if (md_integrity_register(mddev))
@@ -3811,17 +3799,9 @@ out:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid10_free(struct mddev *mddev, void *priv)
{
- struct r10conf *conf = mddev->private;
-
- raise_barrier(conf, 0);
- lower_barrier(conf);
-
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- /* the unplug fn references 'conf'*/
- blk_sync_queue(mddev->queue);
+ struct r10conf *conf = priv;
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
@@ -3830,8 +3810,6 @@ static int stop(struct mddev *mddev)
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
kfree(conf);
- mddev->private = NULL;
- return 0;
}
static void raid10_quiesce(struct mddev *mddev, int state)
@@ -3895,7 +3873,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
return 0;
}
-static void *raid10_takeover_raid0(struct mddev *mddev)
+static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
{
struct md_rdev *rdev;
struct r10conf *conf;
@@ -3905,6 +3883,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
+ sector_div(size, devs);
/* Set new parameters */
mddev->new_level = 10;
@@ -3915,12 +3894,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
mddev->raid_disks *= 2;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
+ mddev->dev_sectors = size;
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
rdev_for_each(rdev, mddev)
- if (rdev->raid_disk >= 0)
+ if (rdev->raid_disk >= 0) {
rdev->new_raid_disk = rdev->raid_disk * 2;
+ rdev->sectors = size;
+ }
conf->barrier = 1;
}
@@ -3943,7 +3925,9 @@ static void *raid10_takeover(struct mddev *mddev)
mdname(mddev));
return ERR_PTR(-EINVAL);
}
- return raid10_takeover_raid0(mddev);
+ return raid10_takeover_raid0(mddev,
+ raid0_conf->strip_zone->zone_end,
+ raid0_conf->strip_zone->nb_dev);
}
return ERR_PTR(-EINVAL);
}
@@ -4713,7 +4697,7 @@ static struct md_personality raid10_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid10_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid10_add_disk,
@@ -4727,6 +4711,8 @@ static struct md_personality raid10_personality =
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .congested = raid10_congested,
+ .mergeable_bvec = raid10_mergeable_bvec,
};
static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 157d69e83ff4..5ee6473ddc2c 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -150,7 +150,4 @@ enum r10bio_state {
*/
R10BIO_Previous,
};
-
-extern int md_raid10_congested(struct mddev *mddev, int bits);
-
#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f6f77f..cd2f96b2c572 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- if (atomic_read(&conf->preread_active_stripes)
- < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill to continue
*/
-static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
- int disk_idx, int disks)
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
+ int i;
+
+
+ if (test_bit(R5_LOCKED, &dev->flags) ||
+ test_bit(R5_UPTODATE, &dev->flags))
+ /* No point reading this as we already have it or have
+ * decided to get it.
+ */
+ return 0;
+
+ if (dev->toread ||
+ (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+ /* We need this block to directly satisfy a request */
+ return 1;
+
+ if (s->syncing || s->expanding ||
+ (s->replacing && want_replace(sh, disk_idx)))
+ /* When syncing, or expanding we read everything.
+ * When replacing, we need the replaced block.
+ */
+ return 1;
+
+ if ((s->failed >= 1 && fdev[0]->toread) ||
+ (s->failed >= 2 && fdev[1]->toread))
+ /* If we want to read from a failed device, then
+ * we need to actually read every other device.
+ */
+ return 1;
+
+ /* Sometimes neither read-modify-write nor reconstruct-write
+ * cycles can work. In those cases we read every block we
+ * can. Then the parity-update is certain to have enough to
+ * work with.
+ * This can only be a problem when we need to write something,
+ * and some device has failed. If either of those tests
+ * fails we need look no further.
+ */
+ if (!s->failed || !s->to_write)
+ return 0;
+
+ if (test_bit(R5_Insync, &dev->flags) &&
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ /* Pre-reads are not permitted until after a short delay
+ * to gather multiple requests. However if this
+ * device is not Insync, the block could only be computed
+ * and there is no need to delay that.
+ */
+ return 0;
+
+ for (i = 0; i < s->failed; i++) {
+ if (fdev[i]->towrite &&
+ !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ /* If we have a partial write to a failed
+ * device, then we will need to reconstruct
+ * the content of that device, so all other
+ * devices must be read.
+ */
+ return 1;
+ }
+
+ /* If we are forced to do a reconstruct-write, either because
+ * the current RAID6 implementation only supports that, or
+ * because parity cannot be trusted and we are currently
+ * recovering it, there is extra need to be careful.
+ * If one of the devices that we would need to read, because
+ * it is not being overwritten (and maybe not written at all)
+ * is missing/faulty, then we need to read everything we can.
+ */
+ if (sh->raid_conf->level != 6 &&
+ sh->sector < sh->raid_conf->mddev->recovery_cp)
+ /* reconstruct-write isn't being forced */
+ return 0;
+ for (i = 0; i < s->failed; i++) {
+ if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+ !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+ int disk_idx, int disks)
+{
+ struct r5dev *dev = &sh->dev[disk_idx];
/* is the data in this block needed, and can we get it? */
- if (!test_bit(R5_LOCKED, &dev->flags) &&
- !test_bit(R5_UPTODATE, &dev->flags) &&
- (dev->toread ||
- (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
- s->syncing || s->expanding ||
- (s->replacing && want_replace(sh, disk_idx)) ||
- (s->failed >= 1 && fdev[0]->toread) ||
- (s->failed >= 2 && fdev[1]->toread) ||
- (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
- !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- ((sh->raid_conf->level == 6 ||
- sh->sector >= sh->raid_conf->mddev->recovery_cp)
- && s->failed && s->to_write &&
- (s->to_write - s->non_overwrite <
- sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
- (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+ if (need_this_block(sh, s, disk_idx, disks)) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -3102,7 +3170,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
* generate correct data from the parity.
*/
if (conf->max_degraded == 2 ||
- (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
+ (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+ s->failed == 0)) {
/* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
@@ -4081,7 +4150,7 @@ static void activate_bit_delay(struct r5conf *conf,
}
}
-int md_raid5_congested(struct mddev *mddev, int bits)
+static int raid5_congested(struct mddev *mddev, int bits)
{
struct r5conf *conf = mddev->private;
@@ -4098,24 +4167,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-EXPORT_SYMBOL_GPL(md_raid5_congested);
-
-static int raid5_congested(void *data, int bits)
-{
- struct mddev *mddev = data;
-
- return mddev_congested(mddev, bits) ||
- md_raid5_congested(mddev, bits);
-}
/* We want read requests to align with chunks where possible,
* but write requests don't need to.
*/
-static int raid5_mergeable_bvec(struct request_queue *q,
+static int raid5_mergeable_bvec(struct mddev *mddev,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
- struct mddev *mddev = q->queuedata;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -5062,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
schedule_timeout_uninterruptible(1);
}
/* Need to check if array will still be degraded after recovery/resync
- * We don't need to check the 'failed' flag as when that gets set,
- * recovery aborts.
+ * Note in case of > 1 drive failures it's possible we're rebuilding
+ * one drive while leaving another faulty drive in array.
*/
- for (i = 0; i < conf->raid_disks; i++)
- if (conf->disks[i].rdev == NULL)
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
+ }
+ rcu_read_unlock();
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
@@ -5296,11 +5360,14 @@ static void raid5d(struct md_thread *thread)
static ssize_t
raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->max_nr_stripes);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+ spin_unlock(&mddev->lock);
+ return ret;
}
int
@@ -5339,21 +5406,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
static ssize_t
raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- err = raid5_set_cache_size(mddev, new);
+ err = mddev_lock(mddev);
if (err)
return err;
- return len;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else
+ err = raid5_set_cache_size(mddev, new);
+ mddev_unlock(mddev);
+
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5364,29 +5435,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->bypass_threshold);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->bypass_threshold);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new > conf->max_nr_stripes)
- return -EINVAL;
- conf->bypass_threshold = new;
- return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new > conf->max_nr_stripes)
+ err = -EINVAL;
+ else
+ conf->bypass_threshold = new;
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5398,39 +5480,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
static ssize_t
raid5_show_skip_copy(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->skip_copy);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->skip_copy);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static ssize_t
raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
new = !!new;
- if (new == conf->skip_copy)
- return len;
- mddev_suspend(mddev);
- conf->skip_copy = new;
- if (new)
- mddev->queue->backing_dev_info.capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- mddev->queue->backing_dev_info.capabilities &=
- ~BDI_CAP_STABLE_WRITES;
- mddev_resume(mddev);
- return len;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->skip_copy) {
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ }
+ mddev_unlock(mddev);
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -5454,11 +5545,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
static ssize_t
raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
if (conf)
- return sprintf(page, "%d\n", conf->worker_cnt_per_group);
- else
- return 0;
+ ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+ spin_unlock(&mddev->lock);
+ return ret;
}
static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5562,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf;
unsigned long new;
int err;
struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5570,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (len >= PAGE_SIZE)
return -EINVAL;
- if (!conf)
- return -ENODEV;
-
if (kstrtoul(page, 10, &new))
return -EINVAL;
- if (new == conf->worker_cnt_per_group)
- return len;
-
- mddev_suspend(mddev);
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf)
+ err = -ENODEV;
+ else if (new != conf->worker_cnt_per_group) {
+ mddev_suspend(mddev);
- old_groups = conf->worker_groups;
- if (old_groups)
- flush_workqueue(raid5_wq);
+ old_groups = conf->worker_groups;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
- if (!err) {
- spin_lock_irq(&conf->device_lock);
- conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
- conf->worker_groups = new_groups;
- spin_unlock_irq(&conf->device_lock);
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
- if (old_groups)
- kfree(old_groups[0].workers);
- kfree(old_groups);
+ if (old_groups)
+ kfree(old_groups[0].workers);
+ kfree(old_groups);
+ }
+ mddev_resume(mddev);
}
+ mddev_unlock(mddev);
- mddev_resume(mddev);
-
- if (err)
- return err;
- return len;
+ return err ?: len;
}
static struct md_sysfs_entry
@@ -6178,11 +6272,6 @@ static int run(struct mddev *mddev)
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
- blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-
- mddev->queue->backing_dev_info.congested_data = mddev;
- mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6349,12 @@ abort:
return -EIO;
}
-static int stop(struct mddev *mddev)
+static void raid5_free(struct mddev *mddev, void *priv)
{
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf = priv;
- md_unregister_thread(&mddev->thread);
- if (mddev->queue)
- mddev->queue->backing_dev_info.congested_fn = NULL;
free_conf(conf);
- mddev->private = NULL;
mddev->to_remove = &raid5_attrs_group;
- return 0;
}
static void status(struct seq_file *seq, struct mddev *mddev)
@@ -7044,7 +7128,7 @@ static struct md_personality raid6_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7058,6 +7142,8 @@ static struct md_personality raid6_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
@@ -7066,7 +7152,7 @@ static struct md_personality raid5_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7080,6 +7166,8 @@ static struct md_personality raid5_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
@@ -7089,7 +7177,7 @@ static struct md_personality raid4_personality =
.owner = THIS_MODULE,
.make_request = make_request,
.run = run,
- .stop = stop,
+ .free = raid5_free,
.status = status,
.error_handler = error,
.hot_add_disk = raid5_add_disk,
@@ -7103,6 +7191,8 @@ static struct md_personality raid4_personality =
.finish_reshape = raid5_finish_reshape,
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
+ .congested = raid5_congested,
+ .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d59f5ca743cd..983e18a83db1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout)
return layout >= 8 && layout <= 10;
}
-extern int md_raid5_congested(struct mddev *mddev, int bits);
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
#endif