summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig22
-rw-r--r--drivers/md/bitmap.c37
-rw-r--r--drivers/md/dm-bio-list.h14
-rw-r--r--drivers/md/dm-crypt.c590
-rw-r--r--drivers/md/dm-emc.c13
-rw-r--r--drivers/md/dm-exception-store.c176
-rw-r--r--drivers/md/dm-hw-handler.h2
-rw-r--r--drivers/md/dm-io.c15
-rw-r--r--drivers/md/dm-ioctl.c25
-rw-r--r--drivers/md/dm-linear.c21
-rw-r--r--drivers/md/dm-log.c24
-rw-r--r--drivers/md/dm-log.h10
-rw-r--r--drivers/md/dm-mpath.c153
-rw-r--r--drivers/md/dm-mpath.h4
-rw-r--r--drivers/md/dm-path-selector.h12
-rw-r--r--drivers/md/dm-raid1.c51
-rw-r--r--drivers/md/dm-round-robin.c14
-rw-r--r--drivers/md/dm-snap.c387
-rw-r--r--drivers/md/dm-snap.h17
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm-table.c109
-rw-r--r--drivers/md/dm-zero.c2
-rw-r--r--drivers/md/dm.c251
-rw-r--r--drivers/md/dm.h26
-rw-r--r--drivers/md/kcopyd.c6
-rw-r--r--drivers/md/linear.c15
-rw-r--r--drivers/md/md.c335
-rw-r--r--drivers/md/multipath.c31
-rw-r--r--drivers/md/raid0.c17
-rw-r--r--drivers/md/raid1.c248
-rw-r--r--drivers/md/raid10.c263
-rw-r--r--drivers/md/raid5.c433
32 files changed, 2119 insertions, 1206 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed03eed..4540ade6b6b5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -2,6 +2,8 @@
# Block device driver configuration
#
+if BLOCK
+
menu "Multi-device support (RAID and LVM)"
config MD
@@ -136,16 +138,16 @@ config MD_RAID456
If unsure, say Y.
config MD_RAID5_RESHAPE
- bool "Support adding drives to a raid-5 array (experimental)"
- depends on MD_RAID456 && EXPERIMENTAL
+ bool "Support adding drives to a raid-5 array"
+ depends on MD_RAID456
+ default y
---help---
A RAID-5 set can be expanded by adding extra drives. This
requires "restriping" the array which means (almost) every
block must be written to a different place.
This option allows such restriping to be done while the array
- is online. However it is still EXPERIMENTAL code. It should
- work, but please be sure that you have backups.
+ is online.
You will need mdadm version 2.4.1 or later to use this
feature safely. During the early stage of reshape there is
@@ -162,6 +164,8 @@ config MD_RAID5_RESHAPE
There should be enough spares already present to make the new
array workable.
+ If unsure, say Y.
+
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
@@ -199,10 +203,19 @@ config BLK_DEV_DM
If unsure, say N.
+config DM_DEBUG
+ boolean "Device mapper debugging support"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ Enable this for messages that may help debug device-mapper problems.
+
+ If unsure, say N.
+
config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM && EXPERIMENTAL
select CRYPTO
+ select CRYPTO_CBC
---help---
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
@@ -251,3 +264,4 @@ config DM_MULTIPATH_EMC
endmenu
+endif
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ecc56765d949..5432d07c074d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,8 +212,8 @@ char *file_path(struct file *file, char *buf, int count)
if (!buf)
return NULL;
- d = file->f_dentry;
- v = file->f_vfsmnt;
+ d = file->f_path.dentry;
+ v = file->f_path.mnt;
buf = d_path(d, v, buf, count);
@@ -349,7 +349,7 @@ static struct page *read_page(struct file *file, unsigned long index,
unsigned long count)
{
struct page *page = NULL;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct buffer_head *bh;
sector_t block;
@@ -536,7 +536,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
"-- forcing full recovery\n", bmname(bitmap), events,
(unsigned long long) bitmap->mddev->events);
- sb->state |= BITMAP_STALE;
+ sb->state |= cpu_to_le32(BITMAP_STALE);
}
success:
/* assign fields using values from superblock */
@@ -544,11 +544,11 @@ success:
bitmap->daemon_sleep = daemon_sleep;
bitmap->daemon_lastrun = jiffies;
bitmap->max_write_behind = write_behind;
- bitmap->flags |= sb->state;
+ bitmap->flags |= le32_to_cpu(sb->state);
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
bitmap->flags |= BITMAP_HOSTENDIAN;
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
- if (sb->state & BITMAP_STALE)
+ if (sb->state & cpu_to_le32(BITMAP_STALE))
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
@@ -578,9 +578,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
switch (op) {
- case MASK_SET: sb->state |= bits;
+ case MASK_SET: sb->state |= cpu_to_le32(bits);
break;
- case MASK_UNSET: sb->state &= ~bits;
+ case MASK_UNSET: sb->state &= cpu_to_le32(~bits);
break;
default: BUG();
}
@@ -613,6 +613,7 @@ static inline unsigned long file_page_offset(unsigned long chunk)
static inline struct page *filemap_get_page(struct bitmap *bitmap,
unsigned long chunk)
{
+ if (file_page_index(chunk) >= bitmap->file_pages) return NULL;
return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
}
@@ -661,7 +662,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
bitmap_file_unmap(bitmap);
if (file) {
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
invalidate_inode_pages(inode->i_mapping);
fput(file);
}
@@ -739,6 +740,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
}
page = filemap_get_page(bitmap, chunk);
+ if (!page) return;
bit = file_page_offset(chunk);
/* set the bit */
@@ -1322,6 +1324,18 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
}
+/* dirty the memory and file bits for bitmap chunks "s" to "e" (inclusive) */
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
+{
+	unsigned long chunk;
+
+	for (chunk = s; chunk <= e; chunk++) {
+		sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); /* shift as sector_t, which may be wider than unsigned long */
+		bitmap_set_memory_bits(bitmap, sec, 1);
+		bitmap_file_set_bit(bitmap, sec);
+	}
+}
+
/*
* flush out any pending updates
*/
@@ -1399,7 +1413,7 @@ int bitmap_create(mddev_t *mddev)
int err;
sector_t start;
- BUG_ON(sizeof(bitmap_super_t) != 256);
+ BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
return 0;
@@ -1430,8 +1444,7 @@ int bitmap_create(mddev_t *mddev)
if (err)
goto error;
- bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
- sizeof(bitmap->chunksize));
+ bitmap->chunkshift = ffz(~bitmap->chunksize);
/* now that chunksize and chunkshift are set, we can use these macros */
chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index bbf4615f0e30..da4349649f7f 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -44,6 +44,20 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
bl->tail = bl2->tail;
}
+/* Prepend bl2's bios to bl; a no-op when bl2 is empty. bl2's head/tail are not reset. */
+static inline void bio_list_merge_head(struct bio_list *bl,
+				       struct bio_list *bl2)
+{
+	if (bl2->head) {
+		if (!bl->head)
+			bl->tail = bl2->tail;	/* bl was empty: adopt bl2's tail */
+		else
+			bl2->tail->bi_next = bl->head;	/* chain old bl after bl2 */
+
+		bl->head = bl2->head;
+	}
+}
+
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
struct bio *bio = bl->head;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bdbd34993a80..4c2471ee054a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -15,24 +16,28 @@
#include <linux/slab.h>
#include <linux/crypto.h>
#include <linux/workqueue.h>
+#include <linux/backing-dev.h>
#include <asm/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
+#include <asm/unaligned.h>
#include "dm.h"
#define DM_MSG_PREFIX "crypt"
+#define MESG_STR(x) x, sizeof(x)
/*
* per bio private data
*/
struct crypt_io {
struct dm_target *target;
- struct bio *bio;
+ struct bio *base_bio;
struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
+ int post_process;
};
/*
@@ -63,6 +68,7 @@ struct crypt_iv_operations {
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
*/
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
struct crypt_config {
struct dm_dev *dev;
sector_t start;
@@ -73,28 +79,33 @@ struct crypt_config {
*/
mempool_t *io_pool;
mempool_t *page_pool;
+ struct bio_set *bs;
/*
* crypto related data
*/
struct crypt_iv_operations *iv_gen_ops;
char *iv_mode;
- struct crypto_cipher *iv_gen_private;
+ union {
+ struct crypto_cipher *essiv_tfm;
+ int benbi_shift;
+ } iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
char cipher[CRYPTO_MAX_ALG_NAME];
char chainmode[CRYPTO_MAX_ALG_NAME];
struct crypto_blkcipher *tfm;
+ unsigned long flags;
unsigned int key_size;
u8 key[0];
};
-#define MIN_IOS 256
+#define MIN_IOS 16
#define MIN_POOL_PAGES 32
#define MIN_BIO_PAGES 8
-static kmem_cache_t *_crypt_io_pool;
+static struct kmem_cache *_crypt_io_pool;
/*
* Different IV generation algorithms:
@@ -106,6 +117,9 @@ static kmem_cache_t *_crypt_io_pool;
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
*
+ * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
+ * (needed for LRW-32-AES and possibly other narrow block modes)
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -184,21 +198,61 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
}
kfree(salt);
- cc->iv_gen_private = essiv_tfm;
+ cc->iv_gen_private.essiv_tfm = essiv_tfm;
return 0;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
- crypto_free_cipher(cc->iv_gen_private);
- cc->iv_gen_private = NULL;
+ crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
+ cc->iv_gen_private.essiv_tfm = NULL;
}
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
- crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv);
+ crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
+ return 0;
+}
+
+static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
+			      const char *opts)
+{
+	unsigned int blocksize = crypto_blkcipher_blocksize(cc->tfm);
+	int order = ilog2(blocksize);
+
+	/* benbi numbers cipher blocks, not sectors: work out the shift that
+	 * crypt_iv_benbi_gen() applies to turn a sector count into a cipher
+	 * block count; only power-of-2 block sizes up to 512 are accepted */
+	if (1 << order != blocksize) {
+		ti->error = "cypher blocksize is not a power of 2";
+		return -EINVAL;
+	}
+
+	if (order > 9) {
+		ti->error = "cypher blocksize is > 512";
+		return -EINVAL;
+	}
+
+	cc->iv_gen_private.benbi_shift = 9 - order;
+
+	return 0;
+}
+
+static void crypt_iv_benbi_dtr(struct crypt_config *cc)
+{ /* nothing to undo: crypt_iv_benbi_ctr() only stores an integer shift */
+}
+
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+ __be64 val;
+
+ memset(iv, 0, cc->iv_size - sizeof(u64)); /* last 8 bytes are written below; NOTE(review): assumes iv_size >= sizeof(u64) - confirm */
+ /* IV = zero padding followed by the 1-based, shifted sector number as a big-endian u64 */
+ val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+ put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
+
return 0;
}
@@ -212,13 +266,18 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = {
.generator = crypt_iv_essiv_gen
};
+static struct crypt_iv_operations crypt_iv_benbi_ops = { /* chosen by crypt_ctr() for ivmode "benbi" */
+ .ctr = crypt_iv_benbi_ctr, /* validates the cipher block size, records the shift */
+ .dtr = crypt_iv_benbi_dtr, /* no-op */
+ .generator = crypt_iv_benbi_gen /* builds the IV for a sector */
+};
static int
crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
struct scatterlist *in, unsigned int length,
int write, sector_t sector)
{
- u8 iv[cc->iv_size];
+ u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
struct blkcipher_desc desc = {
.tfm = cc->tfm,
.info = iv,
@@ -306,6 +365,14 @@ static int crypt_convert(struct crypt_config *cc,
return r;
}
+/* bi_destructor for dm-crypt clones: give the bio back to this target's bio_set */
+static void dm_crypt_bio_destructor(struct bio *bio)
+{
+	struct crypt_config *cc = ((struct crypt_io *)bio->bi_private)->target->private;
+
+	bio_free(bio, cc->bs);
+}
+
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
@@ -315,34 +382,33 @@ static struct bio *
crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
struct bio *base_bio, unsigned int *bio_vec_idx)
{
- struct bio *bio;
+ struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;
- /*
- * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and
- * to fail earlier. This is not necessary but increases throughput.
- * FIXME: Is this really intelligent?
- */
- if (base_bio)
- bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC);
- else
- bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs);
- if (!bio)
+ if (base_bio) {
+ clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
+ __bio_clone(clone, base_bio);
+ } else
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
+
+ if (!clone)
return NULL;
+ clone->bi_destructor = dm_crypt_bio_destructor;
+
/* if the last bio was not complete, continue where that one ended */
- bio->bi_idx = *bio_vec_idx;
- bio->bi_vcnt = *bio_vec_idx;
- bio->bi_size = 0;
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+ clone->bi_idx = *bio_vec_idx;
+ clone->bi_vcnt = *bio_vec_idx;
+ clone->bi_size = 0;
+ clone->bi_flags &= ~(1 << BIO_SEG_VALID);
- /* bio->bi_idx pages have already been allocated */
- size -= bio->bi_idx * PAGE_SIZE;
+ /* clone->bi_idx pages have already been allocated */
+ size -= clone->bi_idx * PAGE_SIZE;
- for(i = bio->bi_idx; i < nr_iovecs; i++) {
- struct bio_vec *bv = bio_iovec_idx(bio, i);
+ for (i = clone->bi_idx; i < nr_iovecs; i++) {
+ struct bio_vec *bv = bio_iovec_idx(clone, i);
bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
if (!bv->bv_page)
@@ -353,7 +419,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
* return a partially allocated bio, the caller will then try
* to allocate additional bios while submitting this partial bio
*/
- if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1))
+ if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
bv->bv_offset = 0;
@@ -362,13 +428,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
else
bv->bv_len = size;
- bio->bi_size += bv->bv_len;
- bio->bi_vcnt++;
+ clone->bi_size += bv->bv_len;
+ clone->bi_vcnt++;
size -= bv->bv_len;
}
- if (!bio->bi_size) {
- bio_put(bio);
+ if (!clone->bi_size) {
+ bio_put(clone);
return NULL;
}
@@ -376,13 +442,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
* Remember the last bio_vec allocated to be able
* to correctly continue after the splitting.
*/
- *bio_vec_idx = bio->bi_vcnt;
+ *bio_vec_idx = clone->bi_vcnt;
- return bio;
+ return clone;
}
static void crypt_free_buffer_pages(struct crypt_config *cc,
- struct bio *bio, unsigned int bytes)
+ struct bio *clone, unsigned int bytes)
{
unsigned int i, start, end;
struct bio_vec *bv;
@@ -396,19 +462,19 @@ static void crypt_free_buffer_pages(struct crypt_config *cc,
* A fix to the bi_idx issue in the kernel is in the works, so
* we will hopefully be able to revert to the cleaner solution soon.
*/
- i = bio->bi_vcnt - 1;
- bv = bio_iovec_idx(bio, i);
- end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size;
+ i = clone->bi_vcnt - 1;
+ bv = bio_iovec_idx(clone, i);
+ end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size;
start = end - bytes;
start >>= PAGE_SHIFT;
- if (!bio->bi_size)
- end = bio->bi_vcnt;
+ if (!clone->bi_size)
+ end = clone->bi_vcnt;
else
end >>= PAGE_SHIFT;
- for(i = start; i < end; i++) {
- bv = bio_iovec_idx(bio, i);
+ for (i = start; i < end; i++) {
+ bv = bio_iovec_idx(clone, i);
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
bv->bv_page = NULL;
@@ -432,7 +498,7 @@ static void dec_pending(struct crypt_io *io, int error)
if (io->first_clone)
bio_put(io->first_clone);
- bio_endio(io->bio, io->bio->bi_size, io->error);
+ bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
mempool_free(io, cc->io_pool);
}
@@ -441,29 +507,179 @@ static void dec_pending(struct crypt_io *io, int error)
* kcryptd:
*
* Needed because it would be very unwise to do decryption in an
- * interrupt context, so bios returning from read requests get
- * queued here.
+ * interrupt context.
*/
static struct workqueue_struct *_kcryptd_workqueue;
+static void kcryptd_do_work(struct work_struct *work);
-static void kcryptd_do_work(void *data)
+static void kcryptd_queue_io(struct crypt_io *io)
{
- struct crypt_io *io = (struct crypt_io *) data;
- struct crypt_config *cc = (struct crypt_config *) io->target->private;
+ INIT_WORK(&io->work, kcryptd_do_work);
+ queue_work(_kcryptd_workqueue, &io->work);
+}
+
+static int crypt_endio(struct bio *clone, unsigned int done, int error)
+{
+ struct crypt_io *io = clone->bi_private;
+ struct crypt_config *cc = io->target->private;
+ unsigned read_io = bio_data_dir(clone) == READ;
+ /* bi_end_io handler that clone_init() installs on every clone */
+ /*
+ * free the processed pages, even if
+ * it's only a partially completed write
+ */
+ if (!read_io)
+ crypt_free_buffer_pages(cc, clone, done);
+
+ /* keep going - not finished yet */
+ if (unlikely(clone->bi_size))
+ return 1;
+
+ if (!read_io) /* finished write clone: nothing to decrypt, just release it */
+ goto out;
+
+ if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) {
+ error = -EIO; /* failed read: report -EIO and skip decryption */
+ goto out;
+ }
+ /* successful read: hand off to the kcryptd workqueue, where kcryptd_do_work() sees post_process and decrypts */
+ bio_put(clone);
+ io->post_process = 1;
+ kcryptd_queue_io(io);
+ return 0;
+
+out:
+ bio_put(clone);
+ dec_pending(io, error);
+ return error;
+}
+
+/* Aim a clone at the backing device with the original bio's bi_rw flags
+ * and route its completion, with io as private data, to crypt_endio() */
+static void clone_init(struct crypt_io *io, struct bio *clone)
+{
+	clone->bi_bdev = ((struct crypt_config *)io->target->private)->dev->bdev;
+	clone->bi_rw = io->base_bio->bi_rw;
+	clone->bi_end_io = crypt_endio;
+	clone->bi_private = io;
+}
+
+static void process_read(struct crypt_io *io)
+{
+ struct crypt_config *cc = io->target->private;
+ struct bio *base_bio = io->base_bio;
+ struct bio *clone;
+ sector_t sector = base_bio->bi_sector - io->target->begin; /* offset within this target */
+ /* Read path, called from kcryptd_do_work(): submit a clone that reads into the original bio's pages */
+ atomic_inc(&io->pending); /* dropped by dec_pending(), here on failure or later on completion */
+
+ /*
+ * The block layer might modify the bvec array, so always
+ * copy the required bvecs because we need the original
+ * one in order to decrypt the whole bio data *afterwards*.
+ */
+ clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
+ if (unlikely(!clone)) {
+ dec_pending(io, -ENOMEM);
+ return;
+ }
+ /* the clone gets its own copy of the bio_vec entries, which still point at base_bio's pages */
+ clone_init(io, clone);
+ clone->bi_destructor = dm_crypt_bio_destructor;
+ clone->bi_idx = 0;
+ clone->bi_vcnt = bio_segments(base_bio);
+ clone->bi_size = base_bio->bi_size;
+ clone->bi_sector = cc->start + sector;
+ memcpy(clone->bi_io_vec, bio_iovec(base_bio),
+ sizeof(struct bio_vec) * clone->bi_vcnt);
+ /* completion is reported through crypt_endio(), set up by clone_init() */
+ generic_make_request(clone);
+}
+
+static void process_write(struct crypt_io *io)
+{
+ struct crypt_config *cc = io->target->private;
+ struct bio *base_bio = io->base_bio;
+ struct bio *clone;
struct convert_context ctx;
- int r;
+ unsigned remaining = base_bio->bi_size;
+ sector_t sector = base_bio->bi_sector - io->target->begin;
+ unsigned bvec_idx = 0;
- crypt_convert_init(cc, &ctx, io->bio, io->bio,
- io->bio->bi_sector - io->target->begin, 0);
- r = crypt_convert(cc, &ctx);
+ atomic_inc(&io->pending);
+
+ crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1);
+
+ /*
+ * The allocated buffers can be smaller than the whole bio,
+ * so repeat the whole process until all the data can be handled.
+ */
+ while (remaining) {
+ clone = crypt_alloc_buffer(cc, base_bio->bi_size,
+ io->first_clone, &bvec_idx);
+ if (unlikely(!clone)) {
+ dec_pending(io, -ENOMEM);
+ return;
+ }
+
+ ctx.bio_out = clone;
+
+ if (unlikely(crypt_convert(cc, &ctx) < 0)) {
+ crypt_free_buffer_pages(cc, clone, clone->bi_size);
+ bio_put(clone);
+ dec_pending(io, -EIO);
+ return;
+ }
- dec_pending(io, r);
+ clone_init(io, clone);
+ clone->bi_sector = cc->start + sector;
+
+ if (!io->first_clone) {
+ /*
+ * hold a reference to the first clone, because it
+ * holds the bio_vec array and that can't be freed
+ * before all other clones are released
+ */
+ bio_get(clone);
+ io->first_clone = clone;
+ }
+
+ remaining -= clone->bi_size;
+ sector += bio_sectors(clone);
+
+ /* prevent bio_put of first_clone */
+ if (remaining)
+ atomic_inc(&io->pending);
+
+ generic_make_request(clone);
+
+ /* out of memory -> run queues */
+ if (remaining)
+ congestion_wait(bio_data_dir(clone), HZ/100);
+ }
}
-static void kcryptd_queue_io(struct crypt_io *io)
+static void process_read_endio(struct crypt_io *io)
{
- INIT_WORK(&io->work, kcryptd_do_work, io);
- queue_work(_kcryptd_workqueue, &io->work);
+ struct crypt_config *cc = io->target->private;
+ struct convert_context ctx;
+
+ crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio,
+ io->base_bio->bi_sector - io->target->begin, 0);
+
+ dec_pending(io, crypt_convert(cc, &ctx));
+}
+
+static void kcryptd_do_work(struct work_struct *work)
+{
+	struct crypt_io *io = container_of(work, struct crypt_io, work);
+	/* post_process is set by crypt_endio() once a read clone has completed */
+	if (!io->post_process && bio_data_dir(io->base_bio) != READ)
+		process_write(io);
+	else if (!io->post_process)
+		process_read(io);
+	else
+		process_read_endio(io);
}
/*
@@ -477,7 +693,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
buffer[2] = '\0';
- for(i = 0; i < size; i++) {
+ for (i = 0; i < size; i++) {
buffer[0] = *hex++;
buffer[1] = *hex++;
@@ -500,13 +716,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
{
unsigned int i;
- for(i = 0; i < size; i++) {
+ for (i = 0; i < size; i++) {
sprintf(hex, "%02x", *key);
hex += 2;
key++;
}
}
+static int crypt_set_key(struct crypt_config *cc, char *key)
+{
+ unsigned key_size = strlen(key) >> 1; /* two hex digits per key byte */
+ /* a non-zero key size that was recorded earlier may not change */
+ if (cc->key_size && cc->key_size != key_size)
+ return -EINVAL;
+ /* NOTE(review): if 0 was recorded (key "-"), a longer key passes the check above - confirm cc->key has room for it */
+ cc->key_size = key_size; /* initial settings */
+ /* "-" is the only accepted spelling of an empty key; otherwise decode the hex string into cc->key */
+ if ((!key_size && strcmp(key, "-")) ||
+ (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+ return -EINVAL;
+
+ set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
+
+ return 0;
+}
+
+static int crypt_wipe_key(struct crypt_config *cc)
+{
+	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);	/* crypt_preresume() now fails until a key is set again */
+	memset(cc->key, 0, cc->key_size);		/* zero the key bytes; key_size itself is kept */
+	return 0;
+}
+
/*
* Construct an encryption mapping:
* <cipher> <key> <iv_offset> <dev_path> <start>
@@ -539,16 +780,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
key_size = strlen(argv[1]) >> 1;
- cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
if (cc == NULL) {
ti->error =
"Cannot allocate transparent encryption context";
return -ENOMEM;
}
- cc->key_size = key_size;
- if ((!key_size && strcmp(argv[1], "-") != 0) ||
- (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
+ if (crypt_set_key(cc, argv[1])) {
ti->error = "Error decoding key";
goto bad1;
}
@@ -581,7 +820,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->tfm = tfm;
/*
- * Choose ivmode. Valid modes: "plain", "essiv:<esshash>".
+ * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
* See comments at iv code
*/
@@ -591,6 +830,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = &crypt_iv_plain_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
+ else if (strcmp(ivmode, "benbi") == 0)
+ cc->iv_gen_ops = &crypt_iv_benbi_ops;
else {
ti->error = "Invalid IV mode";
goto bad2;
@@ -626,6 +867,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad4;
}
+ cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4);
+ if (!cc->bs) {
+ ti->error = "Cannot allocate crypt bioset";
+ goto bad_bs;
+ }
+
if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) {
ti->error = "Error setting key";
goto bad5;
@@ -665,6 +912,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return 0;
bad5:
+ bioset_free(cc->bs);
+bad_bs:
mempool_destroy(cc->page_pool);
bad4:
mempool_destroy(cc->io_pool);
@@ -684,6 +933,7 @@ static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = (struct crypt_config *) ti->private;
+ bioset_free(cc->bs);
mempool_destroy(cc->page_pool);
mempool_destroy(cc->io_pool);
@@ -698,155 +948,27 @@ static void crypt_dtr(struct dm_target *ti)
kfree(cc);
}
-static int crypt_endio(struct bio *bio, unsigned int done, int error)
-{
- struct crypt_io *io = (struct crypt_io *) bio->bi_private;
- struct crypt_config *cc = (struct crypt_config *) io->target->private;
-
- if (bio_data_dir(bio) == WRITE) {
- /*
- * free the processed pages, even if
- * it's only a partially completed write
- */
- crypt_free_buffer_pages(cc, bio, done);
- }
-
- if (bio->bi_size)
- return 1;
-
- bio_put(bio);
-
- /*
- * successful reads are decrypted by the worker thread
- */
- if ((bio_data_dir(bio) == READ)
- && bio_flagged(bio, BIO_UPTODATE)) {
- kcryptd_queue_io(io);
- return 0;
- }
-
- dec_pending(io, error);
- return error;
-}
-
-static inline struct bio *
-crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
- sector_t sector, unsigned int *bvec_idx,
- struct convert_context *ctx)
-{
- struct bio *clone;
-
- if (bio_data_dir(bio) == WRITE) {
- clone = crypt_alloc_buffer(cc, bio->bi_size,
- io->first_clone, bvec_idx);
- if (clone) {
- ctx->bio_out = clone;
- if (crypt_convert(cc, ctx) < 0) {
- crypt_free_buffer_pages(cc, clone,
- clone->bi_size);
- bio_put(clone);
- return NULL;
- }
- }
- } else {
- /*
- * The block layer might modify the bvec array, so always
- * copy the required bvecs because we need the original
- * one in order to decrypt the whole bio data *afterwards*.
- */
- clone = bio_alloc(GFP_NOIO, bio_segments(bio));
- if (clone) {
- clone->bi_idx = 0;
- clone->bi_vcnt = bio_segments(bio);
- clone->bi_size = bio->bi_size;
- memcpy(clone->bi_io_vec, bio_iovec(bio),
- sizeof(struct bio_vec) * clone->bi_vcnt);
- }
- }
-
- if (!clone)
- return NULL;
-
- clone->bi_private = io;
- clone->bi_end_io = crypt_endio;
- clone->bi_bdev = cc->dev->bdev;
- clone->bi_sector = cc->start + sector;
- clone->bi_rw = bio->bi_rw;
-
- return clone;
-}
-
static int crypt_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
- struct crypt_config *cc = (struct crypt_config *) ti->private;
- struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO);
- struct convert_context ctx;
- struct bio *clone;
- unsigned int remaining = bio->bi_size;
- sector_t sector = bio->bi_sector - ti->begin;
- unsigned int bvec_idx = 0;
+ struct crypt_config *cc = ti->private;
+ struct crypt_io *io;
+ io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
- io->bio = bio;
+ io->base_bio = bio;
io->first_clone = NULL;
- io->error = 0;
- atomic_set(&io->pending, 1); /* hold a reference */
-
- if (bio_data_dir(bio) == WRITE)
- crypt_convert_init(cc, &ctx, NULL, bio, sector, 1);
+ io->error = io->post_process = 0;
+ atomic_set(&io->pending, 0);
+ kcryptd_queue_io(io);
- /*
- * The allocated buffers can be smaller than the whole bio,
- * so repeat the whole process until all the data can be handled.
- */
- while (remaining) {
- clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx);
- if (!clone)
- goto cleanup;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
- atomic_inc(&io->pending);
-
- remaining -= clone->bi_size;
- sector += bio_sectors(clone);
-
- generic_make_request(clone);
-
- /* out of memory -> run queues */
- if (remaining)
- blk_congestion_wait(bio_data_dir(clone), HZ/100);
- }
-
- /* drop reference, clones could have returned before we reach this */
- dec_pending(io, 0);
- return 0;
-
-cleanup:
- if (io->first_clone) {
- dec_pending(io, -ENOMEM);
- return 0;
- }
-
- /* if no bio has been dispatched yet, we can directly return the error */
- mempool_free(io, cc->io_pool);
- return -ENOMEM;
+ return DM_MAPIO_SUBMITTED;
}
static int crypt_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
struct crypt_config *cc = (struct crypt_config *) ti->private;
- const char *cipher;
- const char *chainmode = NULL;
unsigned int sz = 0;
switch (type) {
@@ -855,14 +977,11 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- cipher = crypto_blkcipher_name(cc->tfm);
-
- chainmode = cc->chainmode;
-
if (cc->iv_mode)
- DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
+ DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode,
+ cc->iv_mode);
else
- DMEMIT("%s-%s ", cipher, chainmode);
+ DMEMIT("%s-%s ", cc->cipher, cc->chainmode);
if (cc->key_size > 0) {
if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -883,14 +1002,71 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
return 0;
}
+/* Flag the target as suspended: crypt_message() only allows key
+ * manipulation while DM_CRYPT_SUSPENDED is set */
+static void crypt_postsuspend(struct dm_target *ti)
+{
+	set_bit(DM_CRYPT_SUSPENDED, &((struct crypt_config *)ti->private)->flags);
+}
+
+/* Refuse to resume (-EAGAIN) while DM_CRYPT_KEY_VALID is clear, e.g. after "key wipe" */
+static int crypt_preresume(struct dm_target *ti)
+{
+	struct crypt_config *cc = ti->private;
+
+	if (test_bit(DM_CRYPT_KEY_VALID, &cc->flags))
+		return 0;
+
+	DMERR("aborting resume - crypt key is not set.");
+	return -EAGAIN;
+}
+
+/* Counterpart of crypt_postsuspend(): clear DM_CRYPT_SUSPENDED, after
+ * which crypt_message() rejects key manipulation again */
+static void crypt_resume(struct dm_target *ti)
+{
+	clear_bit(DM_CRYPT_SUSPENDED, &((struct crypt_config *)ti->private)->flags);
+}
+
+/* Message interface
+ * key set <key>
+ * key wipe
+ */
+static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct crypt_config *cc = ti->private;
+
+	/* every recognised message has the form "key <verb> [<key>]" */
+	if (argc < 2 || strnicmp(argv[0], MESG_STR("key")))
+		goto error;
+
+	if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
+		DMWARN("not suspended during key manipulation.");
+		return -EINVAL;
+	}
+
+	if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
+		return crypt_set_key(cc, argv[2]);
+	if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
+		return crypt_wipe_key(cc);
+
+error:
+	DMWARN("unrecognised message received.");
+	return -EINVAL;
+}
+
static struct target_type crypt_target = {
.name = "crypt",
- .version= {1, 1, 0},
+ .version= {1, 3, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
.map = crypt_map,
.status = crypt_status,
+ .postsuspend = crypt_postsuspend,
+ .preresume = crypt_preresume,
+ .resume = crypt_resume,
+ .message = crypt_message,
};
static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index 2a374ccb30dd..265c467854da 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -40,7 +40,7 @@ static inline void free_bio(struct bio *bio)
static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
{
- struct path *path = bio->bi_private;
+ struct dm_path *path = bio->bi_private;
if (bio->bi_size)
return 1;
@@ -61,7 +61,7 @@ static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
return 0;
}
-static struct bio *get_failover_bio(struct path *path, unsigned data_size)
+static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size)
{
struct bio *bio;
struct page *page;
@@ -96,7 +96,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
}
static struct request *get_failover_req(struct emc_handler *h,
- struct bio *bio, struct path *path)
+ struct bio *bio, struct dm_path *path)
{
struct request *rq;
struct block_device *bdev = bio->bi_bdev;
@@ -126,13 +126,14 @@ static struct request *get_failover_req(struct emc_handler *h,
memset(&rq->cmd, 0, BLK_MAX_CDB);
rq->timeout = EMC_FAILOVER_TIMEOUT;
- rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE);
+ rq->cmd_type = REQ_TYPE_BLOCK_PC;
+ rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;
return rq;
}
static struct request *emc_trespass_get(struct emc_handler *h,
- struct path *path)
+ struct dm_path *path)
{
struct bio *bio;
struct request *rq;
@@ -190,7 +191,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
}
static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
- struct path *path)
+ struct dm_path *path)
{
struct request *rq;
struct request_queue *q = bdev_get_queue(path->dev->bdev);
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index d12379b5cdb5..99cdffa7fbfe 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#define DM_MSG_PREFIX "snapshots"
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
/*-----------------------------------------------------------------
* Persistent snapshots, by persistent we mean that the snapshot
@@ -150,6 +151,7 @@ static int alloc_area(struct pstore *ps)
static void free_area(struct pstore *ps)
{
vfree(ps->area);
+ ps->area = NULL;
}
/*
@@ -198,48 +200,79 @@ static int read_header(struct pstore *ps, int *new_snapshot)
int r;
struct disk_header *dh;
chunk_t chunk_size;
+ int chunk_size_supplied = 1;
- r = chunk_io(ps, 0, READ);
+ /*
+ * Use default chunk size (or hardsect_size, if larger) if none supplied
+ */
+ if (!ps->snap->chunk_size) {
+ ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+ bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
+ ps->snap->chunk_mask = ps->snap->chunk_size - 1;
+ ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+ chunk_size_supplied = 0;
+ }
+
+ r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
if (r)
return r;
+ r = alloc_area(ps);
+ if (r)
+ goto bad1;
+
+ r = chunk_io(ps, 0, READ);
+ if (r)
+ goto bad2;
+
dh = (struct disk_header *) ps->area;
if (le32_to_cpu(dh->magic) == 0) {
*new_snapshot = 1;
+ return 0;
+ }
- } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
- *new_snapshot = 0;
- ps->valid = le32_to_cpu(dh->valid);
- ps->version = le32_to_cpu(dh->version);
- chunk_size = le32_to_cpu(dh->chunk_size);
- if (ps->snap->chunk_size != chunk_size) {
- DMWARN("chunk size %llu in device metadata overrides "
- "table chunk size of %llu.",
- (unsigned long long)chunk_size,
- (unsigned long long)ps->snap->chunk_size);
-
- /* We had a bogus chunk_size. Fix stuff up. */
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
- free_area(ps);
-
- ps->snap->chunk_size = chunk_size;
- ps->snap->chunk_mask = chunk_size - 1;
- ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
- r = alloc_area(ps);
- if (r)
- return r;
-
- r = dm_io_get(sectors_to_pages(chunk_size));
- if (r)
- return r;
- }
- } else {
- DMWARN("Invalid/corrupt snapshot");
+ if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
+ DMWARN("Invalid or corrupt snapshot");
r = -ENXIO;
+ goto bad2;
}
+ *new_snapshot = 0;
+ ps->valid = le32_to_cpu(dh->valid);
+ ps->version = le32_to_cpu(dh->version);
+ chunk_size = le32_to_cpu(dh->chunk_size);
+
+ if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+ return 0;
+
+ DMWARN("chunk size %llu in device metadata overrides "
+ "table chunk size of %llu.",
+ (unsigned long long)chunk_size,
+ (unsigned long long)ps->snap->chunk_size);
+
+ /* We had a bogus chunk_size. Fix stuff up. */
+ dm_io_put(sectors_to_pages(ps->snap->chunk_size));
+ free_area(ps);
+
+ ps->snap->chunk_size = chunk_size;
+ ps->snap->chunk_mask = chunk_size - 1;
+ ps->snap->chunk_shift = ffs(chunk_size) - 1;
+
+ r = dm_io_get(sectors_to_pages(chunk_size));
+ if (r)
+ return r;
+
+ r = alloc_area(ps);
+ if (r)
+ goto bad1;
+
+ return 0;
+
+bad2:
+ free_area(ps);
+bad1:
+ dm_io_put(sectors_to_pages(ps->snap->chunk_size));
return r;
}
@@ -263,42 +296,29 @@ static int write_header(struct pstore *ps)
*/
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
- if (index >= ps->exceptions_per_area)
- return NULL;
+ BUG_ON(index >= ps->exceptions_per_area);
return ((struct disk_exception *) ps->area) + index;
}
-static int read_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *result)
+static void read_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *result)
{
- struct disk_exception *e;
-
- e = get_exception(ps, index);
- if (!e)
- return -EINVAL;
+ struct disk_exception *e = get_exception(ps, index);
/* copy it */
result->old_chunk = le64_to_cpu(e->old_chunk);
result->new_chunk = le64_to_cpu(e->new_chunk);
-
- return 0;
}
-static int write_exception(struct pstore *ps,
- uint32_t index, struct disk_exception *de)
+static void write_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *de)
{
- struct disk_exception *e;
-
- e = get_exception(ps, index);
- if (!e)
- return -EINVAL;
+ struct disk_exception *e = get_exception(ps, index);
/* copy it */
e->old_chunk = cpu_to_le64(de->old_chunk);
e->new_chunk = cpu_to_le64(de->new_chunk);
-
- return 0;
}
/*
@@ -316,10 +336,7 @@ static int insert_exceptions(struct pstore *ps, int *full)
*full = 1;
for (i = 0; i < ps->exceptions_per_area; i++) {
- r = read_exception(ps, i, &de);
-
- if (r)
- return r;
+ read_exception(ps, i, &de);
/*
* If the new_chunk is pointing at the start of
@@ -519,6 +536,16 @@ static void persistent_commit(struct exception_store *store,
if (r)
ps->valid = 0;
+ /*
+ * Have we completely filled the current area ?
+ */
+ if (ps->current_committed == ps->exceptions_per_area) {
+ ps->current_committed = 0;
+ r = zero_area(ps, ps->current_area + 1);
+ if (r)
+ ps->valid = 0;
+ }
+
for (i = 0; i < ps->callback_count; i++) {
cb = ps->callbacks + i;
cb->callback(cb->context, r == 0 ? 1 : 0);
@@ -526,16 +553,6 @@ static void persistent_commit(struct exception_store *store,
ps->callback_count = 0;
}
-
- /*
- * Have we completely filled the current area ?
- */
- if (ps->current_committed == ps->exceptions_per_area) {
- ps->current_committed = 0;
- r = zero_area(ps, ps->current_area + 1);
- if (r)
- ps->valid = 0;
- }
}
static void persistent_drop(struct exception_store *store)
@@ -547,32 +564,22 @@ static void persistent_drop(struct exception_store *store)
DMWARN("write header failed");
}
-int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
+int dm_create_persistent(struct exception_store *store)
{
- int r;
struct pstore *ps;
- r = dm_io_get(sectors_to_pages(chunk_size));
- if (r)
- return r;
-
/* allocate the pstore */
ps = kmalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps) {
- r = -ENOMEM;
- goto bad;
- }
+ if (!ps)
+ return -ENOMEM;
ps->snap = store->snap;
ps->valid = 1;
ps->version = SNAPSHOT_DISK_VERSION;
+ ps->area = NULL;
ps->next_free = 2; /* skipping the header and first area */
ps->current_committed = 0;
- r = alloc_area(ps);
- if (r)
- goto bad;
-
ps->callback_count = 0;
atomic_set(&ps->pending_count, 0);
ps->callbacks = NULL;
@@ -586,13 +593,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
store->context = ps;
return 0;
-
- bad:
- dm_io_put(sectors_to_pages(chunk_size));
- if (ps && ps->area)
- free_area(ps);
- kfree(ps);
- return r;
}
/*-----------------------------------------------------------------
@@ -642,18 +642,16 @@ static void transient_fraction_full(struct exception_store *store,
*denominator = get_dev_size(store->snap->cow->bdev);
}
-int dm_create_transient(struct exception_store *store,
- struct dm_snapshot *s, int blocksize)
+int dm_create_transient(struct exception_store *store)
{
struct transient_c *tc;
- memset(store, 0, sizeof(*store));
store->destroy = transient_destroy;
store->read_metadata = transient_read_metadata;
store->prepare_exception = transient_prepare;
store->commit_exception = transient_commit;
+ store->drop_snapshot = NULL;
store->fraction_full = transient_fraction_full;
- store->snap = s;
tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
if (!tc)
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
index 15f5629e231a..32eff28e4adc 100644
--- a/drivers/md/dm-hw-handler.h
+++ b/drivers/md/dm-hw-handler.h
@@ -32,7 +32,7 @@ struct hw_handler_type {
void (*destroy) (struct hw_handler *hwh);
void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
- struct path *path);
+ struct dm_path *path);
unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
int (*status) (struct hw_handler *hwh, status_type_t type,
char *result, unsigned int maxlen);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index da663d2ff552..4eb73d395213 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -92,12 +92,12 @@ void dm_io_put(unsigned int num_pages)
*---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
- bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region;
+ bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}
static inline unsigned bio_get_region(struct bio *bio)
{
- return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len;
+ return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
/*-----------------------------------------------------------------
@@ -136,6 +136,7 @@ static int endio(struct bio *bio, unsigned int done, int error)
zero_fill_bio(bio);
dec_count(io, bio_get_region(bio), error);
+ bio->bi_max_vecs++;
bio_put(bio);
return 0;
@@ -250,16 +251,18 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
while (remaining) {
/*
- * Allocate a suitably sized bio, we add an extra
- * bvec for bio_get/set_region().
+ * Allocate a suitably sized-bio: we add an extra
+ * bvec for bio_get/set_region() and decrement bi_max_vecs
+ * to hide it from bio_add_page().
*/
- num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2;
+ num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
bio->bi_private = io;
bio->bi_destructor = dm_bio_destructor;
+ bio->bi_max_vecs--;
bio_set_region(bio, region);
/*
@@ -302,7 +305,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
}
/*
- * Drop the extra refence that we were holding to avoid
+ * Drop the extra reference that we were holding to avoid
* the io being completed too early.
*/
dec_count(io, 0, 0);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d13bb15a8a02..cd6a184536a1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -606,9 +606,14 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
return __get_name_cell(param->name);
md = dm_get_md(huge_decode_dev(param->dev));
- if (md)
- mdptr = dm_get_mdptr(md);
+ if (!md)
+ goto out;
+ mdptr = dm_get_mdptr(md);
+ if (!mdptr)
+ dm_put(md);
+
+out:
return mdptr;
}
@@ -760,7 +765,7 @@ out:
static int do_suspend(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct mapped_device *md;
md = find_device(param);
@@ -768,10 +773,12 @@ static int do_suspend(struct dm_ioctl *param)
return -ENXIO;
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- r = dm_suspend(md, do_lockfs);
+ r = dm_suspend(md, suspend_flags);
if (!r)
r = __dev_status(md, param);
@@ -783,7 +790,7 @@ static int do_suspend(struct dm_ioctl *param)
static int do_resume(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct hash_cell *hc;
struct mapped_device *md;
struct dm_table *new_map;
@@ -809,9 +816,11 @@ static int do_resume(struct dm_ioctl *param)
if (new_map) {
/* Suspend if it isn't already suspended */
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- dm_suspend(md, do_lockfs);
+ dm_suspend(md, suspend_flags);
r = dm_swap_table(md, new_map);
if (r) {
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 47b3c62bbdb8..17753d80ad22 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -77,7 +77,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio,
bio->bi_bdev = lc->dev->bdev;
bio->bi_sector = lc->start + (bio->bi_sector - ti->begin);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int linear_status(struct dm_target *ti, status_type_t type,
@@ -98,14 +98,31 @@ static int linear_status(struct dm_target *ti, status_type_t type,
return 0;
}
+static int linear_ioctl(struct dm_target *ti, struct inode *inode,
+ struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+ struct block_device *bdev = lc->dev->bdev;
+ struct file fake_file = {};
+ struct dentry fake_dentry = {};
+
+ fake_file.f_mode = lc->dev->mode;
+ fake_file.f_path.dentry = &fake_dentry;
+ fake_dentry.d_inode = bdev->bd_inode;
+
+ return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
+}
+
static struct target_type linear_target = {
.name = "linear",
- .version= {1, 0, 1},
+ .version= {1, 0, 2},
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
.status = linear_status,
+ .ioctl = linear_ioctl,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 64b764bd02cc..6a9261351848 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -466,6 +466,7 @@ static int disk_resume(struct dirty_log *log)
/* copy clean across to sync */
memcpy(lc->sync_bits, lc->clean_bits, size);
lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
+ lc->sync_search = 0;
/* set the correct number of regions in the header */
lc->header.nr_regions = lc->region_count;
@@ -480,6 +481,13 @@ static uint32_t core_get_region_size(struct dirty_log *log)
return lc->region_size;
}
+static int core_resume(struct dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ lc->sync_search = 0;
+ return 0;
+}
+
static int core_is_clean(struct dirty_log *log, region_t region)
{
struct log_c *lc = (struct log_c *) log->context;
@@ -549,16 +557,19 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
return 1;
}
-static void core_complete_resync_work(struct dirty_log *log, region_t region,
- int success)
+static void core_set_region_sync(struct dirty_log *log, region_t region,
+ int in_sync)
{
struct log_c *lc = (struct log_c *) log->context;
log_clear_bit(lc, lc->recovering_bits, region);
- if (success) {
+ if (in_sync) {
log_set_bit(lc, lc->sync_bits, region);
lc->sync_count++;
- }
+ } else if (log_test_bit(lc->sync_bits, region)) {
+ lc->sync_count--;
+ log_clear_bit(lc, lc->sync_bits, region);
+ }
}
static region_t core_get_sync_count(struct dirty_log *log)
@@ -618,6 +629,7 @@ static struct dirty_log_type _core_type = {
.module = THIS_MODULE,
.ctr = core_ctr,
.dtr = core_dtr,
+ .resume = core_resume,
.get_region_size = core_get_region_size,
.is_clean = core_is_clean,
.in_sync = core_in_sync,
@@ -625,7 +637,7 @@ static struct dirty_log_type _core_type = {
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
+ .set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = core_status,
};
@@ -644,7 +656,7 @@ static struct dirty_log_type _disk_type = {
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
+ .set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = disk_status,
};
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h
index 5ae5309ebf28..86a301c8daf1 100644
--- a/drivers/md/dm-log.h
+++ b/drivers/md/dm-log.h
@@ -90,12 +90,12 @@ struct dirty_log_type {
int (*get_resync_work)(struct dirty_log *log, region_t *region);
/*
- * This notifies the log that the resync of an area has
- * been completed. The log should then mark this region
- * as CLEAN.
+ * This notifies the log that the resync status of a region
+ * has changed. It also clears the region from the recovering
+ * list (if present).
*/
- void (*complete_resync_work)(struct dirty_log *log,
- region_t region, int success);
+ void (*set_region_sync)(struct dirty_log *log,
+ region_t region, int in_sync);
/*
* Returns the number of regions that are in sync.
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 93f701ea87bc..3aa013506967 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -31,7 +31,7 @@ struct pgpath {
struct priority_group *pg; /* Owning PG */
unsigned fail_count; /* Cumulative failure count */
- struct path path;
+ struct dm_path path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -101,11 +101,11 @@ typedef int (*action_fn) (struct pgpath *pgpath);
#define MIN_IOS 256 /* Mempool size */
-static kmem_cache_t *_mpio_cache;
+static struct kmem_cache *_mpio_cache;
struct workqueue_struct *kmultipathd;
-static void process_queued_ios(void *data);
-static void trigger_event(void *data);
+static void process_queued_ios(struct work_struct *work);
+static void trigger_event(struct work_struct *work);
/*-----------------------------------------------
@@ -114,12 +114,10 @@ static void trigger_event(void *data);
static struct pgpath *alloc_pgpath(void)
{
- struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL);
+ struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
- if (pgpath) {
- memset(pgpath, 0, sizeof(*pgpath));
+ if (pgpath)
pgpath->path.is_active = 1;
- }
return pgpath;
}
@@ -133,12 +131,10 @@ static struct priority_group *alloc_priority_group(void)
{
struct priority_group *pg;
- pg = kmalloc(sizeof(*pg), GFP_KERNEL);
- if (!pg)
- return NULL;
+ pg = kzalloc(sizeof(*pg), GFP_KERNEL);
- memset(pg, 0, sizeof(*pg));
- INIT_LIST_HEAD(&pg->pgpaths);
+ if (pg)
+ INIT_LIST_HEAD(&pg->pgpaths);
return pg;
}
@@ -168,23 +164,24 @@ static void free_priority_group(struct priority_group *pg,
kfree(pg);
}
-static struct multipath *alloc_multipath(void)
+static struct multipath *alloc_multipath(struct dm_target *ti)
{
struct multipath *m;
- m = kmalloc(sizeof(*m), GFP_KERNEL);
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
- memset(m, 0, sizeof(*m));
INIT_LIST_HEAD(&m->priority_groups);
spin_lock_init(&m->lock);
m->queue_io = 1;
- INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
- INIT_WORK(&m->trigger_event, trigger_event, m);
+ INIT_WORK(&m->process_queued_ios, process_queued_ios);
+ INIT_WORK(&m->trigger_event, trigger_event);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
return NULL;
}
+ m->ti = ti;
+ ti->private = m;
}
return m;
@@ -232,7 +229,7 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
{
- struct path *path;
+ struct dm_path *path;
path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
if (!path)
@@ -285,10 +282,27 @@ failed:
m->current_pg = NULL;
}
+/*
+ * Check whether bios must be queued in the device-mapper core rather
+ * than here in the target.
+ *
+ * m->lock must be held on entry.
+ *
+ * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
+ * same value then we are not between multipath_presuspend()
+ * and multipath_resume() calls and we have no need to check
+ * for the DMF_NOFLUSH_SUSPENDING flag.
+ */
+static int __must_push_back(struct multipath *m)
+{
+ return (m->queue_if_no_path != m->saved_queue_if_no_path &&
+ dm_noflush_suspending(m->ti));
+}
+
static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
unsigned was_queued)
{
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
unsigned long flags;
struct pgpath *pgpath;
@@ -313,11 +327,13 @@ static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
- r = 0;
- } else if (!pgpath)
- r = -EIO; /* Failed */
- else
+ r = DM_MAPIO_SUBMITTED;
+ } else if (pgpath)
bio->bi_bdev = pgpath->path.dev->bdev;
+ else if (__must_push_back(m))
+ r = DM_MAPIO_REQUEUE;
+ else
+ r = -EIO; /* Failed */
mpio->pgpath = pgpath;
@@ -375,16 +391,19 @@ static void dispatch_queued_ios(struct multipath *m)
r = map_io(m, bio, mpio, 1);
if (r < 0)
bio_endio(bio, bio->bi_size, r);
- else if (r == 1)
+ else if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
+ else if (r == DM_MAPIO_REQUEUE)
+ bio_endio(bio, bio->bi_size, -EIO);
bio = next;
}
}
-static void process_queued_ios(void *data)
+static void process_queued_ios(struct work_struct *work)
{
- struct multipath *m = (struct multipath *) data;
+ struct multipath *m =
+ container_of(work, struct multipath, process_queued_ios);
struct hw_handler *hwh = &m->hw_handler;
struct pgpath *pgpath = NULL;
unsigned init_required = 0, must_queue = 1;
@@ -424,9 +443,10 @@ out:
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
-static void trigger_event(void *data)
+static void trigger_event(struct work_struct *work)
{
- struct multipath *m = (struct multipath *) data;
+ struct multipath *m =
+ container_of(work, struct multipath, trigger_event);
dm_table_event(m->ti->table);
}
@@ -557,8 +577,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
}
static struct priority_group *parse_priority_group(struct arg_set *as,
- struct multipath *m,
- struct dm_target *ti)
+ struct multipath *m)
{
static struct param _params[] = {
{1, 1024, "invalid number of paths"},
@@ -568,6 +587,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
int r;
unsigned i, nr_selector_args, nr_params;
struct priority_group *pg;
+ struct dm_target *ti = m->ti;
if (as->argc < 2) {
as->argc = 0;
@@ -624,12 +644,12 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
return NULL;
}
-static int parse_hw_handler(struct arg_set *as, struct multipath *m,
- struct dm_target *ti)
+static int parse_hw_handler(struct arg_set *as, struct multipath *m)
{
int r;
struct hw_handler_type *hwht;
unsigned hw_argc;
+ struct dm_target *ti = m->ti;
static struct param _params[] = {
{0, 1024, "invalid number of hardware handler args"},
@@ -661,11 +681,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m,
return 0;
}
-static int parse_features(struct arg_set *as, struct multipath *m,
- struct dm_target *ti)
+static int parse_features(struct arg_set *as, struct multipath *m)
{
int r;
unsigned argc;
+ struct dm_target *ti = m->ti;
static struct param _params[] = {
{0, 1, "invalid number of feature args"},
@@ -704,19 +724,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
as.argc = argc;
as.argv = argv;
- m = alloc_multipath();
+ m = alloc_multipath(ti);
if (!m) {
ti->error = "can't allocate multipath";
return -EINVAL;
}
- m->ti = ti;
-
- r = parse_features(&as, m, ti);
+ r = parse_features(&as, m);
if (r)
goto bad;
- r = parse_hw_handler(&as, m, ti);
+ r = parse_hw_handler(&as, m);
if (r)
goto bad;
@@ -732,7 +750,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
while (as.argc) {
struct priority_group *pg;
- pg = parse_priority_group(&as, m, ti);
+ pg = parse_priority_group(&as, m);
if (!pg) {
r = -EINVAL;
goto bad;
@@ -752,8 +770,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
goto bad;
}
- ti->private = m;
-
return 0;
bad:
@@ -788,7 +804,7 @@ static int multipath_map(struct dm_target *ti, struct bio *bio,
map_context->ptr = mpio;
bio->bi_rw |= (1 << BIO_RW_FAILFAST);
r = map_io(m, bio, mpio, 0);
- if (r < 0)
+ if (r < 0 || r == DM_MAPIO_REQUEUE)
mempool_free(mpio, m->mpio_pool);
return r;
@@ -962,7 +978,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
/*
* pg_init must call this when it has completed its initialisation
*/
-void dm_pg_init_complete(struct path *path, unsigned err_flags)
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
{
struct pgpath *pgpath = path_to_pgpath(path);
struct priority_group *pg = pgpath->pg;
@@ -1012,7 +1028,10 @@ static int do_end_io(struct multipath *m, struct bio *bio,
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
- if (!m->queue_if_no_path) {
+ if (__must_push_back(m)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ return DM_ENDIO_REQUEUE;
+ } else if (!m->queue_if_no_path) {
spin_unlock_irqrestore(&m->lock, flags);
return -EIO;
} else {
@@ -1047,7 +1066,7 @@ static int do_end_io(struct multipath *m, struct bio *bio,
queue_work(kmultipathd, &m->process_queued_ios);
spin_unlock_irqrestore(&m->lock, flags);
- return 1; /* io not complete */
+ return DM_ENDIO_INCOMPLETE; /* io not complete */
}
static int multipath_end_io(struct dm_target *ti, struct bio *bio,
@@ -1065,7 +1084,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path);
}
- if (r <= 0)
+ if (r != DM_ENDIO_INCOMPLETE)
mempool_free(mpio, m->mpio_pool);
return r;
@@ -1266,12 +1285,47 @@ error:
return -EINVAL;
}
+static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
+ struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ struct block_device *bdev = NULL;
+ unsigned long flags;
+ struct file fake_file = {};
+ struct dentry fake_dentry = {};
+ int r = 0;
+
+ fake_file.f_path.dentry = &fake_dentry;
+
+ spin_lock_irqsave(&m->lock, flags);
+
+ if (!m->current_pgpath)
+ __choose_pgpath(m);
+
+ if (m->current_pgpath) {
+ bdev = m->current_pgpath->path.dev->bdev;
+ fake_dentry.d_inode = bdev->bd_inode;
+ fake_file.f_mode = m->current_pgpath->path.dev->mode;
+ }
+
+ if (m->queue_io)
+ r = -EAGAIN;
+ else if (!bdev)
+ r = -EIO;
+
+ spin_unlock_irqrestore(&m->lock, flags);
+
+ return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file,
+ bdev->bd_disk, cmd, arg);
+}
+
/*-----------------------------------------------------------------
* Module setup
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 0, 4},
+ .version = {1, 0, 5},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
@@ -1281,6 +1335,7 @@ static struct target_type multipath_target = {
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
+ .ioctl = multipath_ioctl,
};
static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
index 8a4bf2b6d52e..b9cdcbb3ed59 100644
--- a/drivers/md/dm-mpath.h
+++ b/drivers/md/dm-mpath.h
@@ -11,7 +11,7 @@
struct dm_dev;
-struct path {
+struct dm_path {
struct dm_dev *dev; /* Read-only */
unsigned is_active; /* Read-only */
@@ -20,6 +20,6 @@ struct path {
};
/* Callback for hwh_pg_init_fn to use when complete */
-void dm_pg_init_complete(struct path *path, unsigned err_flags);
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
#endif
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 732d06a84f85..27357b85d73d 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -44,7 +44,7 @@ struct path_selector_type {
* Add an opaque path object, along with some selector specific
* path args (eg, path priority).
*/
- int (*add_path) (struct path_selector *ps, struct path *path,
+ int (*add_path) (struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error);
/*
@@ -55,27 +55,27 @@ struct path_selector_type {
* calling the function again. 0 means don't call it again unless
* the path fails.
*/
- struct path *(*select_path) (struct path_selector *ps,
+ struct dm_path *(*select_path) (struct path_selector *ps,
unsigned *repeat_count);
/*
* Notify the selector that a path has failed.
*/
- void (*fail_path) (struct path_selector *ps, struct path *p);
+ void (*fail_path) (struct path_selector *ps, struct dm_path *p);
/*
* Ask selector to reinstate a path.
*/
- int (*reinstate_path) (struct path_selector *ps, struct path *p);
+ int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
/*
* Table content based on parameters added in ps_add_path_fn
* or path selector status
*/
- int (*status) (struct path_selector *ps, struct path *path,
+ int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
- int (*end_io) (struct path_selector *ps, struct path *path);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path);
};
/* Register a path selector */
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index c54de989eb00..23a642619bed 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -24,6 +24,7 @@
static struct workqueue_struct *_kmirrord_wq;
static struct work_struct _kmirrord_work;
+static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
static inline void wake(void)
{
@@ -83,6 +84,7 @@ struct region_hash {
struct list_head *buckets;
spinlock_t region_lock;
+ atomic_t recovery_in_flight;
struct semaphore recovery_count;
struct list_head clean_regions;
struct list_head quiesced_regions;
@@ -191,6 +193,7 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms,
spin_lock_init(&rh->region_lock);
sema_init(&rh->recovery_count, 0);
+ atomic_set(&rh->recovery_in_flight, 0);
INIT_LIST_HEAD(&rh->clean_regions);
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
@@ -341,6 +344,17 @@ static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
}
}
+static void complete_resync_work(struct region *reg, int success)
+{
+ struct region_hash *rh = reg->rh;
+
+ rh->log->type->set_region_sync(rh->log, reg->key, success);
+ dispatch_bios(rh->ms, &reg->delayed_bios);
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ wake_up_all(&_kmirrord_recovery_stopped);
+ up(&rh->recovery_count);
+}
+
static void rh_update_states(struct region_hash *rh)
{
struct region *reg, *next;
@@ -380,9 +394,7 @@ static void rh_update_states(struct region_hash *rh)
*/
list_for_each_entry_safe (reg, next, &recovered, list) {
rh->log->type->clear_region(rh->log, reg->key);
- rh->log->type->complete_resync_work(rh->log, reg->key, 1);
- dispatch_bios(rh->ms, &reg->delayed_bios);
- up(&rh->recovery_count);
+ complete_resync_work(reg, 1);
mempool_free(reg, rh->region_pool);
}
@@ -502,11 +514,21 @@ static int __rh_recovery_prepare(struct region_hash *rh)
static void rh_recovery_prepare(struct region_hash *rh)
{
- while (!down_trylock(&rh->recovery_count))
+ /* Extra reference to avoid race with rh_stop_recovery */
+ atomic_inc(&rh->recovery_in_flight);
+
+ while (!down_trylock(&rh->recovery_count)) {
+ atomic_inc(&rh->recovery_in_flight);
if (__rh_recovery_prepare(rh) <= 0) {
+ atomic_dec(&rh->recovery_in_flight);
up(&rh->recovery_count);
break;
}
+ }
+
+ /* Drop the extra reference */
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ wake_up_all(&_kmirrord_recovery_stopped);
}
/*
@@ -868,7 +890,7 @@ static void do_mirror(struct mirror_set *ms)
do_writes(ms, &writes);
}
-static void do_work(void *ignored)
+static void do_work(struct work_struct *ignored)
{
struct mirror_set *ms;
@@ -1122,7 +1144,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (rw == WRITE) {
queue_bio(ms, bio, rw);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
r = ms->rh.log->type->in_sync(ms->rh.log,
@@ -1131,7 +1153,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
return r;
if (r == -EWOULDBLOCK) /* FIXME: ugly */
- r = 0;
+ r = DM_MAPIO_SUBMITTED;
/*
* We don't want to fast track a recovery just for a read
@@ -1144,7 +1166,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (!r) {
/* Pass this io over to the daemon */
queue_bio(ms, bio, rw);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
m = choose_mirror(ms, bio->bi_sector);
@@ -1152,7 +1174,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
return -EIO;
map_bio(ms, m, bio);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
@@ -1177,6 +1199,11 @@ static void mirror_postsuspend(struct dm_target *ti)
struct dirty_log *log = ms->rh.log;
rh_stop_recovery(&ms->rh);
+
+ /* Wait for all I/O we generated to complete */
+ wait_event(_kmirrord_recovery_stopped,
+ !atomic_read(&ms->rh.recovery_in_flight));
+
if (log->type->suspend && log->type->suspend(log))
/* FIXME: need better error handling */
DMWARN("log suspend failed");
@@ -1213,9 +1240,9 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- DMEMIT("%d ", ms->nr_mirrors);
+ DMEMIT("%d", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++)
- DMEMIT("%s %llu ", ms->mirror[m].dev->name,
+ DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
}
@@ -1249,7 +1276,7 @@ static int __init dm_mirror_init(void)
dm_dirty_log_exit();
return r;
}
- INIT_WORK(&_kmirrord_work, do_work, NULL);
+ INIT_WORK(&_kmirrord_work, do_work);
r = dm_register_target(&mirror_target);
if (r < 0) {
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index c5a16c550122..a348a97b65af 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -21,7 +21,7 @@
*---------------------------------------------------------------*/
struct path_info {
struct list_head list;
- struct path *path;
+ struct dm_path *path;
unsigned repeat_count;
};
@@ -80,7 +80,7 @@ static void rr_destroy(struct path_selector *ps)
ps->context = NULL;
}
-static int rr_status(struct path_selector *ps, struct path *path,
+static int rr_status(struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen)
{
struct path_info *pi;
@@ -106,7 +106,7 @@ static int rr_status(struct path_selector *ps, struct path *path,
* Called during initialisation to register each path with an
* optional repeat_count.
*/
-static int rr_add_path(struct path_selector *ps, struct path *path,
+static int rr_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
struct selector *s = (struct selector *) ps->context;
@@ -136,12 +136,12 @@ static int rr_add_path(struct path_selector *ps, struct path *path,
path->pscontext = pi;
- list_add(&pi->list, &s->valid_paths);
+ list_add_tail(&pi->list, &s->valid_paths);
return 0;
}
-static void rr_fail_path(struct path_selector *ps, struct path *p)
+static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = p->pscontext;
@@ -149,7 +149,7 @@ static void rr_fail_path(struct path_selector *ps, struct path *p)
list_move(&pi->list, &s->invalid_paths);
}
-static int rr_reinstate_path(struct path_selector *ps, struct path *p)
+static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = p->pscontext;
@@ -159,7 +159,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct path *p)
return 0;
}
-static struct path *rr_select_path(struct path_selector *ps,
+static struct dm_path *rr_select_path(struct path_selector *ps,
unsigned *repeat_count)
{
struct selector *s = (struct selector *) ps->context;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 1d0fafda0f76..0821a2b68a73 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -39,6 +39,9 @@
*/
#define SNAPSHOT_PAGES 256
+static struct workqueue_struct *ksnapd;
+static void flush_queued_bios(struct work_struct *work);
+
struct pending_exception {
struct exception e;
@@ -56,7 +59,7 @@ struct pending_exception {
/*
* The primary pending_exception is the one that holds
- * the sibling_count and the list of origin_bios for a
+ * the ref_count and the list of origin_bios for a
* group of pending_exceptions. It is always last to get freed.
* These fields get set up when writing to the origin.
*/
@@ -69,7 +72,7 @@ struct pending_exception {
* the sibling concerned and not pe->primary_pe->snap->lock unless
* they are the same.
*/
- atomic_t sibling_count;
+ atomic_t ref_count;
/* Pointer back to snapshot context */
struct dm_snapshot *snap;
@@ -85,8 +88,8 @@ struct pending_exception {
* Hash table mapping origin volumes to lists of snapshots and
* a lock to protect it
*/
-static kmem_cache_t *exception_cache;
-static kmem_cache_t *pending_cache;
+static struct kmem_cache *exception_cache;
+static struct kmem_cache *pending_cache;
static mempool_t *pending_pool;
/*
@@ -225,7 +228,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size)
return 0;
}
-static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
{
struct list_head *slot;
struct exception *ex, *next;
@@ -387,15 +390,46 @@ static inline ulong round_up(ulong n, ulong size)
return (n + size) & ~size;
}
-static void read_snapshot_metadata(struct dm_snapshot *s)
+static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
+ char **error)
{
- if (s->store.read_metadata(&s->store)) {
- down_write(&s->lock);
- s->valid = 0;
- up_write(&s->lock);
+ unsigned long chunk_size;
+ char *value;
+
+ chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
+ if (*chunk_size_arg == '\0' || *value != '\0') {
+ *error = "Invalid chunk size";
+ return -EINVAL;
+ }
- dm_table_event(s->table);
+ if (!chunk_size) {
+ s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
+ return 0;
}
+
+ /*
+ * Chunk size must be multiple of page size. Silently
+ * round up if it's not.
+ */
+ chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
+
+ /* Check chunk_size is a power of 2 */
+ if (chunk_size & (chunk_size - 1)) {
+ *error = "Chunk size is not a power of 2";
+ return -EINVAL;
+ }
+
+ /* Validate the chunk size against the device block size */
+ if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
+ *error = "Chunk size is not a multiple of device blocksize";
+ return -EINVAL;
+ }
+
+ s->chunk_size = chunk_size;
+ s->chunk_mask = chunk_size - 1;
+ s->chunk_shift = ffs(chunk_size) - 1;
+
+ return 0;
}
/*
@@ -404,15 +438,12 @@ static void read_snapshot_metadata(struct dm_snapshot *s)
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_snapshot *s;
- unsigned long chunk_size;
int r = -EINVAL;
char persistent;
char *origin_path;
char *cow_path;
- char *value;
- int blocksize;
- if (argc < 4) {
+ if (argc != 4) {
ti->error = "requires exactly 4 arguments";
r = -EINVAL;
goto bad1;
@@ -428,13 +459,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad1;
}
- chunk_size = simple_strtoul(argv[3], &value, 10);
- if (chunk_size == 0 || value == NULL) {
- ti->error = "Invalid chunk size";
- r = -EINVAL;
- goto bad1;
- }
-
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL) {
ti->error = "Cannot allocate snapshot context private "
@@ -457,36 +481,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad2;
}
- /*
- * Chunk size must be multiple of page size. Silently
- * round up if it's not.
- */
- chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
- /* Validate the chunk size against the device block size */
- blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
- if (chunk_size % (blocksize >> 9)) {
- ti->error = "Chunk size is not a multiple of device blocksize";
- r = -EINVAL;
- goto bad3;
- }
-
- /* Check chunk_size is a power of 2 */
- if (chunk_size & (chunk_size - 1)) {
- ti->error = "Chunk size is not a power of 2";
- r = -EINVAL;
+ r = set_chunk_size(s, argv[3], &ti->error);
+ if (r)
goto bad3;
- }
- s->chunk_size = chunk_size;
- s->chunk_mask = chunk_size - 1;
s->type = persistent;
- s->chunk_shift = ffs(chunk_size) - 1;
s->valid = 1;
s->active = 0;
s->last_percent = 0;
init_rwsem(&s->lock);
+ spin_lock_init(&s->pe_lock);
s->table = ti->table;
/* Allocate hash table for COW data */
@@ -496,16 +501,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad3;
}
- /*
- * Check the persistent flag - done here because we need the iobuf
- * to check the LV header
- */
s->store.snap = s;
if (persistent == 'P')
- r = dm_create_persistent(&s->store, chunk_size);
+ r = dm_create_persistent(&s->store);
else
- r = dm_create_transient(&s->store, s, blocksize);
+ r = dm_create_transient(&s->store);
if (r) {
ti->error = "Couldn't create exception store";
@@ -520,7 +521,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
/* Metadata must only be loaded into one table at once */
- read_snapshot_metadata(s);
+ r = s->store.read_metadata(&s->store);
+ if (r) {
+ ti->error = "Failed to read snapshot metadata";
+ goto bad6;
+ }
+
+ bio_list_init(&s->queued_bios);
+ INIT_WORK(&s->queued_bios_work, flush_queued_bios);
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -556,21 +564,28 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return r;
}
+static void __free_exceptions(struct dm_snapshot *s)
+{
+ kcopyd_client_destroy(s->kcopyd_client);
+ s->kcopyd_client = NULL;
+
+ exit_exception_table(&s->pending, pending_cache);
+ exit_exception_table(&s->complete, exception_cache);
+
+ s->store.destroy(&s->store);
+}
+
static void snapshot_dtr(struct dm_target *ti)
{
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+ flush_workqueue(ksnapd);
+
/* Prevent further origin writes from using this snapshot. */
/* After this returns there can be no new kcopyd jobs. */
unregister_snapshot(s);
- kcopyd_client_destroy(s->kcopyd_client);
-
- exit_exception_table(&s->pending, pending_cache);
- exit_exception_table(&s->complete, exception_cache);
-
- /* Deallocate memory used */
- s->store.destroy(&s->store);
+ __free_exceptions(s);
dm_put_device(ti, s->origin);
dm_put_device(ti, s->cow);
@@ -593,6 +608,20 @@ static void flush_bios(struct bio *bio)
}
}
+static void flush_queued_bios(struct work_struct *work)
+{
+ struct dm_snapshot *s =
+ container_of(work, struct dm_snapshot, queued_bios_work);
+ struct bio *queued_bios;
+ unsigned long flags;
+
+ spin_lock_irqsave(&s->pe_lock, flags);
+ queued_bios = bio_list_get(&s->queued_bios);
+ spin_unlock_irqrestore(&s->pe_lock, flags);
+
+ flush_bios(queued_bios);
+}
+
/*
* Error a list of buffers.
*/
@@ -608,28 +637,7 @@ static void error_bios(struct bio *bio)
}
}
-static inline void error_snapshot_bios(struct pending_exception *pe)
-{
- error_bios(bio_list_get(&pe->snapshot_bios));
-}
-
-static struct bio *__flush_bios(struct pending_exception *pe)
-{
- /*
- * If this pe is involved in a write to the origin and
- * it is the last sibling to complete then release
- * the bios for the original write to the origin.
- */
-
- if (pe->primary_pe &&
- atomic_dec_and_test(&pe->primary_pe->sibling_count))
- return bio_list_get(&pe->primary_pe->origin_bios);
-
- return NULL;
-}
-
-static void __invalidate_snapshot(struct dm_snapshot *s,
- struct pending_exception *pe, int err)
+static void __invalidate_snapshot(struct dm_snapshot *s, int err)
{
if (!s->valid)
return;
@@ -639,9 +647,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s,
else if (err == -ENOMEM)
DMERR("Invalidating snapshot: Unable to allocate exception.");
- if (pe)
- remove_exception(&pe->e);
-
if (s->store.drop_snapshot)
s->store.drop_snapshot(&s->store);
@@ -650,78 +655,95 @@ static void __invalidate_snapshot(struct dm_snapshot *s,
dm_table_event(s->table);
}
+static void get_pending_exception(struct pending_exception *pe)
+{
+ atomic_inc(&pe->ref_count);
+}
+
+static struct bio *put_pending_exception(struct pending_exception *pe)
+{
+ struct pending_exception *primary_pe;
+ struct bio *origin_bios = NULL;
+
+ primary_pe = pe->primary_pe;
+
+ /*
+ * If this pe is involved in a write to the origin and
+ * it is the last sibling to complete then release
+ * the bios for the original write to the origin.
+ */
+ if (primary_pe &&
+ atomic_dec_and_test(&primary_pe->ref_count))
+ origin_bios = bio_list_get(&primary_pe->origin_bios);
+
+ /*
+ * Free the pe if it's not linked to an origin write or if
+ * it's not itself a primary pe.
+ */
+ if (!primary_pe || primary_pe != pe)
+ free_pending_exception(pe);
+
+ /*
+ * Free the primary pe if nothing references it.
+ */
+ if (primary_pe && !atomic_read(&primary_pe->ref_count))
+ free_pending_exception(primary_pe);
+
+ return origin_bios;
+}
+
static void pending_complete(struct pending_exception *pe, int success)
{
struct exception *e;
- struct pending_exception *primary_pe;
struct dm_snapshot *s = pe->snap;
- struct bio *flush = NULL;
+ struct bio *origin_bios = NULL;
+ struct bio *snapshot_bios = NULL;
+ int error = 0;
if (!success) {
/* Read/write error - snapshot is unusable */
down_write(&s->lock);
- __invalidate_snapshot(s, pe, -EIO);
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
- error_snapshot_bios(pe);
+ __invalidate_snapshot(s, -EIO);
+ error = 1;
goto out;
}
e = alloc_exception();
if (!e) {
down_write(&s->lock);
- __invalidate_snapshot(s, pe, -ENOMEM);
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
- error_snapshot_bios(pe);
+ __invalidate_snapshot(s, -ENOMEM);
+ error = 1;
goto out;
}
*e = pe->e;
- /*
- * Add a proper exception, and remove the
- * in-flight exception from the list.
- */
down_write(&s->lock);
if (!s->valid) {
- flush = __flush_bios(pe);
- up_write(&s->lock);
-
free_exception(e);
-
- error_snapshot_bios(pe);
+ error = 1;
goto out;
}
+ /*
+ * Add a proper exception, and remove the
+ * in-flight exception from the list.
+ */
insert_exception(&s->complete, e);
+
+ out:
remove_exception(&pe->e);
- flush = __flush_bios(pe);
+ snapshot_bios = bio_list_get(&pe->snapshot_bios);
+ origin_bios = put_pending_exception(pe);
up_write(&s->lock);
/* Submit any pending write bios */
- flush_bios(bio_list_get(&pe->snapshot_bios));
-
- out:
- primary_pe = pe->primary_pe;
-
- /*
- * Free the pe if it's not linked to an origin write or if
- * it's not itself a primary pe.
- */
- if (!primary_pe || primary_pe != pe)
- free_pending_exception(pe);
-
- /*
- * Free the primary pe if nothing references it.
- */
- if (primary_pe && !atomic_read(&primary_pe->sibling_count))
- free_pending_exception(primary_pe);
+ if (error)
+ error_bios(snapshot_bios);
+ else
+ flush_bios(snapshot_bios);
- if (flush)
- flush_bios(flush);
+ flush_bios(origin_bios);
}
static void commit_callback(void *context, int success)
@@ -822,7 +844,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
bio_list_init(&pe->origin_bios);
bio_list_init(&pe->snapshot_bios);
pe->primary_pe = NULL;
- atomic_set(&pe->sibling_count, 1);
+ atomic_set(&pe->ref_count, 0);
pe->snap = s;
pe->started = 0;
@@ -831,6 +853,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
return NULL;
}
+ get_pending_exception(pe);
insert_exception(&s->pending, &pe->e);
out:
@@ -850,8 +873,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
{
struct exception *e;
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
- int copy_needed = 0;
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
struct pending_exception *pe = NULL;
@@ -865,32 +887,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
if (unlikely(bio_barrier(bio)))
return -EOPNOTSUPP;
+ /* FIXME: should only take write lock if we need
+ * to copy an exception */
+ down_write(&s->lock);
+
+ if (!s->valid) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* If the block is already remapped - use that, else remap it */
+ e = lookup_exception(&s->complete, chunk);
+ if (e) {
+ remap_exception(s, e, bio);
+ goto out_unlock;
+ }
+
/*
* Write to snapshot - higher level takes care of RW/RO
* flags so we should only get this if we are
* writeable.
*/
if (bio_rw(bio) == WRITE) {
-
- /* FIXME: should only take write lock if we need
- * to copy an exception */
- down_write(&s->lock);
-
- if (!s->valid) {
- r = -EIO;
- goto out_unlock;
- }
-
- /* If the block is already remapped - use that, else remap it */
- e = lookup_exception(&s->complete, chunk);
- if (e) {
- remap_exception(s, e, bio);
- goto out_unlock;
- }
-
pe = __find_pending_exception(s, bio);
if (!pe) {
- __invalidate_snapshot(s, pe, -ENOMEM);
+ __invalidate_snapshot(s, -ENOMEM);
r = -EIO;
goto out_unlock;
}
@@ -898,45 +919,27 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
remap_exception(s, &pe->e, bio);
bio_list_add(&pe->snapshot_bios, bio);
+ r = DM_MAPIO_SUBMITTED;
+
if (!pe->started) {
/* this is protected by snap->lock */
pe->started = 1;
- copy_needed = 1;
- }
-
- r = 0;
-
- out_unlock:
- up_write(&s->lock);
-
- if (copy_needed)
+ up_write(&s->lock);
start_copy(pe);
- } else {
+ goto out;
+ }
+ } else
/*
* FIXME: this read path scares me because we
* always use the origin when we have a pending
* exception. However I can't think of a
* situation where this is wrong - ejt.
*/
+ bio->bi_bdev = s->origin->bdev;
- /* Do reads */
- down_read(&s->lock);
-
- if (!s->valid) {
- up_read(&s->lock);
- return -EIO;
- }
-
- /* See if it it has been remapped */
- e = lookup_exception(&s->complete, chunk);
- if (e)
- remap_exception(s, e, bio);
- else
- bio->bi_bdev = s->origin->bdev;
-
- up_read(&s->lock);
- }
-
+ out_unlock:
+ up_write(&s->lock);
+ out:
return r;
}
@@ -994,7 +997,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
*---------------------------------------------------------------*/
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
- int r = 1, first = 0;
+ int r = DM_MAPIO_REMAPPED, first = 0;
struct dm_snapshot *snap;
struct exception *e;
struct pending_exception *pe, *next_pe, *primary_pe = NULL;
@@ -1025,7 +1028,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
* is already remapped in this snapshot
* and trigger an exception if not.
*
- * sibling_count is initialised to 1 so pending_complete()
+ * ref_count is initialised to 1 so pending_complete()
* won't destroy the primary_pe while we're inside this loop.
*/
e = lookup_exception(&snap->complete, chunk);
@@ -1034,7 +1037,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
pe = __find_pending_exception(snap, bio);
if (!pe) {
- __invalidate_snapshot(snap, pe, ENOMEM);
+ __invalidate_snapshot(snap, -ENOMEM);
goto next_snapshot;
}
@@ -1052,12 +1055,12 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
bio_list_add(&primary_pe->origin_bios, bio);
- r = 0;
+ r = DM_MAPIO_SUBMITTED;
}
if (!pe->primary_pe) {
- atomic_inc(&primary_pe->sibling_count);
pe->primary_pe = primary_pe;
+ get_pending_exception(primary_pe);
}
if (!pe->started) {
@@ -1070,20 +1073,20 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
}
if (!primary_pe)
- goto out;
+ return r;
/*
* If this is the first time we're processing this chunk and
- * sibling_count is now 1 it means all the pending exceptions
+ * ref_count is now 1 it means all the pending exceptions
* got completed while we were in the loop above, so it falls to
* us here to remove the primary_pe and submit any origin_bios.
*/
- if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
+ if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
flush_bios(bio_list_get(&primary_pe->origin_bios));
free_pending_exception(primary_pe);
/* If we got here, pe_queue is necessarily empty. */
- goto out;
+ return r;
}
/*
@@ -1092,7 +1095,6 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
start_copy(pe);
- out:
return r;
}
@@ -1102,7 +1104,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
struct origin *o;
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
@@ -1159,7 +1161,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
return -EOPNOTSUPP;
/* Only tell snapshots if this is a write */
- return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
+ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
}
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
@@ -1205,7 +1207,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
static struct target_type origin_target = {
.name = "snapshot-origin",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = origin_ctr,
.dtr = origin_dtr,
@@ -1216,7 +1218,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -1275,8 +1277,17 @@ static int __init dm_snapshot_init(void)
goto bad5;
}
+ ksnapd = create_singlethread_workqueue("ksnapd");
+ if (!ksnapd) {
+ DMERR("Failed to create ksnapd workqueue.");
+ r = -ENOMEM;
+ goto bad6;
+ }
+
return 0;
+ bad6:
+ mempool_destroy(pending_pool);
bad5:
kmem_cache_destroy(pending_cache);
bad4:
@@ -1294,6 +1305,8 @@ static void __exit dm_snapshot_exit(void)
{
int r;
+ destroy_workqueue(ksnapd);
+
r = dm_unregister_target(&snapshot_target);
if (r)
DMERR("snapshot unregister failed %d", r);
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index fdec1e2dc871..15fa2ae6cdc2 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -10,7 +10,9 @@
#define DM_SNAPSHOT_H
#include "dm.h"
+#include "dm-bio-list.h"
#include <linux/blkdev.h>
+#include <linux/workqueue.h>
struct exception_table {
uint32_t hash_mask;
@@ -112,10 +114,20 @@ struct dm_snapshot {
struct exception_table pending;
struct exception_table complete;
+ /*
+ * pe_lock protects all pending_exception operations and access
+ * as well as the snapshot_bios list.
+ */
+ spinlock_t pe_lock;
+
/* The on disk metadata handler */
struct exception_store store;
struct kcopyd_client *kcopyd_client;
+
+ /* Queue of snapshot writes for ksnapd to flush */
+ struct bio_list queued_bios;
+ struct work_struct queued_bios_work;
};
/*
@@ -128,10 +140,9 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
* Constructor and destructor for the default persistent
* store.
*/
-int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
+int dm_create_persistent(struct exception_store *store);
-int dm_create_transient(struct exception_store *store,
- struct dm_snapshot *s, int blocksize);
+int dm_create_transient(struct exception_store *store);
/*
* Return the number of sectors in the device.
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 6c29fcecd892..51f5e0760012 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -186,7 +186,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
bio->bi_sector = sc->stripe[stripe].physical_start +
(chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int stripe_status(struct dm_target *ti,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 75fe9493e6af..05befa91807a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -522,56 +522,61 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
return 0;
}
-
-int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
- sector_t len, int mode, struct dm_dev **result)
+void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
{
- int r = __table_get_device(ti->table, ti, path,
- start, len, mode, result);
- if (!r) {
- request_queue_t *q = bdev_get_queue((*result)->bdev);
- struct io_restrictions *rs = &ti->limits;
-
- /*
- * Combine the device limits low.
- *
- * FIXME: if we move an io_restriction struct
- * into q this would just be a call to
- * combine_restrictions_low()
- */
+ request_queue_t *q = bdev_get_queue(bdev);
+ struct io_restrictions *rs = &ti->limits;
+
+ /*
+ * Combine the device limits low.
+ *
+ * FIXME: if we move an io_restriction struct
+ * into q this would just be a call to
+ * combine_restrictions_low()
+ */
+ rs->max_sectors =
+ min_not_zero(rs->max_sectors, q->max_sectors);
+
+ /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
+ * currently doesn't honor MD's merge_bvec_fn routine.
+ * In this case, we'll force DM to use PAGE_SIZE or
+ * smaller I/O, just to be safe. A better fix is in the
+ * works, but add this for the time being so it will at
+ * least operate correctly.
+ */
+ if (q->merge_bvec_fn)
rs->max_sectors =
- min_not_zero(rs->max_sectors, q->max_sectors);
+ min_not_zero(rs->max_sectors,
+ (unsigned int) (PAGE_SIZE >> 9));
- /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
- * currently doesn't honor MD's merge_bvec_fn routine.
- * In this case, we'll force DM to use PAGE_SIZE or
- * smaller I/O, just to be safe. A better fix is in the
- * works, but add this for the time being so it will at
- * least operate correctly.
- */
- if (q->merge_bvec_fn)
- rs->max_sectors =
- min_not_zero(rs->max_sectors,
- (unsigned int) (PAGE_SIZE >> 9));
+ rs->max_phys_segments =
+ min_not_zero(rs->max_phys_segments,
+ q->max_phys_segments);
- rs->max_phys_segments =
- min_not_zero(rs->max_phys_segments,
- q->max_phys_segments);
+ rs->max_hw_segments =
+ min_not_zero(rs->max_hw_segments, q->max_hw_segments);
- rs->max_hw_segments =
- min_not_zero(rs->max_hw_segments, q->max_hw_segments);
+ rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
- rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
+ rs->max_segment_size =
+ min_not_zero(rs->max_segment_size, q->max_segment_size);
- rs->max_segment_size =
- min_not_zero(rs->max_segment_size, q->max_segment_size);
+ rs->seg_boundary_mask =
+ min_not_zero(rs->seg_boundary_mask,
+ q->seg_boundary_mask);
- rs->seg_boundary_mask =
- min_not_zero(rs->seg_boundary_mask,
- q->seg_boundary_mask);
+ rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
+}
+EXPORT_SYMBOL_GPL(dm_set_device_limits);
- rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
- }
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+ sector_t len, int mode, struct dm_dev **result)
+{
+ int r = __table_get_device(ti->table, ti, path,
+ start, len, mode, result);
+
+ if (!r)
+ dm_set_device_limits(ti, (*result)->bdev);
return r;
}
@@ -939,9 +944,20 @@ void dm_table_postsuspend_targets(struct dm_table *t)
return suspend_targets(t, 1);
}
-void dm_table_resume_targets(struct dm_table *t)
+int dm_table_resume_targets(struct dm_table *t)
{
- int i;
+ int i, r = 0;
+
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = t->targets + i;
+
+ if (!ti->type->preresume)
+ continue;
+
+ r = ti->type->preresume(ti);
+ if (r)
+ return r;
+ }
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = t->targets + i;
@@ -949,6 +965,8 @@ void dm_table_resume_targets(struct dm_table *t)
if (ti->type->resume)
ti->type->resume(ti);
}
+
+ return 0;
}
int dm_table_any_congested(struct dm_table *t, int bdi_bits)
@@ -983,6 +1001,11 @@ int dm_table_flush_all(struct dm_table *t)
{
struct list_head *d, *devices = dm_table_get_devices(t);
int ret = 0;
+ unsigned i;
+
+ for (i = 0; i < t->num_targets; i++)
+ if (t->targets[i].type->flush)
+ t->targets[i].type->flush(&t->targets[i]);
for (d = devices->next; d != devices; d = d->next) {
struct dm_dev *dd = list_entry(d, struct dm_dev, list);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index ea569f7348d2..f314d7dc9c26 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -46,7 +46,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio,
bio_endio(bio, bio->bi_size, 0);
/* accepted bio, don't make new request */
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
static struct target_type zero_target = {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99bf9f01759..fe7c56e10435 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
+#include <linux/smp_lock.h>
#define DM_MSG_PREFIX "core"
@@ -67,10 +68,12 @@ union map_info *dm_get_mapinfo(struct bio *bio)
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
struct mapped_device {
struct rw_semaphore io_lock;
struct semaphore suspend_lock;
+ spinlock_t pushback_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
@@ -88,7 +91,8 @@ struct mapped_device {
*/
atomic_t pending;
wait_queue_head_t wait;
- struct bio_list deferred;
+ struct bio_list deferred;
+ struct bio_list pushback;
/*
* The current mapping.
@@ -101,6 +105,8 @@ struct mapped_device {
mempool_t *io_pool;
mempool_t *tio_pool;
+ struct bio_set *bs;
+
/*
* Event handling.
*/
@@ -118,19 +124,13 @@ struct mapped_device {
};
#define MIN_IOS 256
-static kmem_cache_t *_io_cache;
-static kmem_cache_t *_tio_cache;
-
-static struct bio_set *dm_set;
+static struct kmem_cache *_io_cache;
+static struct kmem_cache *_tio_cache;
static int __init local_init(void)
{
int r;
- dm_set = bioset_create(16, 16, 4);
- if (!dm_set)
- return -ENOMEM;
-
/* allocate a slab for the dm_ios */
_io_cache = kmem_cache_create("dm_io",
sizeof(struct dm_io), 0, 0, NULL, NULL);
@@ -164,8 +164,6 @@ static void local_exit(void)
kmem_cache_destroy(_tio_cache);
kmem_cache_destroy(_io_cache);
- bioset_free(dm_set);
-
if (unregister_blkdev(_major, _name) < 0)
DMERR("unregister_blkdev failed");
@@ -288,6 +286,45 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
+static int dm_blk_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct mapped_device *md;
+ struct dm_table *map;
+ struct dm_target *tgt;
+ int r = -ENOTTY;
+
+ /* We don't really need this lock, but we do need 'inode'. */
+ unlock_kernel();
+
+ md = inode->i_bdev->bd_disk->private_data;
+
+ map = dm_get_table(md);
+
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ /* We only support devices that have a single target */
+ if (dm_table_get_num_targets(map) != 1)
+ goto out;
+
+ tgt = dm_table_get_target(map, 0);
+
+ if (dm_suspended(md)) {
+ r = -EAGAIN;
+ goto out;
+ }
+
+ if (tgt->type->ioctl)
+ r = tgt->type->ioctl(tgt, inode, file, cmd, arg);
+
+out:
+ dm_table_put(map);
+
+ lock_kernel();
+ return r;
+}
+
static inline struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -410,23 +447,50 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
+static int __noflush_suspending(struct mapped_device *md)
+{
+ return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+}
+
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
static void dec_pending(struct dm_io *io, int error)
{
- if (error)
+ unsigned long flags;
+
+ /* Push-back supersedes any I/O errors */
+ if (error && !(io->error > 0 && __noflush_suspending(io->md)))
io->error = error;
if (atomic_dec_and_test(&io->io_count)) {
+ if (io->error == DM_ENDIO_REQUEUE) {
+ /*
+ * Target requested pushing back the I/O.
+ * This must be handled before the sleeper on
+ * suspend queue merges the pushback list.
+ */
+ spin_lock_irqsave(&io->md->pushback_lock, flags);
+ if (__noflush_suspending(io->md))
+ bio_list_add(&io->md->pushback, io->bio);
+ else
+ /* noflush suspend was interrupted. */
+ io->error = -EIO;
+ spin_unlock_irqrestore(&io->md->pushback_lock, flags);
+ }
+
if (end_io_acct(io))
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);
- blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+ if (io->error != DM_ENDIO_REQUEUE) {
+ blk_add_trace_bio(io->md->queue, io->bio,
+ BLK_TA_COMPLETE);
+
+ bio_endio(io->bio, io->bio->bi_size, io->error);
+ }
- bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
}
@@ -435,7 +499,7 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
{
int r = 0;
struct target_io *tio = bio->bi_private;
- struct dm_io *io = tio->io;
+ struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
if (bio->bi_size)
@@ -446,17 +510,30 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
if (endio) {
r = endio(tio->ti, bio, error, &tio->info);
- if (r < 0)
+ if (r < 0 || r == DM_ENDIO_REQUEUE)
+ /*
+ * error and requeue request are handled
+ * in dec_pending().
+ */
error = r;
-
- else if (r > 0)
- /* the target wants another shot at the io */
+ else if (r == DM_ENDIO_INCOMPLETE)
+ /* The target will handle the io */
return 1;
+ else if (r) {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
}
- free_tio(io->md, tio);
- dec_pending(io, error);
+ dec_pending(tio->io, error);
+
+ /*
+ * Store md for cleanup instead of tio which is about to get freed.
+ */
+ bio->bi_private = md->bs;
+
bio_put(bio);
+ free_tio(md, tio);
return r;
}
@@ -485,6 +562,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
{
int r;
sector_t sector;
+ struct mapped_device *md;
/*
* Sanity checks.
@@ -502,7 +580,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
- if (r > 0) {
+ if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
@@ -510,14 +588,19 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
clone->bi_sector);
generic_make_request(clone);
- }
-
- else if (r < 0) {
- /* error the io and bail out */
- struct dm_io *io = tio->io;
- free_tio(tio->io->md, tio);
- dec_pending(io, r);
+ } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
+ /* error the io and bail out, or requeue it if needed */
+ md = tio->io->md;
+ dec_pending(tio->io, r);
+ /*
+ * Store bio_set for cleanup.
+ */
+ clone->bi_private = md->bs;
bio_put(clone);
+ free_tio(md, tio);
+ } else if (r) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
}
}
@@ -533,7 +616,9 @@ struct clone_info {
static void dm_bio_destructor(struct bio *bio)
{
- bio_free(bio, dm_set);
+ struct bio_set *bs = bio->bi_private;
+
+ bio_free(bio, bs);
}
/*
@@ -541,12 +626,12 @@ static void dm_bio_destructor(struct bio *bio)
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
- unsigned int len)
+ unsigned int len, struct bio_set *bs)
{
struct bio *clone;
struct bio_vec *bv = bio->bi_io_vec + idx;
- clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
+ clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
clone->bi_destructor = dm_bio_destructor;
*clone->bi_io_vec = *bv;
@@ -566,11 +651,13 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
*/
static struct bio *clone_bio(struct bio *bio, sector_t sector,
unsigned short idx, unsigned short bv_count,
- unsigned int len)
+ unsigned int len, struct bio_set *bs)
{
struct bio *clone;
- clone = bio_clone(bio, GFP_NOIO);
+ clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
+ __bio_clone(clone, bio);
+ clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_vcnt = idx + bv_count;
@@ -601,7 +688,8 @@ static void __clone_and_map(struct clone_info *ci)
* the remaining io with a single clone.
*/
clone = clone_bio(bio, ci->sector, ci->idx,
- bio->bi_vcnt - ci->idx, ci->sector_count);
+ bio->bi_vcnt - ci->idx, ci->sector_count,
+ ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector_count = 0;
@@ -624,7 +712,8 @@ static void __clone_and_map(struct clone_info *ci)
len += bv_len;
}
- clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
+ clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
+ ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector += len;
@@ -653,7 +742,8 @@ static void __clone_and_map(struct clone_info *ci)
len = min(remaining, max);
clone = split_bvec(bio, ci->sector, ci->idx,
- bv->bv_offset + offset, len);
+ bv->bv_offset + offset, len,
+ ci->md->bs);
__map_bio(ti, clone, tio);
@@ -896,6 +986,7 @@ static struct mapped_device *alloc_dev(int minor)
memset(md, 0, sizeof(*md));
init_rwsem(&md->io_lock);
init_MUTEX(&md->suspend_lock);
+ spin_lock_init(&md->pushback_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -903,7 +994,7 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
- goto bad1;
+ goto bad1_free_minor;
md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
@@ -914,13 +1005,17 @@ static struct mapped_device *alloc_dev(int minor)
md->queue->issue_flush_fn = dm_flush_all;
md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
- if (!md->io_pool)
- goto bad2;
+ if (!md->io_pool)
+ goto bad2;
md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool)
goto bad3;
+ md->bs = bioset_create(16, 16, 4);
+ if (!md->bs)
+ goto bad_no_bioset;
+
md->disk = alloc_disk(1);
if (!md->disk)
goto bad4;
@@ -948,11 +1043,14 @@ static struct mapped_device *alloc_dev(int minor)
return md;
bad4:
+ bioset_free(md->bs);
+ bad_no_bioset:
mempool_destroy(md->tio_pool);
bad3:
mempool_destroy(md->io_pool);
bad2:
blk_cleanup_queue(md->queue);
+ bad1_free_minor:
free_minor(minor);
bad1:
module_put(THIS_MODULE);
@@ -971,6 +1069,7 @@ static void free_dev(struct mapped_device *md)
}
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
+ bioset_free(md->bs);
del_gendisk(md->disk);
free_minor(minor);
@@ -1215,20 +1314,30 @@ static void unlock_fs(struct mapped_device *md)
* dm_bind_table, dm_suspend must be called to flush any in
* flight bios and ensure that any further io gets deferred.
*/
-int dm_suspend(struct mapped_device *md, int do_lockfs)
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
+ unsigned long flags;
DECLARE_WAITQUEUE(wait, current);
struct bio *def;
int r = -EINVAL;
+ int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
+ int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
down(&md->suspend_lock);
if (dm_suspended(md))
- goto out;
+ goto out_unlock;
map = dm_get_table(md);
+ /*
+ * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
+ * This flag is cleared before dm_suspend returns.
+ */
+ if (noflush)
+ set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+
/* This does not get reverted if there's an error later. */
dm_table_presuspend_targets(map);
@@ -1236,11 +1345,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
if (!md->suspended_bdev) {
DMWARN("bdget failed in dm_suspend");
r = -ENOMEM;
- goto out;
+ goto flush_and_out;
}
- /* Flush I/O to the device. */
- if (do_lockfs) {
+ /*
+ * Flush I/O to the device.
+ * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
+ */
+ if (do_lockfs && !noflush) {
r = lock_fs(md);
if (r)
goto out;
@@ -1276,6 +1388,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
down_write(&md->io_lock);
remove_wait_queue(&md->wait, &wait);
+ if (noflush) {
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+ }
+
/* were we interrupted ? */
r = -EINTR;
if (atomic_read(&md->pending)) {
@@ -1284,7 +1404,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
__flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md);
- goto out;
+ goto out; /* pushback list is already flushed, so skip flush */
}
up_write(&md->io_lock);
@@ -1294,6 +1414,25 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
r = 0;
+flush_and_out:
+ if (r && noflush) {
+ /*
+ * Because there may be already I/Os in the pushback list,
+ * flush them before return.
+ */
+ down_write(&md->io_lock);
+
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+
+ def = bio_list_get(&md->deferred);
+ __flush_deferred_io(md, def);
+ up_write(&md->io_lock);
+ }
+
out:
if (r && md->suspended_bdev) {
bdput(md->suspended_bdev);
@@ -1301,6 +1440,8 @@ out:
}
dm_table_put(map);
+
+out_unlock:
up(&md->suspend_lock);
return r;
}
@@ -1319,7 +1460,9 @@ int dm_resume(struct mapped_device *md)
if (!map || !dm_table_get_size(map))
goto out;
- dm_table_resume_targets(map);
+ r = dm_table_resume_targets(map);
+ if (r)
+ goto out;
down_write(&md->io_lock);
clear_bit(DMF_BLOCK_IO, &md->flags);
@@ -1337,6 +1480,8 @@ int dm_resume(struct mapped_device *md)
dm_table_unplug_all(map);
+ kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);
+
r = 0;
out:
@@ -1374,9 +1519,21 @@ int dm_suspended(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_noflush_suspending(struct dm_target *ti)
+{
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ int r = __noflush_suspending(md);
+
+ dm_put(md);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
+ .ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
.owner = THIS_MODULE
};
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 3c03c0ecab7e..2f796b1436b2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -21,6 +21,11 @@
#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
+#ifdef CONFIG_DM_DEBUG
+# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg)
+#else
+# define DMDEBUG(f, arg...) do {} while (0)
+#endif
#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
0 : scnprintf(result + sz, maxlen - sz, x))
@@ -28,6 +33,25 @@
#define SECTOR_SHIFT 9
/*
+ * Definitions of return values from target end_io function.
+ */
+#define DM_ENDIO_INCOMPLETE 1
+#define DM_ENDIO_REQUEUE 2
+
+/*
+ * Definitions of return values from target map function.
+ */
+#define DM_MAPIO_SUBMITTED 0
+#define DM_MAPIO_REMAPPED 1
+#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
+
+/*
+ * Suspend feature flags
+ */
+#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
+#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
+
+/*
* List of devices that a metadevice uses and should open/close.
*/
struct dm_dev {
@@ -52,7 +76,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
-void dm_table_resume_targets(struct dm_table *t);
+int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
void dm_table_unplug_all(struct dm_table *t);
int dm_table_flush_all(struct dm_table *t);
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index f1db6eff4857..b46f6c575f7e 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -203,7 +203,7 @@ struct kcopyd_job {
/* FIXME: this should scale with the number of pages */
#define MIN_JOBS 512
-static kmem_cache_t *_job_cache;
+static struct kmem_cache *_job_cache;
static mempool_t *_job_pool;
/*
@@ -417,7 +417,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
/*
* kcopyd does this every time it's woken up.
*/
-static void do_work(void *ignored)
+static void do_work(struct work_struct *ignored)
{
/*
* The order that these are called is *very* important.
@@ -628,7 +628,7 @@ static int kcopyd_init(void)
}
kcopyd_clients++;
- INIT_WORK(&_kcopyd_work, do_work, NULL);
+ INIT_WORK(&_kcopyd_work, do_work);
mutex_unlock(&kcopyd_init_lock);
return 0;
}
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b99c19c7eb22..c625ddb8833d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -111,6 +111,19 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int linear_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ linear_conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ return ret;
+}
+
static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
{
linear_conf_t *conf;
@@ -269,6 +282,8 @@ static int linear_run (mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
mddev->queue->issue_flush_fn = linear_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = linear_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8dbab2ef3885..21e2a7b08841 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -39,10 +39,10 @@
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
-#include <linux/suspend.h>
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
+#include <linux/freezer.h>
#include <linux/init.h>
@@ -389,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error)
if (bio->bi_size)
return 1;
- if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk("md: super_written gets error=%d, uptodate=%d\n",
+ error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+ WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
md_error(mddev, rdev);
+ }
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
@@ -970,12 +974,13 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
* version 1 superblock
*/
-static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
- unsigned int disk_csum, csum;
+ __le32 disk_csum;
+ u32 csum;
unsigned long long newcsum;
int size = 256 + le32_to_cpu(sb->max_dev)*2;
- unsigned int *isuper = (unsigned int*)sb;
+ __le32 *isuper = (__le32*)sb;
int i;
disk_csum = sb->sb_csum;
@@ -985,7 +990,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
newcsum += le32_to_cpu(*isuper++);
if (size == 2)
- newcsum += le16_to_cpu(*(unsigned short*) isuper);
+ newcsum += le16_to_cpu(*(__le16*) isuper);
csum = (newcsum & 0xffffffff) + (newcsum >> 32);
sb->sb_csum = disk_csum;
@@ -1102,7 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (le32_to_cpu(sb->chunksize))
rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
- if (le32_to_cpu(sb->size) > rdev->size*2)
+ if (le64_to_cpu(sb->size) > rdev->size*2)
return -EINVAL;
return ret;
}
@@ -1224,7 +1229,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
else
sb->resync_offset = cpu_to_le64(0);
- sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+ sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
sb->raid_disks = cpu_to_le32(mddev->raid_disks);
sb->size = cpu_to_le64(mddev->size<<1);
@@ -1408,7 +1413,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
if (IS_ERR(bdev)) {
printk(KERN_ERR "md: could not open %s.\n",
__bdevname(dev, b));
@@ -1418,7 +1423,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
if (err) {
printk(KERN_ERR "md: could not bd_claim %s.\n",
bdevname(bdev, b));
- blkdev_put_partition(bdev);
+ blkdev_put(bdev);
return err;
}
rdev->bdev = bdev;
@@ -1432,7 +1437,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
if (!bdev)
MD_BUG();
bd_release(bdev);
- blkdev_put_partition(bdev);
+ blkdev_put(bdev);
}
void md_autodetect_dev(dev_t dev);
@@ -1587,7 +1592,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
}
}
-void md_update_sb(mddev_t * mddev)
+static void md_update_sb(mddev_t * mddev, int force_change)
{
int err;
struct list_head *tmp;
@@ -1598,7 +1603,18 @@ void md_update_sb(mddev_t * mddev)
repeat:
spin_lock_irq(&mddev->write_lock);
- if (mddev->degraded && mddev->sb_dirty == 3)
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ /* just a clean<-> dirty transition, possibly leave spares alone,
+ * though if events isn't the right even/odd, we will have to do
+ * spares after all
+ */
+ nospares = 1;
+ if (force_change)
+ nospares = 0;
+ if (mddev->degraded)
/* If the array is degraded, then skipping spares is both
* dangerous and fairly pointless.
* Dangerous because a device that was removed from the array
@@ -1608,20 +1624,14 @@ repeat:
* then a recovery will happen and soon that array won't
* be degraded any more and the spare can go back to sleep then.
*/
- mddev->sb_dirty = 1;
+ nospares = 0;
sync_req = mddev->in_sync;
mddev->utime = get_seconds();
- if (mddev->sb_dirty == 3)
- /* just a clean<-> dirty transition, possibly leave spares alone,
- * though if events isn't the right even/odd, we will have to do
- * spares after all
- */
- nospares = 1;
/* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */
- if (mddev->sb_dirty == 3
+ if (nospares
&& (mddev->in_sync && mddev->recovery_cp == MaxSector)
&& (mddev->events & 1))
mddev->events--;
@@ -1652,7 +1662,6 @@ repeat:
MD_BUG();
mddev->events --;
}
- mddev->sb_dirty = 2;
sync_sbs(mddev, nospares);
/*
@@ -1660,7 +1669,7 @@ repeat:
* nonpersistent superblocks
*/
if (!mddev->persistent) {
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
return;
@@ -1697,20 +1706,20 @@ repeat:
break;
}
md_super_wait(mddev);
- /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+ /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
spin_lock_irq(&mddev->write_lock);
- if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+ if (mddev->in_sync != sync_req ||
+ test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
/* have to write it out again */
spin_unlock_irq(&mddev->write_lock);
goto repeat;
}
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
}
-EXPORT_SYMBOL_GPL(md_update_sb);
/* words written to sysfs files may, or my not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
@@ -1783,7 +1792,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
else {
mddev_t *mddev = rdev->mddev;
kick_rdev_from_array(rdev);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
md_new_event(mddev);
err = 0;
}
@@ -1994,6 +2003,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
kobject_init(&rdev->kobj);
rdev->desc_nr = -1;
+ rdev->saved_raid_disk = -1;
rdev->flags = 0;
rdev->data_offset = 0;
rdev->sb_events = 0;
@@ -2426,7 +2436,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
spin_lock_irq(&mddev->write_lock);
if (atomic_read(&mddev->writes_pending) == 0) {
mddev->in_sync = 1;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
spin_unlock_irq(&mddev->write_lock);
} else {
@@ -2438,7 +2448,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case active:
if (mddev->pers) {
restart_array(mddev);
- mddev->sb_dirty = 0;
+ clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
@@ -2520,6 +2530,36 @@ static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
static ssize_t
+bitmap_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *end;
+ unsigned long chunk, end_chunk;
+
+ if (!mddev->bitmap)
+ goto out;
+ /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+ while (*buf) {
+ chunk = end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ if (*end == '-') { /* range */
+ buf = end + 1;
+ end_chunk = simple_strtoul(buf, &end, 0);
+ if (buf == end) break;
+ }
+ if (*end && !isspace(*end)) break;
+ bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+ buf = end;
+ while (isspace(*buf)) buf++;
+ }
+ bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+ return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
+
+static ssize_t
size_show(mddev_t *mddev, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
@@ -2543,7 +2583,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
if (mddev->pers) {
err = update_size(mddev, size);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
} else {
if (mddev->size == 0 ||
mddev->size > size)
@@ -2839,6 +2879,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_completed.attr,
&md_suspend_lo.attr,
&md_suspend_hi.attr,
+ &md_bitmap.attr,
NULL,
};
static struct attribute_group md_redundancy_group = {
@@ -3111,8 +3152,8 @@ static int do_md_run(mddev_t * mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->flags)
+ md_update_sb(mddev, 0);
set_capacity(disk, mddev->array_size<<1);
@@ -3159,6 +3200,7 @@ static int do_md_run(mddev_t * mddev)
mddev->changed = 1;
md_new_event(mddev);
+ kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
return 0;
}
@@ -3272,13 +3314,17 @@ static int do_md_stop(mddev_t * mddev, int mode)
module_put(mddev->pers->owner);
mddev->pers = NULL;
+
+ set_capacity(disk, 0);
+ mddev->changed = 1;
+
if (mddev->ro)
mddev->ro = 0;
}
- if (!mddev->in_sync || mddev->sb_dirty) {
+ if (!mddev->in_sync || mddev->flags) {
/* mark array as shutdown cleanly */
mddev->in_sync = 1;
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
}
if (mode == 1)
set_disk_ro(disk, 1);
@@ -3291,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
if (mode == 0) {
mdk_rdev_t *rdev;
struct list_head *tmp;
- struct gendisk *disk;
+
printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
bitmap_destroy(mddev);
@@ -3316,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, int mode)
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
- disk = mddev->gendisk;
- if (disk)
- set_capacity(disk, 0);
- mddev->changed = 1;
} else if (mddev->pers)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
mdname(mddev));
@@ -3329,6 +3371,7 @@ out:
return err;
}
+#ifndef MODULE
static void autorun_array(mddev_t *mddev)
{
mdk_rdev_t *rdev;
@@ -3374,6 +3417,7 @@ static void autorun_devices(int part)
printk(KERN_INFO "md: autorun ...\n");
while (!list_empty(&pending_raid_disks)) {
+ int unit;
dev_t dev;
LIST_HEAD(candidates);
rdev0 = list_entry(pending_raid_disks.next,
@@ -3393,16 +3437,19 @@ static void autorun_devices(int part)
* mostly sane superblocks. It's time to allocate the
* mddev.
*/
- if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+ if (part) {
+ dev = MKDEV(mdp_major,
+ rdev0->preferred_minor << MdpMinorShift);
+ unit = MINOR(dev) >> MdpMinorShift;
+ } else {
+ dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+ unit = MINOR(dev);
+ }
+ if (rdev0->preferred_minor != unit) {
printk(KERN_INFO "md: unit number in %s is bad: %d\n",
bdevname(rdev0->bdev, b), rdev0->preferred_minor);
break;
}
- if (part)
- dev = MKDEV(mdp_major,
- rdev0->preferred_minor << MdpMinorShift);
- else
- dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
md_probe(dev, NULL, NULL);
mddev = mddev_find(dev);
@@ -3439,67 +3486,7 @@ static void autorun_devices(int part)
}
printk(KERN_INFO "md: ... autorun DONE.\n");
}
-
-/*
- * import RAID devices based on one partition
- * if possible, the array gets run as well.
- */
-
-static int autostart_array(dev_t startdev)
-{
- char b[BDEVNAME_SIZE];
- int err = -EINVAL, i;
- mdp_super_t *sb = NULL;
- mdk_rdev_t *start_rdev = NULL, *rdev;
-
- start_rdev = md_import_device(startdev, 0, 0);
- if (IS_ERR(start_rdev))
- return err;
-
-
- /* NOTE: this can only work for 0.90.0 superblocks */
- sb = (mdp_super_t*)page_address(start_rdev->sb_page);
- if (sb->major_version != 0 ||
- sb->minor_version != 90 ) {
- printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
- export_rdev(start_rdev);
- return err;
- }
-
- if (test_bit(Faulty, &start_rdev->flags)) {
- printk(KERN_WARNING
- "md: can not autostart based on faulty %s!\n",
- bdevname(start_rdev->bdev,b));
- export_rdev(start_rdev);
- return err;
- }
- list_add(&start_rdev->same_set, &pending_raid_disks);
-
- for (i = 0; i < MD_SB_DISKS; i++) {
- mdp_disk_t *desc = sb->disks + i;
- dev_t dev = MKDEV(desc->major, desc->minor);
-
- if (!dev)
- continue;
- if (dev == startdev)
- continue;
- if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
- continue;
- rdev = md_import_device(dev, 0, 0);
- if (IS_ERR(rdev))
- continue;
-
- list_add(&rdev->same_set, &pending_raid_disks);
- }
-
- /*
- * possibly return codes
- */
- autorun_devices(0);
- return 0;
-
-}
-
+#endif /* !MODULE */
static int get_version(void __user * arg)
{
@@ -3737,6 +3724,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (err)
export_rdev(rdev);
+ md_update_sb(mddev, 1);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return err;
@@ -3808,7 +3796,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
goto busy;
kick_rdev_from_array(rdev);
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
md_new_event(mddev);
return 0;
@@ -3867,6 +3855,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
+ rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
goto abort_export;
@@ -3885,7 +3874,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
rdev->raid_disk = -1;
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
/*
* Kick recovery, maybe this spare has to be added to the
@@ -4016,7 +4005,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->max_disks = MD_SB_DISKS;
- mddev->sb_dirty = 1;
+ mddev->flags = 0;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
mddev->bitmap_offset = 0;
@@ -4059,11 +4049,8 @@ static int update_size(mddev_t *mddev, unsigned long size)
return -EBUSY;
ITERATE_RDEV(mddev,rdev,tmp) {
sector_t avail;
- if (rdev->sb_offset > rdev->data_offset)
- avail = (rdev->sb_offset*2) - rdev->data_offset;
- else
- avail = get_capacity(rdev->bdev->bd_disk)
- - rdev->data_offset;
+ avail = rdev->size * 2;
+
if (fit && (size == 0 || size > avail/2))
size = avail/2;
if (avail < ((sector_t)size << 1))
@@ -4185,7 +4172,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
mddev->bitmap_offset = 0;
}
}
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
return rv;
}
@@ -4259,27 +4246,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto abort;
}
-
- if (cmd == START_ARRAY) {
- /* START_ARRAY doesn't need to lock the array as autostart_array
- * does the locking, and it could even be a different array
- */
- static int cnt = 3;
- if (cnt > 0 ) {
- printk(KERN_WARNING
- "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
- "This will not be supported beyond July 2006\n",
- current->comm, current->pid);
- cnt--;
- }
- err = autostart_array(new_decode_dev(arg));
- if (err) {
- printk(KERN_WARNING "md: autostart failed!\n");
- goto abort;
- }
- goto done;
- }
-
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO
@@ -4460,7 +4426,7 @@ static int md_open(struct inode *inode, struct file *file)
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
int err;
- if ((err = mddev_lock(mddev)))
+ if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
goto out;
err = 0;
@@ -4476,8 +4442,7 @@ static int md_release(struct inode *inode, struct file * file)
{
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
- if (!mddev)
- BUG();
+ BUG_ON(!mddev);
mddev_put(mddev);
return 0;
@@ -4524,6 +4489,7 @@ static int md_thread(void * arg)
* many dirty RAID5 blocks.
*/
+ current->flags |= PF_NOFREEZE;
allow_signal(SIGKILL);
while (!kthread_should_stop()) {
@@ -4540,7 +4506,6 @@ static int md_thread(void * arg)
test_bit(THREAD_WAKEUP, &thread->flags)
|| kthread_should_stop(),
thread->timeout);
- try_to_freeze();
clear_bit(THREAD_WAKEUP, &thread->flags);
@@ -4687,9 +4652,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
(test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
"reshape" :
- (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
- "resync" : "recovery")),
- per_milli/10, per_milli % 10,
+ (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
+ "check" :
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"))),
+ per_milli/10, per_milli % 10,
(unsigned long long) resync,
(unsigned long long) max_blocks);
@@ -4882,8 +4849,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
chunk_kb ? "KB" : "B");
if (bitmap->file) {
seq_printf(seq, ", file: ");
- seq_path(seq, bitmap->file->f_vfsmnt,
- bitmap->file->f_dentry," \t\n");
+ seq_path(seq, bitmap->file->f_path.mnt,
+ bitmap->file->f_path.dentry," \t\n");
}
seq_printf(seq, "\n");
@@ -4948,6 +4915,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
}
static struct file_operations md_seq_fops = {
+ .owner = THIS_MODULE,
.open = md_seq_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -5042,12 +5010,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
- mddev->sb_dirty = 3;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
md_wakeup_thread(mddev->thread);
}
spin_unlock_irq(&mddev->write_lock);
}
- wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+ wait_event(mddev->sb_wait, mddev->flags==0);
}
void md_write_end(mddev_t *mddev)
@@ -5078,6 +5046,7 @@ void md_do_sync(mddev_t *mddev)
int skipped = 0;
struct list_head *rtmp;
mdk_rdev_t *rdev;
+ char *desc;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -5085,6 +5054,18 @@ void md_do_sync(mddev_t *mddev)
if (mddev->ro) /* never try to sync a read-only array */
return;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ desc = "data-check";
+ else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ desc = "requested-resync";
+ else
+ desc = "resync";
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ desc = "reshape";
+ else
+ desc = "recovery";
+
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
@@ -5128,10 +5109,10 @@ void md_do_sync(mddev_t *mddev)
prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
if (!kthread_should_stop() &&
mddev2->curr_resync >= mddev->curr_resync) {
- printk(KERN_INFO "md: delaying resync of %s"
- " until %s has finished resync (they"
+ printk(KERN_INFO "md: delaying %s of %s"
+ " until %s has finished (they"
" share one or more physical units)\n",
- mdname(mddev), mdname(mddev2));
+ desc, mdname(mddev), mdname(mddev2));
mddev_put(mddev2);
schedule();
finish_wait(&resync_wait, &wq);
@@ -5167,12 +5148,12 @@ void md_do_sync(mddev_t *mddev)
j = rdev->recovery_offset;
}
- printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
- " %d KB/sec/disc.\n", speed_min(mddev));
+ printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ speed:"
+ " %d KB/sec/disk.\n", speed_min(mddev));
printk(KERN_INFO "md: using maximum available idle IO bandwidth "
- "(but not more than %d KB/sec) for reconstruction.\n",
- speed_max(mddev));
+ "(but not more than %d KB/sec) for %s.\n",
+ speed_max(mddev), desc);
is_mddev_idle(mddev); /* this also initializes IO event counters */
@@ -5198,8 +5179,8 @@ void md_do_sync(mddev_t *mddev)
if (j>2) {
printk(KERN_INFO
- "md: resuming recovery of %s from checkpoint.\n",
- mdname(mddev));
+ "md: resuming %s of %s from checkpoint.\n",
+ desc, mdname(mddev));
mddev->curr_resync = j;
}
@@ -5282,7 +5263,7 @@ void md_do_sync(mddev_t *mddev)
}
}
}
- printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+ printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
/*
* this also signals 'finished resyncing' to md_stop
*/
@@ -5295,15 +5276,14 @@ void md_do_sync(mddev_t *mddev)
mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
printk(KERN_INFO
- "md: checkpointing recovery of %s.\n",
- mdname(mddev));
+ "md: checkpointing %s of %s.\n",
+ desc, mdname(mddev));
mddev->recovery_cp = mddev->curr_resync;
}
} else
@@ -5317,9 +5297,9 @@ void md_do_sync(mddev_t *mddev)
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
rdev->recovery_offset = mddev->curr_resync;
- mddev->sb_dirty = 1;
}
}
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
skip:
mddev->curr_resync = 0;
@@ -5374,7 +5354,7 @@ void md_check_recovery(mddev_t *mddev)
}
if ( ! (
- mddev->sb_dirty ||
+ mddev->flags ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->safemode == 1) ||
@@ -5390,14 +5370,14 @@ void md_check_recovery(mddev_t *mddev)
if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
- mddev->sb_dirty = 3;
+ set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
spin_unlock_irq(&mddev->write_lock);
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->flags)
+ md_update_sb(mddev, 0);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -5416,7 +5396,7 @@ void md_check_recovery(mddev_t *mddev)
/* activate any spares */
mddev->pers->spare_active(mddev);
}
- md_update_sb(mddev);
+ md_update_sb(mddev, 1);
/* if array is no-longer degraded, then any saved_raid_disk
* information must be scrapped
@@ -5556,22 +5536,15 @@ static void md_geninit(void)
static int __init md_init(void)
{
- printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
- " MD_SB_DISKS=%d\n",
- MD_MAJOR_VERSION, MD_MINOR_VERSION,
- MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
- printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
- BITMAP_MINOR);
-
if (register_blkdev(MAJOR_NR, "md"))
return -1;
if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
unregister_blkdev(MAJOR_NR, "md");
return -1;
}
- blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
- md_probe, NULL, NULL);
- blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+ blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
register_reboot_notifier(&md_notifier);
@@ -5623,15 +5596,15 @@ static void autostart_arrays(int part)
autorun_devices(part);
}
-#endif
+#endif /* !MODULE */
static __exit void md_exit(void)
{
mddev_t *mddev;
struct list_head *tmp;
- blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
- blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
+ blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+ blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
unregister_blkdev(MAJOR_NR,"md");
unregister_blkdev(mdp_major, "mdp");
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1cc9de44ce86..14da37fee37b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -228,6 +228,28 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
rcu_read_unlock();
return ret;
}
+static int multipath_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ multipath_conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks ; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ /* Just like multipath_map, we just check the
+ * first available device
+ */
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
/*
* Careful, this can execute in IRQ contexts as well!
@@ -253,8 +275,9 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
char b[BDEVNAME_SIZE];
clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
conf->working_disks--;
+ mddev->degraded++;
printk(KERN_ALERT "multipath: IO failure on %s,"
" disabling IO path. \n Operation continuing"
" on %d IO paths.\n",
@@ -314,6 +337,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
conf->working_disks++;
+ mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
@@ -470,7 +494,6 @@ static int multipath_run (mddev_t *mddev)
}
conf->raid_disks = mddev->raid_disks;
- mddev->sb_dirty = 1;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
@@ -480,7 +503,7 @@ static int multipath_run (mddev_t *mddev)
mdname(mddev));
goto out_free_conf;
}
- mddev->degraded = conf->raid_disks = conf->working_disks;
+ mddev->degraded = conf->raid_disks - conf->working_disks;
conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
@@ -510,6 +533,8 @@ static int multipath_run (mddev_t *mddev)
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->issue_flush_fn = multipath_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = multipath_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index cb8c6317e4e5..dfe32149ad3a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -60,6 +60,21 @@ static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid0_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ raid0_conf_t *conf = mddev_to_conf(mddev);
+ mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+ int i, ret = 0;
+
+ for (i = 0; i < mddev->raid_disks && !ret ; i++) {
+ request_queue_t *q = bdev_get_queue(devlist[i]->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ return ret;
+}
+
static int create_strip_zones (mddev_t *mddev)
{
@@ -236,6 +251,8 @@ static int create_strip_zones (mddev_t *mddev)
mddev->queue->unplug_fn = raid0_unplug;
mddev->queue->issue_flush_fn = raid0_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid0_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
printk("raid0: done.\n");
return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3b4d69c05623..b3c5e12f081d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
*/
update_head_pos(mirror, r1_bio);
- if (uptodate || conf->working_disks <= 1) {
+ if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) {
/*
* Set R1BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid1_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ /* Note the '|| 1' - when read_balance prefers
+ * non-congested targets, it can be removed
+ */
+ if ((bits & (1<<BDI_write_congested)) || 1)
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ else
+ ret &= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -929,7 +955,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
+ conf->raid_disks - mddev->degraded);
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -953,26 +979,27 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* else mark the drive as failed
*/
if (test_bit(In_sync, &rdev->flags)
- && conf->working_disks == 1)
+ && (conf->raid_disks - mddev->degraded) == 1)
/*
* Don't fail the drive, act as though we were just a
* normal single drive
*/
return;
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->working_disks--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
static void print_conf(conf_t *conf)
@@ -984,7 +1011,7 @@ static void print_conf(conf_t *conf)
printk("(!conf)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->working_disks,
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
rcu_read_lock();
@@ -1023,10 +1050,11 @@ static int raid1_spare_active(mddev_t *mddev)
mdk_rdev_t *rdev = conf->mirrors[i].rdev;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
- && !test_bit(In_sync, &rdev->flags)) {
- conf->working_disks++;
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- set_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
@@ -1368,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
* 3. Performs writes following reads for array syncronising.
*/
+static void fix_read_error(conf_t *conf, int read_disk,
+ sector_t sect, int sectors)
+{
+ mddev_t *mddev = conf->mddev;
+ while(sectors) {
+ int s = sectors;
+ int d = read_disk;
+ int success = 0;
+ int start;
+ mdk_rdev_t *rdev;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags) &&
+ sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9,
+ conf->tmppage, READ))
+ success = 1;
+ else {
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ }
+ } while (!success && d != read_disk);
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ md_error(mddev, conf->mirrors[read_disk].rdev);
+ break;
+ }
+ /* write it back and re-read */
+ start = d;
+ while (d != read_disk) {
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, WRITE)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ }
+ }
+ d = start;
+ while (d != read_disk) {
+ char b[BDEVNAME_SIZE];
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev->bdev,
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, READ)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO
+ "raid1:%s: read error corrected "
+ "(%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(sect +
+ rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ }
+ }
+ }
+ sectors -= s;
+ sect += s;
+ }
+}
+
static void raid1d(mddev_t *mddev)
{
r1bio_t *r1_bio;
@@ -1460,86 +1577,14 @@ static void raid1d(mddev_t *mddev)
* This is all done synchronously while the array is
* frozen
*/
- sector_t sect = r1_bio->sector;
- int sectors = r1_bio->sectors;
- freeze_array(conf);
- if (mddev->ro == 0) while(sectors) {
- int s = sectors;
- int d = r1_bio->read_disk;
- int success = 0;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
-
- do {
- /* Note: no rcu protection needed here
- * as this is synchronous in the raid1d thread
- * which is the thread that might remove
- * a device. If raid1d ever becomes multi-threaded....
- */
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags) &&
- sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ))
- success = 1;
- else {
- d++;
- if (d == conf->raid_disks)
- d = 0;
- }
- } while (!success && d != r1_bio->read_disk);
-
- if (success) {
- /* write it back and re-read */
- int start = d;
- while (d != r1_bio->read_disk) {
- if (d==0)
- d = conf->raid_disks;
- d--;
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- }
- }
- d = start;
- while (d != r1_bio->read_disk) {
- if (d==0)
- d = conf->raid_disks;
- d--;
- rdev = conf->mirrors[d].rdev;
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- if (sync_page_io(rdev->bdev,
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else {
- atomic_add(s, &rdev->corrected_errors);
- printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
- mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b));
- }
- }
- }
- } else {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
- break;
- }
- sectors -= s;
- sect += s;
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector,
+ r1_bio->sectors);
+ unfreeze_array(conf);
}
- unfreeze_array(conf);
-
bio = r1_bio->bios[r1_bio->read_disk];
if ((disk=read_balance(conf, r1_bio)) == -1) {
printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1884,15 +1929,11 @@ static int run(mddev_t *mddev)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->head_position = 0;
- if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
- conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
- if (conf->working_disks == 1)
- mddev->recovery_cp = MaxSector;
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -1900,11 +1941,6 @@ static int run(mddev_t *mddev)
bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list);
- if (!conf->working_disks) {
- printk(KERN_ERR "raid1: no operational mirrors for %s\n",
- mdname(mddev));
- goto out_free_conf;
- }
mddev->degraded = 0;
for (i = 0; i < conf->raid_disks; i++) {
@@ -1915,8 +1951,16 @@ static int run(mddev_t *mddev)
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
+ conf->fullsync = 1;
}
}
+ if (mddev->degraded == conf->raid_disks) {
+ printk(KERN_ERR "raid1: no operational mirrors for %s\n",
+ mdname(mddev));
+ goto out_free_conf;
+ }
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
/*
* find the first working one and use it as a starting point
@@ -1948,6 +1992,8 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->issue_flush_fn = raid1_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid1_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
return 0;
@@ -2035,7 +2081,7 @@ static int raid1_reshape(mddev_t *mddev)
mirror_info_t *newmirrors;
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
-
+ unsigned long flags;
int d, d2;
/* Cannot change chunk_size, layout, or level */
@@ -2094,7 +2140,9 @@ static int raid1_reshape(mddev_t *mddev)
kfree(conf->poolinfo);
conf->poolinfo = newpoolinfo;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (raid_disks - conf->raid_disks);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
conf->raid_disks = mddev->raid_disks = raid_disks;
mddev->delta_disks = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 016ddb831c9b..7492d6033ac6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -648,6 +648,26 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid10_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+ conf_t *conf = mddev_to_conf(mddev);
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ request_queue_t *q = bdev_get_queue(rdev->bdev);
+
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
@@ -921,7 +941,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
seq_printf(seq, " %d far-copies", conf->far_copies);
}
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
- conf->working_disks);
+ conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf(seq, "%s",
conf->mirrors[i].rdev &&
@@ -941,7 +961,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* else mark the drive as failed
*/
if (test_bit(In_sync, &rdev->flags)
- && conf->working_disks == 1)
+ && conf->raid_disks-mddev->degraded == 1)
/*
* Don't fail the drive, just return an IO error.
* The test should really be more sophisticated than
@@ -950,20 +970,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* really dead" tests...
*/
return;
- if (test_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->working_disks--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- clear_bit(In_sync, &rdev->flags);
set_bit(Faulty, &rdev->flags);
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
static void print_conf(conf_t *conf)
@@ -976,7 +997,7 @@ static void print_conf(conf_t *conf)
printk("(!conf)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->working_disks,
+ printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
for (i = 0; i < conf->raid_disks; i++) {
@@ -1034,10 +1055,11 @@ static int raid10_spare_active(mddev_t *mddev)
tmp = conf->mirrors + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
- conf->working_disks++;
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
@@ -1350,9 +1372,119 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
*
* 1. Retries failed read operations on working mirrors.
* 2. Updates the raid superblock when problems encounter.
- * 3. Performs writes following reads for array syncronising.
+ * 3. Performs writes following reads for array synchronising.
*/
+static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
+{
+ int sect = 0; /* Offset from r10_bio->sector */
+ int sectors = r10_bio->sectors;
+ mdk_rdev_t*rdev;
+ while(sectors) {
+ int s = sectors;
+ int sl = r10_bio->read_slot;
+ int success = 0;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ rcu_read_lock();
+ do {
+ int d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ success = sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9,
+ conf->tmppage, READ);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ if (success)
+ break;
+ }
+ sl++;
+ if (sl == conf->copies)
+ sl = 0;
+ } while (!success && sl != r10_bio->read_slot);
+ rcu_read_unlock();
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+ md_error(mddev, conf->mirrors[dn].rdev);
+ break;
+ }
+
+ start = sl;
+ /* write it back and re-read */
+ rcu_read_lock();
+ while (sl != r10_bio->read_slot) {
+ int d;
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ atomic_add(s, &rdev->corrected_errors);
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, WRITE)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ }
+ sl = start;
+ while (sl != r10_bio->read_slot) {
+ int d;
+ if (sl==0)
+ sl = conf->copies;
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ char b[BDEVNAME_SIZE];
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+ s<<9, conf->tmppage, READ) == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else
+ printk(KERN_INFO
+ "raid10:%s: read error corrected"
+ " (%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(sect+
+ rdev->data_offset),
+ bdevname(rdev->bdev, b));
+
+ rdev_dec_pending(rdev, mddev);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+
+ sectors -= s;
+ sect += s;
+ }
+}
+
static void raid10d(mddev_t *mddev)
{
r10bio_t *r10_bio;
@@ -1413,105 +1545,12 @@ static void raid10d(mddev_t *mddev)
* This is all done synchronously while the array is
* frozen.
*/
- int sect = 0; /* Offset from r10_bio->sector */
- int sectors = r10_bio->sectors;
- freeze_array(conf);
- if (mddev->ro == 0) while(sectors) {
- int s = sectors;
- int sl = r10_bio->read_slot;
- int success = 0;
-
- if (s > (PAGE_SIZE>>9))
- s = PAGE_SIZE >> 9;
-
- rcu_read_lock();
- do {
- int d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- success = sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9,
- conf->tmppage, READ);
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- if (success)
- break;
- }
- sl++;
- if (sl == conf->copies)
- sl = 0;
- } while (!success && sl != r10_bio->read_slot);
- rcu_read_unlock();
-
- if (success) {
- int start = sl;
- /* write it back and re-read */
- rcu_read_lock();
- while (sl != r10_bio->read_slot) {
- int d;
- if (sl==0)
- sl = conf->copies;
- sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- atomic_add(s, &rdev->corrected_errors);
- if (sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, WRITE) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- sl = start;
- while (sl != r10_bio->read_slot) {
- int d;
- if (sl==0)
- sl = conf->copies;
- sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- if (sync_page_io(rdev->bdev,
- r10_bio->devs[sl].addr +
- sect + rdev->data_offset,
- s<<9, conf->tmppage, READ) == 0)
- /* Well, this device is dead */
- md_error(mddev, rdev);
- else
- printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
- mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b));
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- } else {
- /* Cannot read from anywhere -- bye bye array */
- md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
- break;
- }
- sectors -= s;
- sect += s;
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, mddev, r10_bio);
+ unfreeze_array(conf);
}
- unfreeze_array(conf);
-
bio = r10_bio->devs[r10_bio->read_slot].bio;
r10_bio->devs[r10_bio->read_slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
@@ -2018,8 +2057,6 @@ static int run(mddev_t *mddev)
mddev->queue->max_sectors = (PAGE_SIZE>>9);
disk->head_position = 0;
- if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
- conf->working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
@@ -2042,7 +2079,7 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i;
if (!disk->rdev ||
- !test_bit(In_sync, &rdev->flags)) {
+ !test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
}
@@ -2077,6 +2114,8 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->issue_flush_fn = raid10_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid10_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
/* Calculate max read-ahead size.
* We need to readahead at least twice a whole stripe....
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 450066007160..377f8bc9b78b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
}
}
}
@@ -348,7 +350,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
static int grow_stripes(raid5_conf_t *conf, int num)
{
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int devs = conf->raid_disks;
sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
@@ -397,7 +399,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
LIST_HEAD(newstripes);
struct disk_info *ndisks;
int err = 0;
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
@@ -542,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
}
if (uptodate) {
-#if 0
- struct bio *bio;
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- /* we can return a buffer if we bypassed the cache or
- * if the top buffer is not in highmem. If there are
- * multiple buffers, leave the extra work to
- * handle_stripe
- */
- buffer = sh->bh_read[i];
- if (buffer &&
- (!PageHighMem(buffer->b_page)
- || buffer->b_page == bh->b_page )
- ) {
- sh->bh_read[i] = buffer->b_reqnext;
- buffer->b_reqnext = NULL;
- } else
- buffer = NULL;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (sh->bh_page[i]==bh->b_page)
- set_buffer_uptodate(bh);
- if (buffer) {
- if (buffer->b_page != bh->b_page)
- memcpy(buffer->b_data, bh->b_data, bh->b_size);
- buffer->b_end_io(buffer, 1);
- }
-#else
set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
rdev = conf->disks[i].rdev;
printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
@@ -616,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
}
}
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
- /* must restore b_page before unlocking buffer... */
- if (sh->bh_page[i] != bh->b_page) {
- bh->b_page = sh->bh_page[i];
- bh->b_data = page_address(bh->b_page);
- clear_buffer_uptodate(bh);
- }
-#endif
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -636,7 +602,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
- unsigned long flags;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
if (bi->bi_size)
@@ -654,7 +619,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
return 0;
}
- spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate)
md_error(conf->mddev, conf->disks[i].rdev);
@@ -662,8 +626,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ release_stripe(sh);
return 0;
}
@@ -696,12 +659,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
PRINTK("raid5: error called\n");
if (!test_bit(Faulty, &rdev->flags)) {
- mddev->sb_dirty = 1;
- if (test_bit(In_sync, &rdev->flags)) {
- conf->working_disks--;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->failed_disks++;
- clear_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery was running, make sure it aborts.
*/
@@ -711,7 +674,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
printk (KERN_ALERT
"raid5: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
}
@@ -824,7 +787,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
raid5_conf_t *conf = sh->raid_conf;
- int raid_disks = sh->disks, data_disks = raid_disks - 1;
+ int raid_disks = sh->disks;
+ int data_disks = raid_disks - conf->max_degraded;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
sector_t stripe;
@@ -860,7 +824,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
}
break;
case 6:
- data_disks = raid_disks - 2;
if (i == raid6_next_disk(sh->pd_idx, raid_disks))
return 0; /* It is the Q disk */
switch (conf->algorithm) {
@@ -1108,7 +1071,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (sh->dev[i].written) BUG();
+ BUG_ON(sh->dev[i].written);
sh->dev[i].written = chosen;
}
break;
@@ -1353,12 +1316,13 @@ static int page_is_zero(struct page *p)
static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
int sectors_per_chunk = conf->chunk_size >> 9;
- sector_t x = stripe;
int pd_idx, dd_idx;
- int chunk_offset = sector_div(x, sectors_per_chunk);
- stripe = x;
- raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
- + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+ int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
+ raid5_compute_sector(stripe * (disks - conf->max_degraded)
+ *sectors_per_chunk + chunk_offset,
+ disks, disks - conf->max_degraded,
+ &dd_idx, &pd_idx, conf);
return pd_idx;
}
@@ -1619,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh)
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
@@ -1645,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i]!=bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)
@@ -1659,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh)
/* Would I have to read this buffer for reconstruct_write */
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -1869,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
@@ -2197,15 +2148,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
@@ -2224,9 +2166,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
if (!test_bit(R5_OVERWRITE, &dev->flags)
&& i != pd_idx && i != qd_idx
&& (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
- || sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -2422,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
@@ -2597,6 +2538,198 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
return ret;
}
+static int raid5_congested(void *data, int bits)
+{
+ mddev_t *mddev = data; /* run() registers the mddev as congested_data */
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ /* Congestion callback. 'bits' is ignored: there is no difference
+ * between reads and writes. Just check how busy the stripe_cache is.
+ */
+ if (conf->inactive_blocked)
+ return 1;
+ if (conf->quiesce) /* set while raid5_quiesce() is draining the array */
+ return 1;
+ if (list_empty_careful(&conf->inactive_list)) /* presumably: no idle stripe_heads left */
+ return 1;
+
+ return 0; /* not congested */
+}
+
+/* Returns how many bytes of 'biovec' may be added to 'bio'. We want read
+ * requests to align with chunks where possible, but write requests don't need to.
+ */
+static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+{
+ mddev_t *mddev = q->queuedata;
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ int max;
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ if (bio_data_dir(bio))
+ return biovec->bv_len; /* always allow writes to be mergeable */
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; /* bytes left in this chunk; mask presumes a power-of-2 chunk — TODO confirm */
+ if (max < 0) max = 0; /* bio already reaches past the end of the chunk */
+ if (max <= biovec->bv_len && bio_sectors == 0) /* an empty bio accepts the whole bvec even if it crosses the chunk end */
+ return biovec->bv_len;
+ else
+ return max;
+}
+
+/* Non-zero when 'bio' ends at or before the end of the chunk it starts in. */
+static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ return chunk_sectors >=
+ ((sector & (chunk_sectors - 1)) + bio_sectors); /* offset in chunk + length; mask presumes a power-of-2 chunk — TODO confirm */
+}
+
+/*
+ * Push 'bi' on the head of conf->retry_read_aligned_list, the retry LIFO
+ * ( in O(1) ... we are in interrupt ); the list is later sampled by raid5d.
+ */
+static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+
+ bi->bi_next = conf->retry_read_aligned_list; /* bi_next doubles as the list link */
+ conf->retry_read_aligned_list = bi;
+
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(conf->mddev->thread); /* wake the md thread so the retry gets processed */
+}
+
+/* Next aligned read to retry: a partly-processed bio first, else the head of the retry list (NULL if none). */
+static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+{
+	struct bio *bi;
+
+	bi = conf->retry_read_aligned;	/* parked part-way by retry_aligned_read() */
+	if (bi) {
+		conf->retry_read_aligned = NULL;
+		return bi;	/* its counters are already set up; resume as-is */
+	}
+	bi = conf->retry_read_aligned_list;
+	if (bi) {
+		conf->retry_read_aligned_list = bi->bi_next;	/* pop the head of the list */
+		bi->bi_next = NULL;
+		bi->bi_phys_segments = 1; /* biased count of active stripes */
+		bi->bi_hw_segments = 0; /* count of processed stripes */
+	}
+
+	return bi;
+}
+
+
+/*
+ * Completion handler for the cloned bio issued by chunk_aligned_read().
+ * It drops the clone (bio_put) and the rdev reference; if the read
+ * succeeded it calls bio_endio on the original bio (bi_private).
+ * If the read failed, the original bio is handed to add_bio_to_retry().
+ */
+static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+{
+ struct bio* raid_bi = bi->bi_private;
+ mddev_t *mddev;
+ raid5_conf_t *conf;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ mdk_rdev_t *rdev;
+
+ if (bi->bi_size)
+ return 1; /* bi_size still non-zero: presumably a partial completion, wait for the rest */
+ bio_put(bi);
+
+ mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
+ conf = mddev_to_conf(mddev);
+ rdev = (void*)raid_bi->bi_next; /* rdev was stashed in bi_next by chunk_aligned_read() */
+ raid_bi->bi_next = NULL;
+
+ rdev_dec_pending(rdev, conf->mddev);
+
+ if (!error && uptodate) {
+ bio_endio(raid_bi, bytes, 0);
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe); /* raid5_quiesce() waits here for the count to reach 0 */
+ return 0;
+ }
+
+ /* failed: active_aligned_reads stays held; retry_aligned_read() drops it */
+ PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
+
+ add_bio_to_retry(raid_bi, conf);
+ return 0;
+}
+/* Send a read lying within one chunk straight to its disk: 1 if issued, else 0. */
+static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+{
+	mddev_t *mddev = q->queuedata;
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	const unsigned int raid_disks = conf->raid_disks;
+	const unsigned int data_disks = raid_disks - conf->max_degraded;
+	unsigned int dd_idx, pd_idx;
+	struct bio *align_bi;
+	mdk_rdev_t *rdev;
+
+	if (!in_chunk_boundary(mddev, raid_bio)) {
+		PRINTK("chunk_aligned_read : non aligned\n");	/* normal case: debug output only */
+		return 0;
+	}
+	/*
+	 * use bio_clone to make a copy of the bio
+	 */
+	align_bi = bio_clone(raid_bio, GFP_NOIO);
+	if (!align_bi)
+		return 0;
+	/*
+	 * set bi_end_io to a new function, and set bi_private to the
+	 * original bio.
+	 */
+	align_bi->bi_end_io = raid5_align_endio;
+	align_bi->bi_private = raid_bio;
+	/*
+	 * compute position
+	 */
+	align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
+						   raid_disks,
+						   data_disks,
+						   &dd_idx,
+						   &pd_idx,
+						   conf);
+
+	rcu_read_lock();
+	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+	if (rdev && test_bit(In_sync, &rdev->flags)) {
+		atomic_inc(&rdev->nr_pending);	/* dropped again in raid5_align_endio() */
+		rcu_read_unlock();
+		raid_bio->bi_next = (void*)rdev;	/* stash rdev for raid5_align_endio() */
+		align_bi->bi_bdev = rdev->bdev;
+		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+		align_bi->bi_sector += rdev->data_offset;
+
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    conf->quiesce == 0,
+				    conf->device_lock, /* nothing */);
+		atomic_inc(&conf->active_aligned_reads);	/* raid5_quiesce() waits for this to drain */
+		spin_unlock_irq(&conf->device_lock);
+
+		generic_make_request(align_bi);
+		return 1;
+	} else {
+		rcu_read_unlock();
+		bio_put(align_bi);	/* no in-sync disk: caller falls back to the stripe cache */
+		return 0;
+	}
+}
+
+
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -2618,6 +2751,11 @@ static int make_request(request_queue_t *q, struct bio * bi)
disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
+ if (bio_data_dir(bi) == READ &&
+ mddev->reshape_position == MaxSector &&
+ chunk_aligned_read(q,bi))
+ return 0;
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
@@ -2725,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
return 0;
}
@@ -2781,9 +2921,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->expand_progress;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
- wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop());
spin_lock_irq(&conf->device_lock);
conf->expand_lo = mddev->reshape_position;
@@ -2936,6 +3076,74 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return STRIPE_SECTORS;
}
+static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+{
+	/* Re-issue a failed chunk-aligned read through the stripe cache.
+	 * We may not be able to submit a whole bio at once as there
+	 * may not be enough stripe_heads available (we may need more
+	 * than exist in the cache if we allow ever larger chunks), so we
+	 * do one stripe head at a time and record in ->bi_hw_segments how
+	 * many have been done.  Returns the number of stripes handled.
+	 * We *know* that this entire raid_bio is in one chunk, so there is
+	 * only one 'dd_idx' and one call to raid5_compute_sector; 'sector'
+	 * then just advances in step with logical_sector.
+	 */
+	struct stripe_head *sh;
+	int dd_idx, pd_idx;
+	sector_t sector, logical_sector, last_sector;
+	int scnt = 0;
+	int remaining;
+	int handled = 0;
+
+	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	sector = raid5_compute_sector(	logical_sector,
+					conf->raid_disks,
+					conf->raid_disks - conf->max_degraded,
+					&dd_idx,
+					&pd_idx,
+					conf);
+	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+	for (; logical_sector < last_sector;
+	     logical_sector += STRIPE_SECTORS,
+	     sector += STRIPE_SECTORS, scnt++) {	/* sector must follow, also on 'continue' */
+		if (scnt < raid_bio->bi_hw_segments)
+			/* already done this stripe */
+			continue;
+
+		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+
+		if (!sh) {
+			/* failed to get a stripe - must wait */
+			raid_bio->bi_hw_segments = scnt;
+			conf->retry_read_aligned = raid_bio;	/* picked up first by remove_bio_from_retry() */
+			return handled;
+		}
+
+		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
+		add_stripe_bio(sh, raid_bio, dd_idx, 0);
+		handle_stripe(sh, NULL);
+		release_stripe(sh);
+		handled++;
+	}
+	spin_lock_irq(&conf->device_lock);
+	remaining = --raid_bio->bi_phys_segments;	/* drop the bias set by remove_bio_from_retry() */
+	spin_unlock_irq(&conf->device_lock);
+	if (remaining == 0) {
+		int bytes = raid_bio->bi_size;
+
+		raid_bio->bi_size = 0;
+		raid_bio->bi_end_io(raid_bio, bytes,
+				    test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
+				    ? 0 : -EIO);
+	}
+	if (atomic_dec_and_test(&conf->active_aligned_reads))
+		wake_up(&conf->wait_for_stripe);
+	return handled;
+}
+
+
+
/*
* This is our raid5 kernel thread.
*
@@ -2957,6 +3165,7 @@ static void raid5d (mddev_t *mddev)
spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
+ struct bio *bio;
if (conf->seq_flush != conf->seq_write) {
int seq = conf->seq_flush;
@@ -2973,6 +3182,16 @@ static void raid5d (mddev_t *mddev)
!list_empty(&conf->delayed_list))
raid5_activate_delayed(conf);
+ while ((bio = remove_bio_from_retry(conf))) {
+ int ok;
+ spin_unlock_irq(&conf->device_lock);
+ ok = retry_aligned_read(conf, bio);
+ spin_lock_irq(&conf->device_lock);
+ if (!ok)
+ break;
+ handled++;
+ }
+
if (list_empty(&conf->handle_list))
break;
@@ -3074,6 +3293,7 @@ static int run(mddev_t *mddev)
mdk_rdev_t *rdev;
struct disk_info *disk;
struct list_head *tmp;
+ int working_disks = 0;
if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
@@ -3159,6 +3379,7 @@ static int run(mddev_t *mddev)
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
+ atomic_set(&conf->active_aligned_reads, 0);
PRINTK("raid5: run(%s) called.\n", mdname(mddev));
@@ -3176,14 +3397,14 @@ static int run(mddev_t *mddev)
printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
- conf->working_disks++;
+ working_disks++;
}
}
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
- mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+ mddev->degraded = conf->raid_disks - working_disks;
conf->mddev = mddev;
conf->chunk_size = mddev->chunk_size;
conf->level = mddev->level;
@@ -3218,7 +3439,7 @@ static int run(mddev_t *mddev)
if (mddev->degraded > conf->max_degraded) {
printk(KERN_ERR "raid5: not enough operational devices for %s"
" (%d/%d failed)\n",
- mdname(mddev), conf->failed_disks, conf->raid_disks);
+ mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
@@ -3299,9 +3520,14 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
+ blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
return 0;
abort:
if (conf) {
@@ -3375,7 +3601,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
- seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->disks[i].rdev &&
@@ -3397,8 +3623,8 @@ static void print_raid5_conf (raid5_conf_t *conf)
printk("(conf==NULL)\n");
return;
}
- printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
- conf->working_disks, conf->failed_disks);
+ printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+ conf->raid_disks - conf->mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
@@ -3420,11 +3646,11 @@ static int raid5_spare_active(mddev_t *mddev)
tmp = conf->disks + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- conf->failed_disks--;
- conf->working_disks++;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
print_raid5_conf(conf);
@@ -3560,6 +3786,7 @@ static int raid5_start_reshape(mddev_t *mddev)
struct list_head *rtmp;
int spares = 0;
int added_devices = 0;
+ unsigned long flags;
if (mddev->degraded ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3593,7 +3820,6 @@ static int raid5_start_reshape(mddev_t *mddev)
if (raid5_add_disk(mddev, rdev)) {
char nm[20];
set_bit(In_sync, &rdev->flags);
- conf->working_disks++;
added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3602,10 +3828,12 @@ static int raid5_start_reshape(mddev_t *mddev)
break;
}
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = 0;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -3639,7 +3867,7 @@ static void end_reshape(raid5_conf_t *conf)
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+ i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
@@ -3674,7 +3902,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
spin_lock_irq(&conf->device_lock);
conf->quiesce = 1;
wait_event_lock_irq(conf->wait_for_stripe,
- atomic_read(&conf->active_stripes) == 0,
+ atomic_read(&conf->active_stripes) == 0 &&
+ atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
spin_unlock_irq(&conf->device_lock);
break;