Diffstat (limited to 'drivers/md'):

 drivers/md/Kconfig            |   1
 drivers/md/bitmap.c           |  18
 drivers/md/dm-bio-list.h      |  14
 drivers/md/dm-crypt.c         |  94
 drivers/md/dm-emc.c           |  10
 drivers/md/dm-hw-handler.h    |   2
 drivers/md/dm-io.c            |  15
 drivers/md/dm-ioctl.c         |  25
 drivers/md/dm-linear.c        |   4
 drivers/md/dm-log.c           |  24
 drivers/md/dm-log.h           |  10
 drivers/md/dm-mpath.c         |  72
 drivers/md/dm-mpath.h         |   4
 drivers/md/dm-path-selector.h |  12
 drivers/md/dm-raid1.c         |  47
 drivers/md/dm-round-robin.c   |  14
 drivers/md/dm-snap.c          |  48
 drivers/md/dm-stripe.c        |   2
 drivers/md/dm-zero.c          |   2
 drivers/md/dm.c               | 138
 drivers/md/dm.h               |  19
 drivers/md/kcopyd.c           |   6
 drivers/md/md.c               |  56
 drivers/md/multipath.c        |   4
 drivers/md/raid1.c            |   5
 drivers/md/raid10.c           |   6
 drivers/md/raid5.c            | 357

 27 files changed, 742 insertions(+), 267 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index c92c1521546d..4540ade6b6b5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -215,6 +215,7 @@ config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM && EXPERIMENTAL
select CRYPTO
+ select CRYPTO_CBC
---help---
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d47d38ac71b1..5432d07c074d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,8 +212,8 @@ char *file_path(struct file *file, char *buf, int count)
if (!buf)
return NULL;
- d = file->f_dentry;
- v = file->f_vfsmnt;
+ d = file->f_path.dentry;
+ v = file->f_path.mnt;
buf = d_path(d, v, buf, count);
@@ -349,7 +349,7 @@ static struct page *read_page(struct file *file, unsigned long index,
unsigned long count)
{
struct page *page = NULL;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct buffer_head *bh;
sector_t block;
@@ -536,7 +536,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
"-- forcing full recovery\n", bmname(bitmap), events,
(unsigned long long) bitmap->mddev->events);
- sb->state |= BITMAP_STALE;
+ sb->state |= cpu_to_le32(BITMAP_STALE);
}
success:
/* assign fields using values from superblock */
@@ -544,11 +544,11 @@ success:
bitmap->daemon_sleep = daemon_sleep;
bitmap->daemon_lastrun = jiffies;
bitmap->max_write_behind = write_behind;
- bitmap->flags |= sb->state;
+ bitmap->flags |= le32_to_cpu(sb->state);
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
bitmap->flags |= BITMAP_HOSTENDIAN;
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
- if (sb->state & BITMAP_STALE)
+ if (sb->state & cpu_to_le32(BITMAP_STALE))
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
@@ -578,9 +578,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
switch (op) {
- case MASK_SET: sb->state |= bits;
+ case MASK_SET: sb->state |= cpu_to_le32(bits);
break;
- case MASK_UNSET: sb->state &= ~bits;
+ case MASK_UNSET: sb->state &= cpu_to_le32(~bits);
break;
default: BUG();
}
@@ -662,7 +662,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
bitmap_file_unmap(bitmap);
if (file) {
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
invalidate_inode_pages(inode->i_mapping);
fput(file);
}
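
All of the bitmap.c hunks above apply one rule: sb->state lives in the on-disk superblock and is little-endian, so flag constants are converted with cpu_to_le32() before being tested against or or'd into it, and the field itself only goes through le32_to_cpu() when copied into the CPU-endian bitmap->flags. A minimal sketch, using only names from the hunks above:

    if (sb->state & cpu_to_le32(BITMAP_STALE))      /* test: convert the constant */
            bitmap->events_cleared = bitmap->mddev->events;

    sb->state |= cpu_to_le32(BITMAP_STALE);         /* set: stay little-endian */
    bitmap->flags |= le32_to_cpu(sb->state);        /* read out: convert the field */
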
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index bbf4615f0e30..da4349649f7f 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -44,6 +44,20 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
bl->tail = bl2->tail;
}
+static inline void bio_list_merge_head(struct bio_list *bl,
+ struct bio_list *bl2)
+{
+ if (!bl2->head)
+ return;
+
+ if (bl->head)
+ bl2->tail->bi_next = bl->head;
+ else
+ bl->tail = bl2->tail;
+
+ bl->head = bl2->head;
+}
+
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
struct bio *bio = bl->head;
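
bio_list_merge_head() is the head-of-list counterpart to bio_list_merge(): the whole of bl2 is spliced in front of bl. Its intended caller is the noflush-suspend code added to dm.c later in this patch, which requeues pushed-back bios ahead of the ordinary deferred ones; in sketch form:

    spin_lock_irqsave(&md->pushback_lock, flags);
    bio_list_merge_head(&md->deferred, &md->pushback);  /* pushback bios go first */
    bio_list_init(&md->pushback);                       /* pushback list is now empty */
    spin_unlock_irqrestore(&md->pushback_lock, flags);
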
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 655d816760e5..4c2471ee054a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -16,9 +16,11 @@
#include <linux/slab.h>
#include <linux/crypto.h>
#include <linux/workqueue.h>
+#include <linux/backing-dev.h>
#include <asm/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
+#include <asm/unaligned.h>
#include "dm.h"
@@ -84,7 +86,10 @@ struct crypt_config {
*/
struct crypt_iv_operations *iv_gen_ops;
char *iv_mode;
- struct crypto_cipher *iv_gen_private;
+ union {
+ struct crypto_cipher *essiv_tfm;
+ int benbi_shift;
+ } iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
@@ -100,7 +105,7 @@ struct crypt_config {
#define MIN_POOL_PAGES 32
#define MIN_BIO_PAGES 8
-static kmem_cache_t *_crypt_io_pool;
+static struct kmem_cache *_crypt_io_pool;
/*
* Different IV generation algorithms:
@@ -112,6 +117,9 @@ static kmem_cache_t *_crypt_io_pool;
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
*
+ * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
+ * (needed for LRW-32-AES and possible other narrow block modes)
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -190,21 +198,61 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
}
kfree(salt);
- cc->iv_gen_private = essiv_tfm;
+ cc->iv_gen_private.essiv_tfm = essiv_tfm;
return 0;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
- crypto_free_cipher(cc->iv_gen_private);
- cc->iv_gen_private = NULL;
+ crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
+ cc->iv_gen_private.essiv_tfm = NULL;
}
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
- crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv);
+ crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
+ return 0;
+}
+
+static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ unsigned int bs = crypto_blkcipher_blocksize(cc->tfm);
+ int log = ilog2(bs);
+
+ /* we need to calculate how far we must shift the sector count
+ * to get the cipher block count, we use this shift in _gen */
+
+ if (1 << log != bs) {
+ ti->error = "cypher blocksize is not a power of 2";
+ return -EINVAL;
+ }
+
+ if (log > 9) {
+ ti->error = "cypher blocksize is > 512";
+ return -EINVAL;
+ }
+
+ cc->iv_gen_private.benbi_shift = 9 - log;
+
+ return 0;
+}
+
+static void crypt_iv_benbi_dtr(struct crypt_config *cc)
+{
+}
+
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+ __be64 val;
+
+ memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
+
+ val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
+ put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
+
return 0;
}
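
The shift computed by crypt_iv_benbi_ctr() converts a 512-byte sector number into a count of cipher blocks. With AES, for example, the block size is 16 bytes, so log = 4 and benbi_shift = 5: sector N begins at narrow block (N << 5) + 1. A standalone sketch of the arithmetic, assuming only a power-of-two block size of at most 512 bytes:

    /* first narrow-block number of a sector, as stored in the benbi IV */
    static u64 benbi_first_block(sector_t sector, unsigned int block_size)
    {
            return ((u64)sector << (9 - ilog2(block_size))) + 1;
    }
    /* benbi_first_block(3, 16) == (3 << 5) + 1 == 97 */
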
@@ -218,13 +266,18 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = {
.generator = crypt_iv_essiv_gen
};
+static struct crypt_iv_operations crypt_iv_benbi_ops = {
+ .ctr = crypt_iv_benbi_ctr,
+ .dtr = crypt_iv_benbi_dtr,
+ .generator = crypt_iv_benbi_gen
+};
static int
crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
struct scatterlist *in, unsigned int length,
int write, sector_t sector)
{
- u8 iv[cc->iv_size];
+ u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
struct blkcipher_desc desc = {
.tfm = cc->tfm,
.info = iv,
@@ -457,11 +510,11 @@ static void dec_pending(struct crypt_io *io, int error)
* interrupt context.
*/
static struct workqueue_struct *_kcryptd_workqueue;
-static void kcryptd_do_work(void *data);
+static void kcryptd_do_work(struct work_struct *work);
static void kcryptd_queue_io(struct crypt_io *io)
{
- INIT_WORK(&io->work, kcryptd_do_work, io);
+ INIT_WORK(&io->work, kcryptd_do_work);
queue_work(_kcryptd_workqueue, &io->work);
}
@@ -602,7 +655,7 @@ static void process_write(struct crypt_io *io)
/* out of memory -> run queues */
if (remaining)
- blk_congestion_wait(bio_data_dir(clone), HZ/100);
+ congestion_wait(bio_data_dir(clone), HZ/100);
}
}
@@ -617,9 +670,9 @@ static void process_read_endio(struct crypt_io *io)
dec_pending(io, crypt_convert(cc, &ctx));
}
-static void kcryptd_do_work(void *data)
+static void kcryptd_do_work(struct work_struct *work)
{
- struct crypt_io *io = data;
+ struct crypt_io *io = container_of(work, struct crypt_io, work);
if (io->post_process)
process_read_endio(io);
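
This is the new workqueue convention applied throughout the patch (dm-mpath, dm-raid1, dm-snap and kcopyd get the same treatment below): INIT_WORK() no longer takes a data pointer, so a handler recovers its context with container_of() from the embedded work_struct. In outline:

    struct crypt_io {
            /* ... */
            struct work_struct work;        /* embedded in the owning object */
    };

    INIT_WORK(&io->work, kcryptd_do_work);  /* no third argument any more */

    static void kcryptd_do_work(struct work_struct *work)
    {
            struct crypt_io *io = container_of(work, struct crypt_io, work);
            /* ... */
    }
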
@@ -767,7 +820,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->tfm = tfm;
/*
- * Choose ivmode. Valid modes: "plain", "essiv:<esshash>".
+ * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
* See comments at iv code
*/
@@ -777,6 +830,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = &crypt_iv_plain_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
+ else if (strcmp(ivmode, "benbi") == 0)
+ cc->iv_gen_ops = &crypt_iv_benbi_ops;
else {
ti->error = "Invalid IV mode";
goto bad2;
@@ -907,15 +962,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
atomic_set(&io->pending, 0);
kcryptd_queue_io(io);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
static int crypt_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
struct crypt_config *cc = (struct crypt_config *) ti->private;
- const char *cipher;
- const char *chainmode = NULL;
unsigned int sz = 0;
switch (type) {
@@ -924,14 +977,11 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- cipher = crypto_blkcipher_name(cc->tfm);
-
- chainmode = cc->chainmode;
-
if (cc->iv_mode)
- DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
+ DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode,
+ cc->iv_mode);
else
- DMEMIT("%s-%s ", cipher, chainmode);
+ DMEMIT("%s-%s ", cc->cipher, cc->chainmode);
if (cc->key_size > 0) {
if ((maxlen - sz) < ((cc->key_size << 1) + 1))
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index 2b2d45d7baaa..265c467854da 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -40,7 +40,7 @@ static inline void free_bio(struct bio *bio)
static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
{
- struct path *path = bio->bi_private;
+ struct dm_path *path = bio->bi_private;
if (bio->bi_size)
return 1;
@@ -61,7 +61,7 @@ static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
return 0;
}
-static struct bio *get_failover_bio(struct path *path, unsigned data_size)
+static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size)
{
struct bio *bio;
struct page *page;
@@ -96,7 +96,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size)
}
static struct request *get_failover_req(struct emc_handler *h,
- struct bio *bio, struct path *path)
+ struct bio *bio, struct dm_path *path)
{
struct request *rq;
struct block_device *bdev = bio->bi_bdev;
@@ -133,7 +133,7 @@ static struct request *get_failover_req(struct emc_handler *h,
}
static struct request *emc_trespass_get(struct emc_handler *h,
- struct path *path)
+ struct dm_path *path)
{
struct bio *bio;
struct request *rq;
@@ -191,7 +191,7 @@ static struct request *emc_trespass_get(struct emc_handler *h,
}
static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
- struct path *path)
+ struct dm_path *path)
{
struct request *rq;
struct request_queue *q = bdev_get_queue(path->dev->bdev);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
index 15f5629e231a..32eff28e4adc 100644
--- a/drivers/md/dm-hw-handler.h
+++ b/drivers/md/dm-hw-handler.h
@@ -32,7 +32,7 @@ struct hw_handler_type {
void (*destroy) (struct hw_handler *hwh);
void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
- struct path *path);
+ struct dm_path *path);
unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
int (*status) (struct hw_handler *hwh, status_type_t type,
char *result, unsigned int maxlen);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index da663d2ff552..4eb73d395213 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -92,12 +92,12 @@ void dm_io_put(unsigned int num_pages)
*---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
- bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region;
+ bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}
static inline unsigned bio_get_region(struct bio *bio)
{
- return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len;
+ return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
/*-----------------------------------------------------------------
@@ -136,6 +136,7 @@ static int endio(struct bio *bio, unsigned int done, int error)
zero_fill_bio(bio);
dec_count(io, bio_get_region(bio), error);
+ bio->bi_max_vecs++;
bio_put(bio);
return 0;
@@ -250,16 +251,18 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
while (remaining) {
/*
- * Allocate a suitably sized bio, we add an extra
- * bvec for bio_get/set_region().
+ * Allocate a suitably sized bio: we add an extra
+ * bvec for bio_get/set_region() and decrement bi_max_vecs
+ * to hide it from bio_add_page().
*/
- num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2;
+ num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
bio->bi_private = io;
bio->bi_destructor = dm_bio_destructor;
+ bio->bi_max_vecs--;
bio_set_region(bio, region);
/*
@@ -302,7 +305,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
}
/*
- * Drop the extra refence that we were holding to avoid
+ * Drop the extra reference that we were holding to avoid
* the io being completed too early.
*/
dec_count(io, 0, 0);
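
Taken together, the dm-io.c hunks stash the region number in a spare bvec that bio_add_page() can never see: the bio is allocated with one extra bvec, bi_max_vecs is decremented to hide it, and endio() restores the count before bio_put() so the bio returns to the slab it came from. The lifecycle, condensed from the hunks above:

    bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
    bio->bi_max_vecs--;                     /* hide the spare bvec */
    bio_set_region(bio, region);            /* region id kept in its bv_len */
    /* ... bio_add_page() fills only the visible bvecs ... */

    /* later, in endio(): */
    dec_count(io, bio_get_region(bio), error);
    bio->bi_max_vecs++;                     /* restore before freeing */
    bio_put(bio);
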
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d13bb15a8a02..cd6a184536a1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -606,9 +606,14 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
return __get_name_cell(param->name);
md = dm_get_md(huge_decode_dev(param->dev));
- if (md)
- mdptr = dm_get_mdptr(md);
+ if (!md)
+ goto out;
+ mdptr = dm_get_mdptr(md);
+ if (!mdptr)
+ dm_put(md);
+
+out:
return mdptr;
}
@@ -760,7 +765,7 @@ out:
static int do_suspend(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct mapped_device *md;
md = find_device(param);
@@ -768,10 +773,12 @@ static int do_suspend(struct dm_ioctl *param)
return -ENXIO;
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- r = dm_suspend(md, do_lockfs);
+ r = dm_suspend(md, suspend_flags);
if (!r)
r = __dev_status(md, param);
@@ -783,7 +790,7 @@ static int do_suspend(struct dm_ioctl *param)
static int do_resume(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct hash_cell *hc;
struct mapped_device *md;
struct dm_table *new_map;
@@ -809,9 +816,11 @@ static int do_resume(struct dm_ioctl *param)
if (new_map) {
/* Suspend if it isn't already suspended */
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- dm_suspend(md, do_lockfs);
+ dm_suspend(md, suspend_flags);
r = dm_swap_table(md, new_map);
if (r) {
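
do_suspend() and do_resume() now translate the ioctl flags into suspend flags in exactly the same way. A hypothetical helper (not in the patch) makes the shared mapping explicit:

    static unsigned to_suspend_flags(uint32_t ioctl_flags)
    {
            unsigned flags = DM_SUSPEND_LOCKFS_FLAG;        /* lockfs is the default */

            if (ioctl_flags & DM_SKIP_LOCKFS_FLAG)
                    flags &= ~DM_SUSPEND_LOCKFS_FLAG;
            if (ioctl_flags & DM_NOFLUSH_FLAG)
                    flags |= DM_SUSPEND_NOFLUSH_FLAG;
            return flags;
    }
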
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 00234909b3db..17753d80ad22 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -77,7 +77,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio,
bio->bi_bdev = lc->dev->bdev;
bio->bi_sector = lc->start + (bio->bi_sector - ti->begin);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int linear_status(struct dm_target *ti, status_type_t type,
@@ -108,7 +108,7 @@ static int linear_ioctl(struct dm_target *ti, struct inode *inode,
struct dentry fake_dentry = {};
fake_file.f_mode = lc->dev->mode;
- fake_file.f_dentry = &fake_dentry;
+ fake_file.f_path.dentry = &fake_dentry;
fake_dentry.d_inode = bdev->bd_inode;
return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg);
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 64b764bd02cc..6a9261351848 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -466,6 +466,7 @@ static int disk_resume(struct dirty_log *log)
/* copy clean across to sync */
memcpy(lc->sync_bits, lc->clean_bits, size);
lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
+ lc->sync_search = 0;
/* set the correct number of regions in the header */
lc->header.nr_regions = lc->region_count;
@@ -480,6 +481,13 @@ static uint32_t core_get_region_size(struct dirty_log *log)
return lc->region_size;
}
+static int core_resume(struct dirty_log *log)
+{
+ struct log_c *lc = (struct log_c *) log->context;
+ lc->sync_search = 0;
+ return 0;
+}
+
static int core_is_clean(struct dirty_log *log, region_t region)
{
struct log_c *lc = (struct log_c *) log->context;
@@ -549,16 +557,19 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
return 1;
}
-static void core_complete_resync_work(struct dirty_log *log, region_t region,
- int success)
+static void core_set_region_sync(struct dirty_log *log, region_t region,
+ int in_sync)
{
struct log_c *lc = (struct log_c *) log->context;
log_clear_bit(lc, lc->recovering_bits, region);
- if (success) {
+ if (in_sync) {
log_set_bit(lc, lc->sync_bits, region);
lc->sync_count++;
- }
+ } else if (log_test_bit(lc->sync_bits, region)) {
+ lc->sync_count--;
+ log_clear_bit(lc, lc->sync_bits, region);
+ }
}
static region_t core_get_sync_count(struct dirty_log *log)
@@ -618,6 +629,7 @@ static struct dirty_log_type _core_type = {
.module = THIS_MODULE,
.ctr = core_ctr,
.dtr = core_dtr,
+ .resume = core_resume,
.get_region_size = core_get_region_size,
.is_clean = core_is_clean,
.in_sync = core_in_sync,
@@ -625,7 +637,7 @@ static struct dirty_log_type _core_type = {
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
+ .set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = core_status,
};
@@ -644,7 +656,7 @@ static struct dirty_log_type _disk_type = {
.mark_region = core_mark_region,
.clear_region = core_clear_region,
.get_resync_work = core_get_resync_work,
- .complete_resync_work = core_complete_resync_work,
+ .set_region_sync = core_set_region_sync,
.get_sync_count = core_get_sync_count,
.status = disk_status,
};
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h
index 5ae5309ebf28..86a301c8daf1 100644
--- a/drivers/md/dm-log.h
+++ b/drivers/md/dm-log.h
@@ -90,12 +90,12 @@ struct dirty_log_type {
int (*get_resync_work)(struct dirty_log *log, region_t *region);
/*
- * This notifies the log that the resync of an area has
- * been completed. The log should then mark this region
- * as CLEAN.
+ * This notifies the log that the resync status of a region
+ * has changed. It also clears the region from the recovering
+ * list (if present).
*/
- void (*complete_resync_work)(struct dirty_log *log,
- region_t region, int success);
+ void (*set_region_sync)(struct dirty_log *log,
+ region_t region, int in_sync);
/*
* Returns the number of regions that are in sync.
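
Unlike complete_resync_work(), which could only mark a region clean, the renamed hook works in both directions, so a mirror can also take a region back out of sync when recovery I/O to it fails. Both calls, sketched against the interface above:

    log->type->set_region_sync(log, region, 1);     /* region resynced */
    log->type->set_region_sync(log, region, 0);     /* region fell out of sync */
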
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d754e0bc6e90..3aa013506967 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -31,7 +31,7 @@ struct pgpath {
struct priority_group *pg; /* Owning PG */
unsigned fail_count; /* Cumulative failure count */
- struct path path;
+ struct dm_path path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -101,11 +101,11 @@ typedef int (*action_fn) (struct pgpath *pgpath);
#define MIN_IOS 256 /* Mempool size */
-static kmem_cache_t *_mpio_cache;
+static struct kmem_cache *_mpio_cache;
struct workqueue_struct *kmultipathd;
-static void process_queued_ios(void *data);
-static void trigger_event(void *data);
+static void process_queued_ios(struct work_struct *work);
+static void trigger_event(struct work_struct *work);
/*-----------------------------------------------
@@ -173,8 +173,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
INIT_LIST_HEAD(&m->priority_groups);
spin_lock_init(&m->lock);
m->queue_io = 1;
- INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
- INIT_WORK(&m->trigger_event, trigger_event, m);
+ INIT_WORK(&m->process_queued_ios, process_queued_ios);
+ INIT_WORK(&m->trigger_event, trigger_event);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
@@ -229,7 +229,7 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
{
- struct path *path;
+ struct dm_path *path;
path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
if (!path)
@@ -282,10 +282,27 @@ failed:
m->current_pg = NULL;
}
+/*
+ * Check whether bios must be queued in the device-mapper core rather
+ * than here in the target.
+ *
+ * m->lock must be held on entry.
+ *
+ * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
+ * same value then we are not between multipath_presuspend()
+ * and multipath_resume() calls and we have no need to check
+ * for the DMF_NOFLUSH_SUSPENDING flag.
+ */
+static int __must_push_back(struct multipath *m)
+{
+ return (m->queue_if_no_path != m->saved_queue_if_no_path &&
+ dm_noflush_suspending(m->ti));
+}
+
static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
unsigned was_queued)
{
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
unsigned long flags;
struct pgpath *pgpath;
@@ -310,11 +327,13 @@ static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
- r = 0;
- } else if (!pgpath)
- r = -EIO; /* Failed */
- else
+ r = DM_MAPIO_SUBMITTED;
+ } else if (pgpath)
bio->bi_bdev = pgpath->path.dev->bdev;
+ else if (__must_push_back(m))
+ r = DM_MAPIO_REQUEUE;
+ else
+ r = -EIO; /* Failed */
mpio->pgpath = pgpath;
@@ -372,16 +391,19 @@ static void dispatch_queued_ios(struct multipath *m)
r = map_io(m, bio, mpio, 1);
if (r < 0)
bio_endio(bio, bio->bi_size, r);
- else if (r == 1)
+ else if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
+ else if (r == DM_MAPIO_REQUEUE)
+ bio_endio(bio, bio->bi_size, -EIO);
bio = next;
}
}
-static void process_queued_ios(void *data)
+static void process_queued_ios(struct work_struct *work)
{
- struct multipath *m = (struct multipath *) data;
+ struct multipath *m =
+ container_of(work, struct multipath, process_queued_ios);
struct hw_handler *hwh = &m->hw_handler;
struct pgpath *pgpath = NULL;
unsigned init_required = 0, must_queue = 1;
@@ -421,9 +443,10 @@ out:
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
-static void trigger_event(void *data)
+static void trigger_event(struct work_struct *work)
{
- struct multipath *m = (struct multipath *) data;
+ struct multipath *m =
+ container_of(work, struct multipath, trigger_event);
dm_table_event(m->ti->table);
}
@@ -781,7 +804,7 @@ static int multipath_map(struct dm_target *ti, struct bio *bio,
map_context->ptr = mpio;
bio->bi_rw |= (1 << BIO_RW_FAILFAST);
r = map_io(m, bio, mpio, 0);
- if (r < 0)
+ if (r < 0 || r == DM_MAPIO_REQUEUE)
mempool_free(mpio, m->mpio_pool);
return r;
@@ -955,7 +978,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
/*
* pg_init must call this when it has completed its initialisation
*/
-void dm_pg_init_complete(struct path *path, unsigned err_flags)
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
{
struct pgpath *pgpath = path_to_pgpath(path);
struct priority_group *pg = pgpath->pg;
@@ -1005,7 +1028,10 @@ static int do_end_io(struct multipath *m, struct bio *bio,
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
- if (!m->queue_if_no_path) {
+ if (__must_push_back(m)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ return DM_ENDIO_REQUEUE;
+ } else if (!m->queue_if_no_path) {
spin_unlock_irqrestore(&m->lock, flags);
return -EIO;
} else {
@@ -1040,7 +1066,7 @@ static int do_end_io(struct multipath *m, struct bio *bio,
queue_work(kmultipathd, &m->process_queued_ios);
spin_unlock_irqrestore(&m->lock, flags);
- return 1; /* io not complete */
+ return DM_ENDIO_INCOMPLETE; /* io not complete */
}
static int multipath_end_io(struct dm_target *ti, struct bio *bio,
@@ -1058,7 +1084,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path);
}
- if (r <= 0)
+ if (r != DM_ENDIO_INCOMPLETE)
mempool_free(mpio, m->mpio_pool);
return r;
@@ -1270,7 +1296,7 @@ static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
struct dentry fake_dentry = {};
int r = 0;
- fake_file.f_dentry = &fake_dentry;
+ fake_file.f_path.dentry = &fake_dentry;
spin_lock_irqsave(&m->lock, flags);
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
index 8a4bf2b6d52e..b9cdcbb3ed59 100644
--- a/drivers/md/dm-mpath.h
+++ b/drivers/md/dm-mpath.h
@@ -11,7 +11,7 @@
struct dm_dev;
-struct path {
+struct dm_path {
struct dm_dev *dev; /* Read-only */
unsigned is_active; /* Read-only */
@@ -20,6 +20,6 @@ struct path {
};
/* Callback for hwh_pg_init_fn to use when complete */
-void dm_pg_init_complete(struct path *path, unsigned err_flags);
+void dm_pg_init_complete(struct dm_path *path, unsigned err_flags);
#endif
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 732d06a84f85..27357b85d73d 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -44,7 +44,7 @@ struct path_selector_type {
* Add an opaque path object, along with some selector specific
* path args (eg, path priority).
*/
- int (*add_path) (struct path_selector *ps, struct path *path,
+ int (*add_path) (struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error);
/*
@@ -55,27 +55,27 @@ struct path_selector_type {
* calling the function again. 0 means don't call it again unless
* the path fails.
*/
- struct path *(*select_path) (struct path_selector *ps,
+ struct dm_path *(*select_path) (struct path_selector *ps,
unsigned *repeat_count);
/*
* Notify the selector that a path has failed.
*/
- void (*fail_path) (struct path_selector *ps, struct path *p);
+ void (*fail_path) (struct path_selector *ps, struct dm_path *p);
/*
* Ask selector to reinstate a path.
*/
- int (*reinstate_path) (struct path_selector *ps, struct path *p);
+ int (*reinstate_path) (struct path_selector *ps, struct dm_path *p);
/*
* Table content based on parameters added in ps_add_path_fn
* or path selector status
*/
- int (*status) (struct path_selector *ps, struct path *path,
+ int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
- int (*end_io) (struct path_selector *ps, struct path *path);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path);
};
/* Register a path selector */
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 659224cb7c53..23a642619bed 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -24,6 +24,7 @@
static struct workqueue_struct *_kmirrord_wq;
static struct work_struct _kmirrord_work;
+static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
static inline void wake(void)
{
@@ -83,6 +84,7 @@ struct region_hash {
struct list_head *buckets;
spinlock_t region_lock;
+ atomic_t recovery_in_flight;
struct semaphore recovery_count;
struct list_head clean_regions;
struct list_head quiesced_regions;
@@ -191,6 +193,7 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms,
spin_lock_init(&rh->region_lock);
sema_init(&rh->recovery_count, 0);
+ atomic_set(&rh->recovery_in_flight, 0);
INIT_LIST_HEAD(&rh->clean_regions);
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
@@ -341,6 +344,17 @@ static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
}
}
+static void complete_resync_work(struct region *reg, int success)
+{
+ struct region_hash *rh = reg->rh;
+
+ rh->log->type->set_region_sync(rh->log, reg->key, success);
+ dispatch_bios(rh->ms, &reg->delayed_bios);
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ wake_up_all(&_kmirrord_recovery_stopped);
+ up(&rh->recovery_count);
+}
+
static void rh_update_states(struct region_hash *rh)
{
struct region *reg, *next;
@@ -380,9 +394,7 @@ static void rh_update_states(struct region_hash *rh)
*/
list_for_each_entry_safe (reg, next, &recovered, list) {
rh->log->type->clear_region(rh->log, reg->key);
- rh->log->type->complete_resync_work(rh->log, reg->key, 1);
- dispatch_bios(rh->ms, &reg->delayed_bios);
- up(&rh->recovery_count);
+ complete_resync_work(reg, 1);
mempool_free(reg, rh->region_pool);
}
@@ -502,11 +514,21 @@ static int __rh_recovery_prepare(struct region_hash *rh)
static void rh_recovery_prepare(struct region_hash *rh)
{
- while (!down_trylock(&rh->recovery_count))
+ /* Extra reference to avoid race with rh_stop_recovery */
+ atomic_inc(&rh->recovery_in_flight);
+
+ while (!down_trylock(&rh->recovery_count)) {
+ atomic_inc(&rh->recovery_in_flight);
if (__rh_recovery_prepare(rh) <= 0) {
+ atomic_dec(&rh->recovery_in_flight);
up(&rh->recovery_count);
break;
}
+ }
+
+ /* Drop the extra reference */
+ if (atomic_dec_and_test(&rh->recovery_in_flight))
+ wake_up_all(&_kmirrord_recovery_stopped);
}
/*
@@ -868,7 +890,7 @@ static void do_mirror(struct mirror_set *ms)
do_writes(ms, &writes);
}
-static void do_work(void *ignored)
+static void do_work(struct work_struct *ignored)
{
struct mirror_set *ms;
@@ -1122,7 +1144,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (rw == WRITE) {
queue_bio(ms, bio, rw);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
r = ms->rh.log->type->in_sync(ms->rh.log,
@@ -1131,7 +1153,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
return r;
if (r == -EWOULDBLOCK) /* FIXME: ugly */
- r = 0;
+ r = DM_MAPIO_SUBMITTED;
/*
* We don't want to fast track a recovery just for a read
@@ -1144,7 +1166,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (!r) {
/* Pass this io over to the daemon */
queue_bio(ms, bio, rw);
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
m = choose_mirror(ms, bio->bi_sector);
@@ -1152,7 +1174,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
return -EIO;
map_bio(ms, m, bio);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
@@ -1177,6 +1199,11 @@ static void mirror_postsuspend(struct dm_target *ti)
struct dirty_log *log = ms->rh.log;
rh_stop_recovery(&ms->rh);
+
+ /* Wait for all I/O we generated to complete */
+ wait_event(_kmirrord_recovery_stopped,
+ !atomic_read(&ms->rh.recovery_in_flight));
+
if (log->type->suspend && log->type->suspend(log))
/* FIXME: need better error handling */
DMWARN("log suspend failed");
@@ -1249,7 +1276,7 @@ static int __init dm_mirror_init(void)
dm_dirty_log_exit();
return r;
}
- INIT_WORK(&_kmirrord_work, do_work, NULL);
+ INIT_WORK(&_kmirrord_work, do_work);
r = dm_register_target(&mirror_target);
if (r < 0) {
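
The new recovery_in_flight counter closes a suspend race: mirror_postsuspend() must not return while recovery I/O the mirror generated is still in flight. rh_recovery_prepare() holds one extra reference across its whole loop, so the waiter can never observe a transient zero between two regions being prepared. The pattern, reduced to its core (more_regions_to_recover() is a hypothetical stand-in for the __rh_recovery_prepare() loop):

    atomic_inc(&rh->recovery_in_flight);            /* extra ref spanning the loop */
    while (more_regions_to_recover(rh))
            atomic_inc(&rh->recovery_in_flight);    /* one ref per region; dropped
                                                     * in complete_resync_work() */
    if (atomic_dec_and_test(&rh->recovery_in_flight))       /* drop the extra ref */
            wake_up_all(&_kmirrord_recovery_stopped);

    /* in mirror_postsuspend(): */
    wait_event(_kmirrord_recovery_stopped,
               !atomic_read(&ms->rh.recovery_in_flight));
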
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index c5a16c550122..a348a97b65af 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -21,7 +21,7 @@
*---------------------------------------------------------------*/
struct path_info {
struct list_head list;
- struct path *path;
+ struct dm_path *path;
unsigned repeat_count;
};
@@ -80,7 +80,7 @@ static void rr_destroy(struct path_selector *ps)
ps->context = NULL;
}
-static int rr_status(struct path_selector *ps, struct path *path,
+static int rr_status(struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen)
{
struct path_info *pi;
@@ -106,7 +106,7 @@ static int rr_status(struct path_selector *ps, struct path *path,
* Called during initialisation to register each path with an
* optional repeat_count.
*/
-static int rr_add_path(struct path_selector *ps, struct path *path,
+static int rr_add_path(struct path_selector *ps, struct dm_path *path,
int argc, char **argv, char **error)
{
struct selector *s = (struct selector *) ps->context;
@@ -136,12 +136,12 @@ static int rr_add_path(struct path_selector *ps, struct path *path,
path->pscontext = pi;
- list_add(&pi->list, &s->valid_paths);
+ list_add_tail(&pi->list, &s->valid_paths);
return 0;
}
-static void rr_fail_path(struct path_selector *ps, struct path *p)
+static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = p->pscontext;
@@ -149,7 +149,7 @@ static void rr_fail_path(struct path_selector *ps, struct path *p)
list_move(&pi->list, &s->invalid_paths);
}
-static int rr_reinstate_path(struct path_selector *ps, struct path *p)
+static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = p->pscontext;
@@ -159,7 +159,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct path *p)
return 0;
}
-static struct path *rr_select_path(struct path_selector *ps,
+static struct dm_path *rr_select_path(struct path_selector *ps,
unsigned *repeat_count)
{
struct selector *s = (struct selector *) ps->context;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5281e0094072..0821a2b68a73 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -39,8 +39,8 @@
*/
#define SNAPSHOT_PAGES 256
-struct workqueue_struct *ksnapd;
-static void flush_queued_bios(void *data);
+static struct workqueue_struct *ksnapd;
+static void flush_queued_bios(struct work_struct *work);
struct pending_exception {
struct exception e;
@@ -88,8 +88,8 @@ struct pending_exception {
* Hash table mapping origin volumes to lists of snapshots and
* a lock to protect it
*/
-static kmem_cache_t *exception_cache;
-static kmem_cache_t *pending_cache;
+static struct kmem_cache *exception_cache;
+static struct kmem_cache *pending_cache;
static mempool_t *pending_pool;
/*
@@ -228,7 +228,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size)
return 0;
}
-static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem)
{
struct list_head *slot;
struct exception *ex, *next;
@@ -528,7 +528,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
bio_list_init(&s->queued_bios);
- INIT_WORK(&s->queued_bios_work, flush_queued_bios, s);
+ INIT_WORK(&s->queued_bios_work, flush_queued_bios);
/* Add snapshot to the list of snapshots for this origin */
/* Exceptions aren't triggered till snapshot_resume() is called */
@@ -564,6 +564,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return r;
}
+static void __free_exceptions(struct dm_snapshot *s)
+{
+ kcopyd_client_destroy(s->kcopyd_client);
+ s->kcopyd_client = NULL;
+
+ exit_exception_table(&s->pending, pending_cache);
+ exit_exception_table(&s->complete, exception_cache);
+
+ s->store.destroy(&s->store);
+}
+
static void snapshot_dtr(struct dm_target *ti)
{
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
@@ -574,13 +585,7 @@ static void snapshot_dtr(struct dm_target *ti)
/* After this returns there can be no new kcopyd jobs. */
unregister_snapshot(s);
- kcopyd_client_destroy(s->kcopyd_client);
-
- exit_exception_table(&s->pending, pending_cache);
- exit_exception_table(&s->complete, exception_cache);
-
- /* Deallocate memory used */
- s->store.destroy(&s->store);
+ __free_exceptions(s);
dm_put_device(ti, s->origin);
dm_put_device(ti, s->cow);
@@ -603,9 +608,10 @@ static void flush_bios(struct bio *bio)
}
}
-static void flush_queued_bios(void *data)
+static void flush_queued_bios(struct work_struct *work)
{
- struct dm_snapshot *s = (struct dm_snapshot *) data;
+ struct dm_snapshot *s =
+ container_of(work, struct dm_snapshot, queued_bios_work);
struct bio *queued_bios;
unsigned long flags;
@@ -867,7 +873,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
{
struct exception *e;
struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
struct pending_exception *pe = NULL;
@@ -913,7 +919,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
remap_exception(s, &pe->e, bio);
bio_list_add(&pe->snapshot_bios, bio);
- r = 0;
+ r = DM_MAPIO_SUBMITTED;
if (!pe->started) {
/* this is protected by snap->lock */
@@ -991,7 +997,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
*---------------------------------------------------------------*/
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
- int r = 1, first = 0;
+ int r = DM_MAPIO_REMAPPED, first = 0;
struct dm_snapshot *snap;
struct exception *e;
struct pending_exception *pe, *next_pe, *primary_pe = NULL;
@@ -1049,7 +1055,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
bio_list_add(&primary_pe->origin_bios, bio);
- r = 0;
+ r = DM_MAPIO_SUBMITTED;
}
if (!pe->primary_pe) {
@@ -1098,7 +1104,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
struct origin *o;
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
@@ -1155,7 +1161,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
return -EOPNOTSUPP;
/* Only tell snapshots if this is a write */
- return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
+ return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
}
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 6c29fcecd892..51f5e0760012 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -186,7 +186,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
bio->bi_bdev = sc->stripe[stripe].dev->bdev;
bio->bi_sector = sc->stripe[stripe].physical_start +
(chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
- return 1;
+ return DM_MAPIO_REMAPPED;
}
static int stripe_status(struct dm_target *ti,
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index ea569f7348d2..f314d7dc9c26 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -46,7 +46,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio,
bio_endio(bio, bio->bi_size, 0);
/* accepted bio, don't make new request */
- return 0;
+ return DM_MAPIO_SUBMITTED;
}
static struct target_type zero_target = {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b5764a86c8b5..fe7c56e10435 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -68,10 +68,12 @@ union map_info *dm_get_mapinfo(struct bio *bio)
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
struct mapped_device {
struct rw_semaphore io_lock;
struct semaphore suspend_lock;
+ spinlock_t pushback_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
@@ -89,7 +91,8 @@ struct mapped_device {
*/
atomic_t pending;
wait_queue_head_t wait;
- struct bio_list deferred;
+ struct bio_list deferred;
+ struct bio_list pushback;
/*
* The current mapping.
@@ -121,8 +124,8 @@ struct mapped_device {
};
#define MIN_IOS 256
-static kmem_cache_t *_io_cache;
-static kmem_cache_t *_tio_cache;
+static struct kmem_cache *_io_cache;
+static struct kmem_cache *_tio_cache;
static int __init local_init(void)
{
@@ -444,23 +447,50 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
+static int __noflush_suspending(struct mapped_device *md)
+{
+ return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+}
+
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
static void dec_pending(struct dm_io *io, int error)
{
- if (error)
+ unsigned long flags;
+
+ /* Push-back supersedes any I/O errors */
+ if (error && !(io->error > 0 && __noflush_suspending(io->md)))
io->error = error;
if (atomic_dec_and_test(&io->io_count)) {
+ if (io->error == DM_ENDIO_REQUEUE) {
+ /*
+ * Target requested pushing back the I/O.
+ * This must be handled before the sleeper on
+ * suspend queue merges the pushback list.
+ */
+ spin_lock_irqsave(&io->md->pushback_lock, flags);
+ if (__noflush_suspending(io->md))
+ bio_list_add(&io->md->pushback, io->bio);
+ else
+ /* noflush suspend was interrupted. */
+ io->error = -EIO;
+ spin_unlock_irqrestore(&io->md->pushback_lock, flags);
+ }
+
if (end_io_acct(io))
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);
- blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+ if (io->error != DM_ENDIO_REQUEUE) {
+ blk_add_trace_bio(io->md->queue, io->bio,
+ BLK_TA_COMPLETE);
+
+ bio_endio(io->bio, io->bio->bi_size, io->error);
+ }
- bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
}
@@ -480,12 +510,19 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
if (endio) {
r = endio(tio->ti, bio, error, &tio->info);
- if (r < 0)
+ if (r < 0 || r == DM_ENDIO_REQUEUE)
+ /*
+ * error and requeue request are handled
+ * in dec_pending().
+ */
error = r;
-
- else if (r > 0)
- /* the target wants another shot at the io */
+ else if (r == DM_ENDIO_INCOMPLETE)
+ /* The target will handle the io */
return 1;
+ else if (r) {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
}
dec_pending(tio->io, error);
@@ -543,7 +580,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
- if (r > 0) {
+ if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
@@ -551,10 +588,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
clone->bi_sector);
generic_make_request(clone);
- }
-
- else if (r < 0) {
- /* error the io and bail out */
+ } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
+ /* error the io and bail out, or requeue it if needed */
md = tio->io->md;
dec_pending(tio->io, r);
/*
@@ -563,6 +598,9 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
+ } else if (r) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
}
}
@@ -948,6 +986,7 @@ static struct mapped_device *alloc_dev(int minor)
memset(md, 0, sizeof(*md));
init_rwsem(&md->io_lock);
init_MUTEX(&md->suspend_lock);
+ spin_lock_init(&md->pushback_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -966,8 +1005,8 @@ static struct mapped_device *alloc_dev(int minor)
md->queue->issue_flush_fn = dm_flush_all;
md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
- if (!md->io_pool)
- goto bad2;
+ if (!md->io_pool)
+ goto bad2;
md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool)
@@ -1275,20 +1314,30 @@ static void unlock_fs(struct mapped_device *md)
* dm_bind_table, dm_suspend must be called to flush any in
* flight bios and ensure that any further io gets deferred.
*/
-int dm_suspend(struct mapped_device *md, int do_lockfs)
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
+ unsigned long flags;
DECLARE_WAITQUEUE(wait, current);
struct bio *def;
int r = -EINVAL;
+ int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
+ int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
down(&md->suspend_lock);
if (dm_suspended(md))
- goto out;
+ goto out_unlock;
map = dm_get_table(md);
+ /*
+ * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
+ * This flag is cleared before dm_suspend returns.
+ */
+ if (noflush)
+ set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+
/* This does not get reverted if there's an error later. */
dm_table_presuspend_targets(map);
@@ -1296,11 +1345,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
if (!md->suspended_bdev) {
DMWARN("bdget failed in dm_suspend");
r = -ENOMEM;
- goto out;
+ goto flush_and_out;
}
- /* Flush I/O to the device. */
- if (do_lockfs) {
+ /*
+ * Flush I/O to the device.
+ * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
+ */
+ if (do_lockfs && !noflush) {
r = lock_fs(md);
if (r)
goto out;
@@ -1336,6 +1388,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
down_write(&md->io_lock);
remove_wait_queue(&md->wait, &wait);
+ if (noflush) {
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+ }
+
/* were we interrupted ? */
r = -EINTR;
if (atomic_read(&md->pending)) {
@@ -1344,7 +1404,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
__flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md);
- goto out;
+ goto out; /* pushback list is already flushed, so skip flush */
}
up_write(&md->io_lock);
@@ -1354,6 +1414,25 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
r = 0;
+flush_and_out:
+ if (r && noflush) {
+ /*
+ * Because there may already be I/Os in the pushback list,
+ * flush them before returning.
+ */
+ down_write(&md->io_lock);
+
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+
+ def = bio_list_get(&md->deferred);
+ __flush_deferred_io(md, def);
+ up_write(&md->io_lock);
+ }
+
out:
if (r && md->suspended_bdev) {
bdput(md->suspended_bdev);
@@ -1361,6 +1440,8 @@ out:
}
dm_table_put(map);
+
+out_unlock:
up(&md->suspend_lock);
return r;
}
@@ -1438,6 +1519,17 @@ int dm_suspended(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_noflush_suspending(struct dm_target *ti)
+{
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ int r = __noflush_suspending(md);
+
+ dm_put(md);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a48ec5e3c1f4..2f796b1436b2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -33,6 +33,25 @@
#define SECTOR_SHIFT 9
/*
+ * Definitions of return values from target end_io function.
+ */
+#define DM_ENDIO_INCOMPLETE 1
+#define DM_ENDIO_REQUEUE 2
+
+/*
+ * Definitions of return values from target map function.
+ */
+#define DM_MAPIO_SUBMITTED 0
+#define DM_MAPIO_REMAPPED 1
+#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
+
+/*
+ * Suspend feature flags
+ */
+#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
+#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
+
+/*
* List of devices that a metadevice uses and should open/close.
*/
struct dm_dev {
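
These constants give names to the bare 0/1/negative returns targets used before (compare the dm-linear, dm-zero, dm-snap and dm-mpath hunks above). The map contract, illustrated with a hypothetical pass-through target; some_error, queued_internally and real_bdev are stand-ins, not part of the patch:

    static int example_map(struct dm_target *ti, struct bio *bio,
                           union map_info *map_context)
    {
            if (some_error)                 /* hypothetical condition */
                    return -EIO;            /* fail the bio outright */
            if (queued_internally)          /* hypothetical condition */
                    return DM_MAPIO_SUBMITTED;      /* target owns the bio now */

            bio->bi_bdev = real_bdev;       /* hypothetical device */
            return DM_MAPIO_REMAPPED;       /* core calls generic_make_request() */
    }
    /* DM_MAPIO_REQUEUE additionally lets a target push the bio back to the
     * core during a noflush suspend (see the dm-mpath.c hunks above). */
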
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index f1db6eff4857..b46f6c575f7e 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -203,7 +203,7 @@ struct kcopyd_job {
/* FIXME: this should scale with the number of pages */
#define MIN_JOBS 512
-static kmem_cache_t *_job_cache;
+static struct kmem_cache *_job_cache;
static mempool_t *_job_pool;
/*
@@ -417,7 +417,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
/*
* kcopyd does this every time it's woken up.
*/
-static void do_work(void *ignored)
+static void do_work(struct work_struct *ignored)
{
/*
* The order that these are called is *very* important.
@@ -628,7 +628,7 @@ static int kcopyd_init(void)
}
kcopyd_clients++;
- INIT_WORK(&_kcopyd_work, do_work, NULL);
+ INIT_WORK(&_kcopyd_work, do_work);
mutex_unlock(&kcopyd_init_lock);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 57fa64f93e5f..21e2a7b08841 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -39,10 +39,10 @@
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
-#include <linux/suspend.h>
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
+#include <linux/freezer.h>
#include <linux/init.h>
@@ -974,12 +974,13 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
* version 1 superblock
*/
-static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
- unsigned int disk_csum, csum;
+ __le32 disk_csum;
+ u32 csum;
unsigned long long newcsum;
int size = 256 + le32_to_cpu(sb->max_dev)*2;
- unsigned int *isuper = (unsigned int*)sb;
+ __le32 *isuper = (__le32*)sb;
int i;
disk_csum = sb->sb_csum;
@@ -989,7 +990,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
newcsum += le32_to_cpu(*isuper++);
if (size == 2)
- newcsum += le16_to_cpu(*(unsigned short*) isuper);
+ newcsum += le16_to_cpu(*(__le16*) isuper);
csum = (newcsum & 0xffffffff) + (newcsum >> 32);
sb->sb_csum = disk_csum;
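
The checksum hunk adopts the sparse endianness annotations: values read from a version-1 superblock keep their __le32/__le64 types until the point of use. The super_1_load fix in the next hunk is the payoff: sb->size is a __le64, so reading it with le32_to_cpu() used the wrong width. The idiom in general:

    __le64 raw = sb->size;                  /* on-disk, little-endian */
    u64 sectors = le64_to_cpu(raw);         /* CPU order for arithmetic */
    sb->size = cpu_to_le64(sectors);        /* convert back before writing */
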
@@ -1106,7 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (le32_to_cpu(sb->chunksize))
rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
- if (le32_to_cpu(sb->size) > rdev->size*2)
+ if (le64_to_cpu(sb->size) > rdev->size*2)
return -EINVAL;
return ret;
}
@@ -1228,7 +1229,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
else
sb->resync_offset = cpu_to_le64(0);
- sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+ sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
sb->raid_disks = cpu_to_le32(mddev->raid_disks);
sb->size = cpu_to_le64(mddev->size<<1);
@@ -1412,7 +1413,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
if (IS_ERR(bdev)) {
printk(KERN_ERR "md: could not open %s.\n",
__bdevname(dev, b));
@@ -1422,7 +1423,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
if (err) {
printk(KERN_ERR "md: could not bd_claim %s.\n",
bdevname(bdev, b));
- blkdev_put_partition(bdev);
+ blkdev_put(bdev);
return err;
}
rdev->bdev = bdev;
@@ -1436,7 +1437,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
if (!bdev)
MD_BUG();
bd_release(bdev);
- blkdev_put_partition(bdev);
+ blkdev_put(bdev);
}
void md_autodetect_dev(dev_t dev);
@@ -2002,6 +2003,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
kobject_init(&rdev->kobj);
rdev->desc_nr = -1;
+ rdev->saved_raid_disk = -1;
rdev->flags = 0;
rdev->data_offset = 0;
rdev->sb_events = 0;
@@ -3198,6 +3200,7 @@ static int do_md_run(mddev_t * mddev)
mddev->changed = 1;
md_new_event(mddev);
+ kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
return 0;
}
@@ -3311,6 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode)
module_put(mddev->pers->owner);
mddev->pers = NULL;
+
+ set_capacity(disk, 0);
+ mddev->changed = 1;
+
if (mddev->ro)
mddev->ro = 0;
}
@@ -3330,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
if (mode == 0) {
mdk_rdev_t *rdev;
struct list_head *tmp;
- struct gendisk *disk;
+
printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
bitmap_destroy(mddev);
@@ -3355,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, int mode)
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
- disk = mddev->gendisk;
- if (disk)
- set_capacity(disk, 0);
- mddev->changed = 1;
} else if (mddev->pers)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
mdname(mddev));
@@ -3368,6 +3371,7 @@ out:
return err;
}
+#ifndef MODULE
static void autorun_array(mddev_t *mddev)
{
mdk_rdev_t *rdev;
@@ -3482,6 +3486,7 @@ static void autorun_devices(int part)
}
printk(KERN_INFO "md: ... autorun DONE.\n");
}
+#endif /* !MODULE */
static int get_version(void __user * arg)
{
@@ -3719,6 +3724,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (err)
export_rdev(rdev);
+ md_update_sb(mddev, 1);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return err;
@@ -4043,11 +4049,8 @@ static int update_size(mddev_t *mddev, unsigned long size)
return -EBUSY;
ITERATE_RDEV(mddev,rdev,tmp) {
sector_t avail;
- if (rdev->sb_offset > rdev->data_offset)
- avail = (rdev->sb_offset*2) - rdev->data_offset;
- else
- avail = get_capacity(rdev->bdev->bd_disk)
- - rdev->data_offset;
+ avail = rdev->size * 2;
+
if (fit && (size == 0 || size > avail/2))
size = avail/2;
if (avail < ((sector_t)size << 1))
@@ -4423,7 +4426,7 @@ static int md_open(struct inode *inode, struct file *file)
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
int err;
- if ((err = mddev_lock(mddev)))
+ if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
goto out;
err = 0;
@@ -4486,6 +4489,7 @@ static int md_thread(void * arg)
* many dirty RAID5 blocks.
*/
+ current->flags |= PF_NOFREEZE;
allow_signal(SIGKILL);
while (!kthread_should_stop()) {
@@ -4502,7 +4506,6 @@ static int md_thread(void * arg)
test_bit(THREAD_WAKEUP, &thread->flags)
|| kthread_should_stop(),
thread->timeout);
- try_to_freeze();
clear_bit(THREAD_WAKEUP, &thread->flags);
@@ -4846,8 +4849,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
chunk_kb ? "KB" : "B");
if (bitmap->file) {
seq_printf(seq, ", file: ");
- seq_path(seq, bitmap->file->f_vfsmnt,
- bitmap->file->f_dentry," \t\n");
+ seq_path(seq, bitmap->file->f_path.mnt,
+ bitmap->file->f_path.dentry," \t\n");
}
seq_printf(seq, "\n");
@@ -4912,6 +4915,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
}
static struct file_operations md_seq_fops = {
+ .owner = THIS_MODULE,
.open = md_seq_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -5272,7 +5276,6 @@ void md_do_sync(mddev_t *mddev)
mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
- test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -5296,6 +5299,7 @@ void md_do_sync(mddev_t *mddev)
rdev->recovery_offset = mddev->curr_resync;
}
}
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
skip:
mddev->curr_resync = 0;
@@ -5592,7 +5596,7 @@ static void autostart_arrays(int part)
autorun_devices(part);
}
-#endif
+#endif /* !MODULE */
static __exit void md_exit(void)
{
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 171ff41b52b0..14da37fee37b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -277,6 +277,7 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
conf->working_disks--;
+ mddev->degraded++;
printk(KERN_ALERT "multipath: IO failure on %s,"
" disabling IO path. \n Operation continuing"
" on %d IO paths.\n",
@@ -336,6 +337,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
conf->working_disks++;
+ mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
@@ -501,7 +503,7 @@ static int multipath_run (mddev_t *mddev)
mdname(mddev));
goto out_free_conf;
}
- mddev->degraded = conf->raid_disks = conf->working_disks;
+ mddev->degraded = conf->raid_disks - conf->working_disks;
conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
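
All three multipath.c hunks keep mddev->degraded consistent with the live path count; the old initialisation chained the assignment instead of taking the difference. The invariant these hunks restore, in sketch:

    conf->working_disks--;          /* path failed */
    mddev->degraded++;
    /* ... */
    conf->working_disks++;          /* path restored */
    mddev->degraded--;
    /* so that mddev->degraded == conf->raid_disks - conf->working_disks */
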
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index dc9d2def0270..b3c5e12f081d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1474,8 +1474,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
"raid1:%s: read error corrected "
"(%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)sect +
- rdev->data_offset,
+ (unsigned long long)(sect +
+ rdev->data_offset),
bdevname(rdev->bdev, b));
}
}
@@ -1951,6 +1951,7 @@ static int run(mddev_t *mddev)
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
+ conf->fullsync = 1;
}
}
if (mddev->degraded == conf->raid_disks) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1250f0eab4af..7492d6033ac6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1470,8 +1470,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)sect+
- rdev->data_offset,
+ (unsigned long long)(sect+
+ rdev->data_offset),
bdevname(rdev->bdev, b));
rdev_dec_pending(rdev, mddev);
@@ -2079,7 +2079,7 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i;
if (!disk->rdev ||
- !test_bit(In_sync, &rdev->flags)) {
+ !test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e14f45780720..377f8bc9b78b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
}
}
}
@@ -348,7 +350,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
static int grow_stripes(raid5_conf_t *conf, int num)
{
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int devs = conf->raid_disks;
sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
@@ -397,7 +399,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
LIST_HEAD(newstripes);
struct disk_info *ndisks;
int err = 0;
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
@@ -542,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
}
if (uptodate) {
-#if 0
- struct bio *bio;
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- /* we can return a buffer if we bypassed the cache or
- * if the top buffer is not in highmem. If there are
- * multiple buffers, leave the extra work to
- * handle_stripe
- */
- buffer = sh->bh_read[i];
- if (buffer &&
- (!PageHighMem(buffer->b_page)
- || buffer->b_page == bh->b_page )
- ) {
- sh->bh_read[i] = buffer->b_reqnext;
- buffer->b_reqnext = NULL;
- } else
- buffer = NULL;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (sh->bh_page[i]==bh->b_page)
- set_buffer_uptodate(bh);
- if (buffer) {
- if (buffer->b_page != bh->b_page)
- memcpy(buffer->b_data, bh->b_data, bh->b_size);
- buffer->b_end_io(buffer, 1);
- }
-#else
set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
rdev = conf->disks[i].rdev;
printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
@@ -616,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
}
}
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
- /* must restore b_page before unlocking buffer... */
- if (sh->bh_page[i] != bh->b_page) {
- bh->b_page = sh->bh_page[i];
- bh->b_data = page_address(bh->b_page);
- clear_buffer_uptodate(bh);
- }
-#endif
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -821,7 +787,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
raid5_conf_t *conf = sh->raid_conf;
- int raid_disks = sh->disks, data_disks = raid_disks - 1;
+ int raid_disks = sh->disks;
+ int data_disks = raid_disks - conf->max_degraded;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
sector_t stripe;
@@ -857,7 +824,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
}
break;
case 6:
- data_disks = raid_disks - 2;
if (i == raid6_next_disk(sh->pd_idx, raid_disks))
return 0; /* It is the Q disk */
switch (conf->algorithm) {
@@ -1353,8 +1319,10 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
int pd_idx, dd_idx;
int chunk_offset = sector_div(stripe, sectors_per_chunk);
- raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
- + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+ raid5_compute_sector(stripe * (disks - conf->max_degraded)
+ *sectors_per_chunk + chunk_offset,
+ disks, disks - conf->max_degraded,
+ &dd_idx, &pd_idx, conf);
return pd_idx;
}
@@ -1615,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh)
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
@@ -1641,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i]!=bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)
@@ -1655,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh)
/* Would I have to read this buffer for reconstruct_write */
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -1865,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
@@ -2193,15 +2148,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
@@ -2220,9 +2166,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
if (!test_bit(R5_OVERWRITE, &dev->flags)
&& i != pd_idx && i != qd_idx
&& (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
- || sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -2418,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
@@ -2611,6 +2556,180 @@ static int raid5_congested(void *data, int bits)
return 0;
}
+/* We want read requests to align with chunks where possible,
+ * but write requests don't need to.
+ */
+static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+{
+ mddev_t *mddev = q->queuedata;
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ int max;
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ if (bio_data_dir(bio))
+ return biovec->bv_len; /* always allow writes to be mergeable */
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+ if (max < 0) max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ else
+ return max;
+}
+
+
+static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ return chunk_sectors >=
+ ((sector & (chunk_sectors - 1)) + bio_sectors);
+}
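Both helpers above reduce to one piece of arithmetic: a request of bio_sectors starting at 'sector' stays inside its chunk iff (sector mod chunk_sectors) + bio_sectors <= chunk_sectors, and the mod is a mask because chunk sizes are powers of two. A standalone sketch with invented numbers (a 64KiB chunk is assumed purely for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int chunk_sectors = 128;	/* 64KiB chunk = 128 x 512B sectors */
	unsigned long long sector = 1000;	/* request start, in sectors */
	unsigned int bio_sectors = 16;		/* request length, in sectors */

	/* offset of the request inside its chunk (chunk_sectors is a power of two) */
	unsigned int offset = sector & (chunk_sectors - 1);

	/* same test as in_chunk_boundary() */
	int aligned = chunk_sectors >= offset + bio_sectors;

	/* same bound as raid5_mergeable_bvec(): bytes that may still be added */
	int max = (chunk_sectors - (offset + bio_sectors)) << 9;
	if (max < 0)
		max = 0;

	printf("offset=%u aligned=%d max_merge=%d bytes\n", offset, aligned, max);
	return 0;
}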
+
+/*
+ * Add a bio to the retry LIFO in O(1) (we may be called from interrupt
+ * context); it is later drained by raid5d.
+ */
+static void add_bio_to_retry(struct bio *bi, raid5_conf_t *conf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+
+ bi->bi_next = conf->retry_read_aligned_list;
+ conf->retry_read_aligned_list = bi;
+
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(conf->mddev->thread);
+}
+
+
+static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+{
+ struct bio *bi;
+
+ bi = conf->retry_read_aligned;
+ if (bi) {
+ conf->retry_read_aligned = NULL;
+ return bi;
+ }
+ bi = conf->retry_read_aligned_list;
+ if (bi) {
+ conf->retry_read_aligned = bi->bi_next;
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* biased count of active stripes */
+ bi->bi_hw_segments = 0; /* count of processed stripes */
+ }
+
+ return bi;
+}
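add_bio_to_retry() and remove_bio_from_retry() form an intrusive LIFO: the push reuses the bio's own bi_next link under device_lock, and the pop biases bi_phys_segments to 1 so the bio cannot complete while stripes are still being attached to it. A minimal userspace sketch of the same pattern, locking elided and struct names invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct req {
	struct req *next;	/* plays the role of bi_next */
	int refs;		/* plays the role of the biased bi_phys_segments */
};

static struct req *retry_list;	/* plays the role of retry_read_aligned_list */

static void push_retry(struct req *r)
{
	r->next = retry_list;	/* O(1) push, as in add_bio_to_retry() */
	retry_list = r;
}

static struct req *pop_retry(void)
{
	struct req *r = retry_list;
	if (r) {
		retry_list = r->next;
		r->next = NULL;
		r->refs = 1;	/* bias: the holder keeps one reference */
	}
	return r;
}

int main(void)
{
	struct req a = { NULL, 0 }, b = { NULL, 0 };
	push_retry(&a);
	push_retry(&b);		/* LIFO: b comes back first */
	printf("%s\n", pop_retry() == &b ? "b first" : "a first");
	return 0;
}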
+
+
+/*
+ * The "raid5_align_endio" should check if the read succeeded and if it
+ * did, call bio_endio on the original bio (having bio_put the new bio
+ * first).
+ * If the read failed, hand the original bio to the retry LIFO so that
+ * raid5d can reissue it through the stripe cache.
+ */
+static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+{
+ struct bio* raid_bi = bi->bi_private;
+ mddev_t *mddev;
+ raid5_conf_t *conf;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ mdk_rdev_t *rdev;
+
+ if (bi->bi_size)
+ return 1;
+ bio_put(bi);
+
+ mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
+ conf = mddev_to_conf(mddev);
+ rdev = (void*)raid_bi->bi_next;
+ raid_bi->bi_next = NULL;
+
+ rdev_dec_pending(rdev, conf->mddev);
+
+ if (!error && uptodate) {
+ bio_endio(raid_bi, bytes, 0);
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return 0;
+ }
+
+ PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
+
+ add_bio_to_retry(raid_bi, conf);
+ return 0;
+}
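A subtlety worth calling out: there is no spare private field to record which rdev the aligned read pinned, so chunk_aligned_read() below stashes the pointer in the original bio's idle bi_next, and raid5_align_endio() retrieves and clears it before dropping the reference. A hedged sketch of that borrow-an-idle-link pattern (all types here are invented):

#include <assert.h>
#include <stdio.h>

struct dev { int pending; };

struct req {
	struct req *next;	/* idle while the request is in flight */
};

static void submit(struct req *r, struct dev *d)
{
	d->pending++;				/* like atomic_inc(&rdev->nr_pending) */
	r->next = (struct req *)(void *)d;	/* stash context in the idle link */
}

static void end_io(struct req *r)
{
	struct dev *d = (struct dev *)(void *)r->next;

	r->next = NULL;				/* restore the field before reuse */
	d->pending--;				/* like rdev_dec_pending() */
}

int main(void)
{
	struct dev d = { 0 };
	struct req r = { NULL };

	submit(&r, &d);
	end_io(&r);
	assert(d.pending == 0 && r.next == NULL);
	printf("context round-tripped\n");
	return 0;
}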
+
+static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+{
+ mddev_t *mddev = q->queuedata;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+ const unsigned int raid_disks = conf->raid_disks;
+ const unsigned int data_disks = raid_disks - conf->max_degraded;
+ unsigned int dd_idx, pd_idx;
+ struct bio* align_bi;
+ mdk_rdev_t *rdev;
+
+ if (!in_chunk_boundary(mddev, raid_bio)) {
+ printk("chunk_aligned_read : non aligned\n");
+ return 0;
+ }
+ /*
+ * use bio_clone to make a copy of the bio
+ */
+ align_bi = bio_clone(raid_bio, GFP_NOIO);
+ if (!align_bi)
+ return 0;
+ /*
+ * set bi_end_io to a new function, and set bi_private to the
+ * original bio.
+ */
+ align_bi->bi_end_io = raid5_align_endio;
+ align_bi->bi_private = raid_bio;
+ /*
+ * compute position
+ */
+ align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
+ raid_disks,
+ data_disks,
+ &dd_idx,
+ &pd_idx,
+ conf);
+
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+ if (rdev && test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ raid_bio->bi_next = (void*)rdev;
+ align_bi->bi_bdev = rdev->bdev;
+ align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+ align_bi->bi_sector += rdev->data_offset;
+
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock, /* nothing */);
+ atomic_inc(&conf->active_aligned_reads);
+ spin_unlock_irq(&conf->device_lock);
+
+ generic_make_request(align_bi);
+ return 1;
+ } else {
+ rcu_read_unlock();
+ bio_put(align_bi);
+ return 0;
+ }
+}
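The rdev lookup above follows the usual RCU discipline: dereference under rcu_read_lock(), pin the object (nr_pending) while still inside the read-side section, only then unlock and issue I/O against it. A loose userspace approximation, using a pthread rwlock in place of RCU so the sketch stays self-contained (the kernel uses atomic_inc for the pin):

#include <pthread.h>
#include <stdio.h>

struct dev { int nr_pending; };

static pthread_rwlock_t lookup_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct dev *slot;	/* plays the role of conf->disks[dd_idx].rdev */

static struct dev *get_dev(void)
{
	struct dev *d;

	pthread_rwlock_rdlock(&lookup_lock);	/* stands in for rcu_read_lock() */
	d = slot;
	if (d)
		d->nr_pending++;	/* pin before leaving the read section */
	pthread_rwlock_unlock(&lookup_lock);
	return d;
}

int main(void)
{
	struct dev d0 = { 0 };
	struct dev *d;

	slot = &d0;
	d = get_dev();
	printf("pinned=%d\n", d ? d->nr_pending : -1);
	return 0;
}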
+
+
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -2632,6 +2751,11 @@ static int make_request(request_queue_t *q, struct bio * bi)
disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
+ if (bio_data_dir(bi) == READ &&
+ mddev->reshape_position == MaxSector &&
+ chunk_aligned_read(q, bi))
+ return 0;
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
@@ -2739,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
return 0;
}
@@ -2950,6 +3076,74 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return STRIPE_SECTORS;
}
+static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+{
+ /* We may not be able to submit a whole bio at once as there
+ * may not be enough stripe_heads available.
+ * We cannot pre-allocate enough stripe_heads as we may need
+ * more than exist in the cache (if we ever allow very large chunks).
+ * So we do one stripe head at a time and record in
+ * ->bi_hw_segments how many have been done.
+ *
+ * We *know* that this entire raid_bio is in one chunk, so
+ * it maps to a single 'dd_idx' and needs only one call to raid5_compute_sector.
+ */
+ struct stripe_head *sh;
+ int dd_idx, pd_idx;
+ sector_t sector, logical_sector, last_sector;
+ int scnt = 0;
+ int remaining;
+ int handled = 0;
+
+ logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ sector = raid5_compute_sector(logical_sector,
+ conf->raid_disks,
+ conf->raid_disks - conf->max_degraded,
+ &dd_idx,
+ &pd_idx,
+ conf);
+ last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS, scnt++) {
+
+ if (scnt < raid_bio->bi_hw_segments)
+ /* already done this stripe */
+ continue;
+
+ sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+
+ if (!sh) {
+ /* failed to get a stripe - must wait */
+ raid_bio->bi_hw_segments = scnt;
+ conf->retry_read_aligned = raid_bio;
+ return handled;
+ }
+
+ set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
+ add_stripe_bio(sh, raid_bio, dd_idx, 0);
+ handle_stripe(sh, NULL);
+ release_stripe(sh);
+ handled++;
+ }
+ spin_lock_irq(&conf->device_lock);
+ remaining = --raid_bio->bi_phys_segments;
+ spin_unlock_irq(&conf->device_lock);
+ if (remaining == 0) {
+ int bytes = raid_bio->bi_size;
+
+ raid_bio->bi_size = 0;
+ raid_bio->bi_end_io(raid_bio, bytes,
+ test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
+ ? 0 : -EIO);
+ }
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return handled;
+}
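retry_aligned_read() is a resumable loop: progress parks in the request itself (bi_hw_segments) so a later pass can skip stripes already handled, and the biased bi_phys_segments from remove_bio_from_retry() decides when the bio may finally complete. A compact sketch of just the resume logic, with a fake per-pass budget standing in for stripe_head availability:

#include <stdio.h>

struct req {
	int done;	/* plays the role of bi_hw_segments */
	int total;	/* stripes this request spans */
};

/* Pretend only 'budget' stripe_heads are available per pass. */
static int service(struct req *r, int budget)
{
	int handled = 0;

	for (; r->done < r->total; r->done++) {
		if (budget-- == 0)
			return handled;	/* park progress, retry later */
		handled++;
	}
	return handled;
}

int main(void)
{
	struct req r = { 0, 5 };

	printf("pass1=%d ", service(&r, 3));	/* handles 3, parks at done=3 */
	printf("pass2=%d\n", service(&r, 3));	/* finishes the remaining 2 */
	return 0;
}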
+
+
+
/*
* This is our raid5 kernel thread.
*
@@ -2971,6 +3165,7 @@ static void raid5d (mddev_t *mddev)
spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
+ struct bio *bio;
if (conf->seq_flush != conf->seq_write) {
int seq = conf->seq_flush;
@@ -2987,6 +3182,16 @@ static void raid5d (mddev_t *mddev)
!list_empty(&conf->delayed_list))
raid5_activate_delayed(conf);
+ while ((bio = remove_bio_from_retry(conf))) {
+ int ok;
+ spin_unlock_irq(&conf->device_lock);
+ ok = retry_aligned_read(conf, bio);
+ spin_lock_irq(&conf->device_lock);
+ if (!ok)
+ break;
+ handled++;
+ }
+
if (list_empty(&conf->handle_list))
break;
@@ -3174,6 +3379,7 @@ static int run(mddev_t *mddev)
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
+ atomic_set(&conf->active_aligned_reads, 0);
PRINTK("raid5: run(%s) called.\n", mdname(mddev));
@@ -3320,6 +3526,8 @@ static int run(mddev_t *mddev)
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
+ blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
return 0;
abort:
if (conf) {
@@ -3659,7 +3867,7 @@ static void end_reshape(raid5_conf_t *conf)
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+ i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
@@ -3694,7 +3902,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
spin_lock_irq(&conf->device_lock);
conf->quiesce = 1;
wait_event_lock_irq(conf->wait_for_stripe,
- atomic_read(&conf->active_stripes) == 0,
+ atomic_read(&conf->active_stripes) == 0 &&
+ atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
spin_unlock_irq(&conf->device_lock);
break;