diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 24 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 47 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 486 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 32 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 51 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 664 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 95 | ||||
-rw-r--r-- | drivers/md/dm-snap.h | 50 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 105 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 24 | ||||
-rw-r--r-- | drivers/md/dm.c | 238 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 398 | ||||
-rw-r--r-- | drivers/md/mktables.c | 187 | ||||
-rw-r--r-- | drivers/md/multipath.c | 2 | ||||
-rw-r--r-- | drivers/md/raid0.c | 8 | ||||
-rw-r--r-- | drivers/md/raid1.c | 5 | ||||
-rw-r--r-- | drivers/md/raid10.c | 7 | ||||
-rw-r--r-- | drivers/md/raid5.c | 48 | ||||
-rw-r--r-- | drivers/md/raid6test/test.c | 117 |
23 files changed, 1907 insertions, 689 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3fa7c77d9bd9..610af916891e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -204,7 +204,7 @@ config BLK_DEV_DM config DM_DEBUG boolean "Device mapper debugging support" - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM ---help--- Enable this for messages that may help debug device-mapper problems. @@ -212,7 +212,7 @@ config DM_DEBUG config DM_CRYPT tristate "Crypt target support" - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM select CRYPTO select CRYPTO_CBC ---help--- @@ -230,34 +230,34 @@ config DM_CRYPT If unsure, say N. config DM_SNAPSHOT - tristate "Snapshot target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Snapshot target" + depends on BLK_DEV_DM ---help--- Allow volume managers to take writable snapshots of a device. config DM_MIRROR - tristate "Mirror target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Mirror target" + depends on BLK_DEV_DM ---help--- Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. config DM_ZERO - tristate "Zero target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Zero target" + depends on BLK_DEV_DM ---help--- A target that discards writes, and returns all zeroes for reads. Useful in some recovery situations. config DM_MULTIPATH - tristate "Multipath target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "Multipath target" + depends on BLK_DEV_DM ---help--- Allow volume managers to support multipath hardware. config DM_MULTIPATH_EMC - tristate "EMC CX/AX multipath support (EXPERIMENTAL)" - depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL + tristate "EMC CX/AX multipath support" + depends on DM_MULTIPATH && BLK_DEV_DM ---help--- Multipath support for EMC CX/AX series hardware. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1b1ef3130e6e..7aeceedcf7d4 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -206,16 +206,10 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) /* copy the pathname of a file to a buffer */ char *file_path(struct file *file, char *buf, int count) { - struct dentry *d; - struct vfsmount *v; - if (!buf) return NULL; - d = file->f_path.dentry; - v = file->f_path.mnt; - - buf = d_path(d, v, buf, count); + buf = d_path(&file->f_path, buf, count); return IS_ERR(buf) ? NULL : buf; } @@ -237,7 +231,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde if (!page) return ERR_PTR(-ENOMEM); - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; @@ -261,7 +255,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct list_head *tmp; mddev_t *mddev = bitmap->mddev; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) if (test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { int size = PAGE_SIZE; @@ -1348,14 +1342,38 @@ void bitmap_close_sync(struct bitmap *bitmap) */ sector_t sector = 0; int blocks; - if (!bitmap) return; + if (!bitmap) + return; while (sector < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, sector, &blocks, 0); -/* - if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n", - (unsigned long long)sector, blocks); -*/ sector += blocks; + sector += blocks; + } +} + +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +{ + sector_t s = 0; + int blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap->last_end_sync = jiffies; + return; + } + if (time_before(jiffies, (bitmap->last_end_sync + + bitmap->daemon_sleep * HZ))) + return; + wait_event(bitmap->mddev->recovery_wait, + atomic_read(&bitmap->mddev->recovery_active) == 0); + + sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); + s = 0; + while (s < sector && s < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, s, &blocks, 0); + s += blocks; } + bitmap->last_end_sync = jiffies; } static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) @@ -1565,3 +1583,4 @@ EXPORT_SYMBOL(bitmap_start_sync); EXPORT_SYMBOL(bitmap_end_sync); EXPORT_SYMBOL(bitmap_unplug); EXPORT_SYMBOL(bitmap_close_sync); +EXPORT_SYMBOL(bitmap_cond_end_sync); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6b66ee46b87d..b04f98df94ea 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,11 +1,12 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ +#include <linux/completion.h> #include <linux/err.h> #include <linux/module.h> #include <linux/init.h> @@ -28,20 +29,10 @@ #define MESG_STR(x) x, sizeof(x) /* - * per bio private data - */ -struct dm_crypt_io { - struct dm_target *target; - struct bio *base_bio; - struct work_struct work; - atomic_t pending; - int error; -}; - -/* * context holding the current state of a multi-part conversion */ struct convert_context { + struct completion restart; struct bio *bio_in; struct bio *bio_out; unsigned int offset_in; @@ -49,7 +40,27 @@ struct convert_context { unsigned int idx_in; unsigned int idx_out; sector_t sector; - int write; + atomic_t pending; +}; + +/* + * per bio private data + */ +struct dm_crypt_io { + struct dm_target *target; + struct bio *base_bio; + struct work_struct work; + + struct convert_context ctx; + + atomic_t pending; + int error; + sector_t sector; +}; + +struct dm_crypt_request { + struct scatterlist sg_in; + struct scatterlist sg_out; }; struct crypt_config; @@ -72,10 +83,11 @@ struct crypt_config { sector_t start; /* - * pool for per bio private data and - * for encryption buffer pages + * pool for per bio private data, crypto requests and + * encryption requeusts/buffer pages */ mempool_t *io_pool; + mempool_t *req_pool; mempool_t *page_pool; struct bio_set *bs; @@ -93,9 +105,25 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; + /* + * Layout of each crypto request: + * + * struct ablkcipher_request + * context + * padding + * struct dm_crypt_request + * padding + * IV + * + * The padding is added so that dm_crypt_request and the IV are + * correctly aligned. + */ + unsigned int dmreq_start; + struct ablkcipher_request *req; + char cipher[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME]; - struct crypto_blkcipher *tfm; + struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; u8 key[0]; @@ -108,6 +136,7 @@ struct crypt_config { static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); +static void kcryptd_queue_crypt(struct dm_crypt_io *io); /* * Different IV generation algorithms: @@ -188,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, return PTR_ERR(essiv_tfm); } if (crypto_cipher_blocksize(essiv_tfm) != - crypto_blkcipher_ivsize(cc->tfm)) { + crypto_ablkcipher_ivsize(cc->tfm)) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -225,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -289,42 +318,10 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; -static int -crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, - struct scatterlist *in, unsigned int length, - int write, sector_t sector) -{ - u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64)))); - struct blkcipher_desc desc = { - .tfm = cc->tfm, - .info = iv, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP, - }; - int r; - - if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, sector); - if (r < 0) - return r; - - if (write) - r = crypto_blkcipher_encrypt_iv(&desc, out, in, length); - else - r = crypto_blkcipher_decrypt_iv(&desc, out, in, length); - } else { - if (write) - r = crypto_blkcipher_encrypt(&desc, out, in, length); - else - r = crypto_blkcipher_decrypt(&desc, out, in, length); - } - - return r; -} - static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, - sector_t sector, int write) + sector_t sector) { ctx->bio_in = bio_in; ctx->bio_out = bio_out; @@ -333,7 +330,79 @@ static void crypt_convert_init(struct crypt_config *cc, ctx->idx_in = bio_in ? bio_in->bi_idx : 0; ctx->idx_out = bio_out ? bio_out->bi_idx : 0; ctx->sector = sector + cc->iv_offset; - ctx->write = write; + init_completion(&ctx->restart); + /* + * Crypto operation can be asynchronous, + * ctx->pending is increased after request submission. + * We need to ensure that we don't call the crypt finish + * operation before pending got incremented + * (dependent on crypt submission return code). + */ + atomic_set(&ctx->pending, 2); +} + +static int crypt_convert_block(struct crypt_config *cc, + struct convert_context *ctx, + struct ablkcipher_request *req) +{ + struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); + struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); + struct dm_crypt_request *dmreq; + u8 *iv; + int r = 0; + + dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start); + iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(cc->tfm) + 1); + + sg_init_table(&dmreq->sg_in, 1); + sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, + bv_in->bv_offset + ctx->offset_in); + + sg_init_table(&dmreq->sg_out, 1); + sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, + bv_out->bv_offset + ctx->offset_out); + + ctx->offset_in += 1 << SECTOR_SHIFT; + if (ctx->offset_in >= bv_in->bv_len) { + ctx->offset_in = 0; + ctx->idx_in++; + } + + ctx->offset_out += 1 << SECTOR_SHIFT; + if (ctx->offset_out >= bv_out->bv_len) { + ctx->offset_out = 0; + ctx->idx_out++; + } + + if (cc->iv_gen_ops) { + r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + if (r < 0) + return r; + } + + ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, + 1 << SECTOR_SHIFT, iv); + + if (bio_data_dir(ctx->bio_in) == WRITE) + r = crypto_ablkcipher_encrypt(req); + else + r = crypto_ablkcipher_decrypt(req); + + return r; +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error); +static void crypt_alloc_req(struct crypt_config *cc, + struct convert_context *ctx) +{ + if (!cc->req) + cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + ablkcipher_request_set_tfm(cc->req, cc->tfm); + ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, ctx); } /* @@ -346,36 +415,38 @@ static int crypt_convert(struct crypt_config *cc, while(ctx->idx_in < ctx->bio_in->bi_vcnt && ctx->idx_out < ctx->bio_out->bi_vcnt) { - struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); - struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); - struct scatterlist sg_in, sg_out; - - sg_init_table(&sg_in, 1); - sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in); - - sg_init_table(&sg_out, 1); - sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out); - ctx->offset_in += sg_in.length; - if (ctx->offset_in >= bv_in->bv_len) { - ctx->offset_in = 0; - ctx->idx_in++; + crypt_alloc_req(cc, ctx); + + r = crypt_convert_block(cc, ctx, cc->req); + + switch (r) { + case -EBUSY: + wait_for_completion(&ctx->restart); + INIT_COMPLETION(ctx->restart); + /* fall through*/ + case -EINPROGRESS: + atomic_inc(&ctx->pending); + cc->req = NULL; + r = 0; + /* fall through*/ + case 0: + ctx->sector++; + continue; } - ctx->offset_out += sg_out.length; - if (ctx->offset_out >= bv_out->bv_len) { - ctx->offset_out = 0; - ctx->idx_out++; - } - - r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, - ctx->write, ctx->sector); - if (r < 0) - break; - - ctx->sector++; + break; } + /* + * If there are pending crypto operation run async + * code. Otherwise process return code synchronously. + * The step of 2 ensures that async finish doesn't + * call crypto finish too early. + */ + if (atomic_sub_return(2, &ctx->pending)) + return -EINPROGRESS; + return r; } @@ -455,18 +526,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. */ -static void crypt_dec_pending(struct dm_crypt_io *io, int error) +static void crypt_dec_pending(struct dm_crypt_io *io) { - struct crypt_config *cc = (struct crypt_config *) io->target->private; - - if (error < 0) - io->error = error; + struct crypt_config *cc = io->target->private; if (!atomic_dec_and_test(&io->pending)) return; bio_endio(io->base_bio, io->error); - mempool_free(io, cc->io_pool); } @@ -484,30 +551,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io, int error) * starved by new requests which can block in the first stages due * to memory allocation. */ -static void kcryptd_do_work(struct work_struct *work); -static void kcryptd_do_crypt(struct work_struct *work); - -static void kcryptd_queue_io(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_do_work); - queue_work(cc->io_queue, &io->work); -} - -static void kcryptd_queue_crypt(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; - - INIT_WORK(&io->work, kcryptd_do_crypt); - queue_work(cc->crypt_queue, &io->work); -} - static void crypt_endio(struct bio *clone, int error) { struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->target->private; - unsigned read_io = bio_data_dir(clone) == READ; + unsigned rw = bio_data_dir(clone); if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) error = -EIO; @@ -515,21 +563,20 @@ static void crypt_endio(struct bio *clone, int error) /* * free the processed pages */ - if (!read_io) { + if (rw == WRITE) crypt_free_buffer_pages(cc, clone); - goto out; + + bio_put(clone); + + if (rw == READ && !error) { + kcryptd_queue_crypt(io); + return; } if (unlikely(error)) - goto out; - - bio_put(clone); - kcryptd_queue_crypt(io); - return; + io->error = error; -out: - bio_put(clone); - crypt_dec_pending(io, error); + crypt_dec_pending(io); } static void clone_init(struct dm_crypt_io *io, struct bio *clone) @@ -543,12 +590,11 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void process_read(struct dm_crypt_io *io) +static void kcryptd_io_read(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - sector_t sector = base_bio->bi_sector - io->target->begin; atomic_inc(&io->pending); @@ -559,7 +605,8 @@ static void process_read(struct dm_crypt_io *io) */ clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); if (unlikely(!clone)) { - crypt_dec_pending(io, -ENOMEM); + io->error = -ENOMEM; + crypt_dec_pending(io); return; } @@ -567,25 +614,71 @@ static void process_read(struct dm_crypt_io *io) clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); clone->bi_size = base_bio->bi_size; - clone->bi_sector = cc->start + sector; + clone->bi_sector = cc->start + io->sector; memcpy(clone->bi_io_vec, bio_iovec(base_bio), sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); } -static void process_write(struct dm_crypt_io *io) +static void kcryptd_io_write(struct dm_crypt_io *io) +{ + struct bio *clone = io->ctx.bio_out; + + generic_make_request(clone); +} + +static void kcryptd_io(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + + if (bio_data_dir(io->base_bio) == READ) + kcryptd_io_read(io); + else + kcryptd_io_write(io); +} + +static void kcryptd_queue_io(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; - struct bio *base_bio = io->base_bio; - struct bio *clone; - struct convert_context ctx; - unsigned remaining = base_bio->bi_size; - sector_t sector = base_bio->bi_sector - io->target->begin; - atomic_inc(&io->pending); + INIT_WORK(&io->work, kcryptd_io); + queue_work(cc->io_queue, &io->work); +} - crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, + int error, int async) +{ + struct bio *clone = io->ctx.bio_out; + struct crypt_config *cc = io->target->private; + + if (unlikely(error < 0)) { + crypt_free_buffer_pages(cc, clone); + bio_put(clone); + io->error = -EIO; + return; + } + + /* crypt_convert should have filled the clone bio */ + BUG_ON(io->ctx.idx_out < clone->bi_vcnt); + + clone->bi_sector = cc->start + io->sector; + io->sector += bio_sectors(clone); + + if (async) + kcryptd_queue_io(io); + else { + atomic_inc(&io->pending); + generic_make_request(clone); + } +} + +static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *clone; + unsigned remaining = io->base_bio->bi_size; + int r; /* * The allocated buffers can be smaller than the whole bio, @@ -594,70 +687,110 @@ static void process_write(struct dm_crypt_io *io) while (remaining) { clone = crypt_alloc_buffer(io, remaining); if (unlikely(!clone)) { - crypt_dec_pending(io, -ENOMEM); + io->error = -ENOMEM; return; } - ctx.bio_out = clone; - ctx.idx_out = 0; + io->ctx.bio_out = clone; + io->ctx.idx_out = 0; - if (unlikely(crypt_convert(cc, &ctx) < 0)) { - crypt_free_buffer_pages(cc, clone); - bio_put(clone); - crypt_dec_pending(io, -EIO); - return; - } - - /* crypt_convert should have filled the clone bio */ - BUG_ON(ctx.idx_out < clone->bi_vcnt); - - clone->bi_sector = cc->start + sector; remaining -= clone->bi_size; - sector += bio_sectors(clone); - /* Grab another reference to the io struct - * before we kick off the request */ - if (remaining) - atomic_inc(&io->pending); + r = crypt_convert(cc, &io->ctx); - generic_make_request(clone); - - /* Do not reference clone after this - it - * may be gone already. */ + if (r != -EINPROGRESS) { + kcryptd_crypt_write_io_submit(io, r, 0); + if (unlikely(r < 0)) + return; + } else + atomic_inc(&io->pending); /* out of memory -> run queues */ - if (remaining) + if (unlikely(remaining)) congestion_wait(WRITE, HZ/100); } } -static void process_read_endio(struct dm_crypt_io *io) +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->target->private; - struct convert_context ctx; - crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, - io->base_bio->bi_sector - io->target->begin, 0); + /* + * Prevent io from disappearing until this function completes. + */ + atomic_inc(&io->pending); + + crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); + kcryptd_crypt_write_convert_loop(io); - crypt_dec_pending(io, crypt_convert(cc, &ctx)); + crypt_dec_pending(io); } -static void kcryptd_do_work(struct work_struct *work) +static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) { - struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + if (unlikely(error < 0)) + io->error = -EIO; + + crypt_dec_pending(io); +} + +static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + int r = 0; + + atomic_inc(&io->pending); + + crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, + io->sector); + + r = crypt_convert(cc, &io->ctx); + + if (r != -EINPROGRESS) + kcryptd_crypt_read_done(io, r); + + crypt_dec_pending(io); +} + +static void kcryptd_async_done(struct crypto_async_request *async_req, + int error) +{ + struct convert_context *ctx = async_req->data; + struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); + struct crypt_config *cc = io->target->private; + + if (error == -EINPROGRESS) { + complete(&ctx->restart); + return; + } + + mempool_free(ablkcipher_request_cast(async_req), cc->req_pool); + + if (!atomic_dec_and_test(&ctx->pending)) + return; if (bio_data_dir(io->base_bio) == READ) - process_read(io); + kcryptd_crypt_read_done(io, error); + else + kcryptd_crypt_write_io_submit(io, error, 1); } -static void kcryptd_do_crypt(struct work_struct *work) +static void kcryptd_crypt(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); if (bio_data_dir(io->base_bio) == READ) - process_read_endio(io); + kcryptd_crypt_read_convert(io); else - process_write(io); + kcryptd_crypt_write_convert(io); +} + +static void kcryptd_queue_crypt(struct dm_crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + + INIT_WORK(&io->work, kcryptd_crypt); + queue_work(cc->crypt_queue, &io->work); } /* @@ -733,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc) static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - struct crypto_blkcipher *tfm; + struct crypto_ablkcipher *tfm; char *tmp; char *cipher; char *chainmode; @@ -787,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_cipher; } - tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); if (IS_ERR(tfm)) { ti->error = "Error allocating crypto tfm"; goto bad_cipher; @@ -821,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) goto bad_ivmode; - cc->iv_size = crypto_blkcipher_ivsize(tfm); + cc->iv_size = crypto_ablkcipher_ivsize(tfm); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -841,6 +974,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_slab_pool; } + cc->dmreq_start = sizeof(struct ablkcipher_request); + cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); + cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & + ~(crypto_tfm_ctx_alignment() - 1); + + cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + + sizeof(struct dm_crypt_request) + cc->iv_size); + if (!cc->req_pool) { + ti->error = "Cannot allocate crypt request mempool"; + goto bad_req_pool; + } + cc->req = NULL; + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; @@ -853,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_bs; } - if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { + if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { ti->error = "Error setting key"; goto bad_device; } @@ -914,12 +1061,14 @@ bad_device: bad_bs: mempool_destroy(cc->page_pool); bad_page_pool: + mempool_destroy(cc->req_pool); +bad_req_pool: mempool_destroy(cc->io_pool); bad_slab_pool: if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); bad_ivmode: - crypto_free_blkcipher(tfm); + crypto_free_ablkcipher(tfm); bad_cipher: /* Must zero key material before freeing */ memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); @@ -934,14 +1083,18 @@ static void crypt_dtr(struct dm_target *ti) destroy_workqueue(cc->io_queue); destroy_workqueue(cc->crypt_queue); + if (cc->req) + mempool_free(cc->req, cc->req_pool); + bioset_free(cc->bs); mempool_destroy(cc->page_pool); + mempool_destroy(cc->req_pool); mempool_destroy(cc->io_pool); kfree(cc->iv_mode); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - crypto_free_blkcipher(cc->tfm); + crypto_free_ablkcipher(cc->tfm); dm_put_device(ti, cc->dev); /* Must zero key material before freeing */ @@ -958,6 +1111,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; io->base_bio = bio; + io->sector = bio->bi_sector - ti->begin; io->error = 0; atomic_set(&io->pending, 0); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 8fe81e1807e0..5bbce29f143a 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store) static int persistent_read_metadata(struct exception_store *store) { - int r, new_snapshot; + int r, uninitialized_var(new_snapshot); struct pstore *ps = get_info(store); /* diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 9627fa0f9470..b262c0042de3 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/dm-ioctl.h> #include <linux/hdreg.h> +#include <linux/compat.h> #include <asm/uaccess.h> @@ -702,7 +703,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) int r; char *new_name = (char *) param + param->data_start; - if (new_name < (char *) param->data || + if (new_name < param->data || invalid_str(new_name, (void *) param + param_size)) { DMWARN("Invalid new logical volume name supplied."); return -EINVAL; @@ -728,7 +729,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) if (!md) return -ENXIO; - if (geostr < (char *) param->data || + if (geostr < param->data || invalid_str(geostr, (void *) param + param_size)) { DMWARN("Invalid geometry supplied."); goto out; @@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) { struct dm_ioctl tmp, *dmi; - if (copy_from_user(&tmp, user, sizeof(tmp))) + if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) return -EFAULT; - if (tmp.data_size < sizeof(tmp)) + if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) return -EINVAL; dmi = vmalloc(tmp.data_size); @@ -1397,13 +1398,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) return 0; } -static int ctl_ioctl(struct inode *inode, struct file *file, - uint command, ulong u) +static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; unsigned int cmd; - struct dm_ioctl *param; - struct dm_ioctl __user *user = (struct dm_ioctl __user *) u; + struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; size_t param_size; @@ -1471,8 +1470,23 @@ static int ctl_ioctl(struct inode *inode, struct file *file, return r; } +static long dm_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); +} + +#ifdef CONFIG_COMPAT +static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) +{ + return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define dm_compat_ctl_ioctl NULL +#endif + static const struct file_operations _ctl_fops = { - .ioctl = ctl_ioctl, + .unlocked_ioctl = dm_ctl_ioctl, + .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, }; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 072ee4353eab..2a74b2142f50 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type) return 0; } -static struct dirty_log_type *get_type(const char *type_name) +static struct dirty_log_type *_get_type(const char *type_name) { struct dirty_log_type *type; @@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name) return NULL; } +/* + * get_type + * @type_name + * + * Attempt to retrieve the dirty_log_type by name. If not already + * available, attempt to load the appropriate module. + * + * Log modules are named "dm-log-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-log-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-disk", it would search + * 'dm-log-clustered-disk' then 'dm-log-clustered'. + * + * Returns: dirty_log_type* on success, NULL on failure + */ +static struct dirty_log_type *get_type(const char *type_name) +{ + char *p, *type_name_dup; + struct dirty_log_type *type; + + type = _get_type(type_name); + if (type) + return type; + + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMWARN("No memory left to attempt log module load for \"%s\"", + type_name); + return NULL; + } + + while (request_module("dm-log-%s", type_name_dup) || + !(type = _get_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; + } + + if (!type) + DMWARN("Module for logging type \"%s\" not found.", type_name); + + kfree(type_name_dup); + + return type; +} + static void put_type(struct dirty_log_type *type) { spin_lock(&_lock); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 24b2b1e32fae..e7ee59e655d5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath); static struct kmem_cache *_mpio_cache; -struct workqueue_struct *kmultipathd; +static struct workqueue_struct *kmultipathd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 31123d4a6b9c..2928ef228101 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-list.h" +#include "dm-bio-record.h" #include "dm-io.h" #include "dm-log.h" #include "kcopyd.h" @@ -20,6 +21,7 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/log2.h> +#include <linux/hardirq.h> #define DM_MSG_PREFIX "raid1" #define DM_IO_PAGES 64 @@ -113,9 +115,16 @@ struct region { /*----------------------------------------------------------------- * Mirror set structures. *---------------------------------------------------------------*/ +enum dm_raid1_error { + DM_RAID1_WRITE_ERROR, + DM_RAID1_SYNC_ERROR, + DM_RAID1_READ_ERROR +}; + struct mirror { struct mirror_set *ms; atomic_t error_count; + unsigned long error_type; struct dm_dev *dev; sector_t offset; }; @@ -127,21 +136,25 @@ struct mirror_set { struct kcopyd_client *kcopyd_client; uint64_t features; - spinlock_t lock; /* protects the next two lists */ + spinlock_t lock; /* protects the lists */ struct bio_list reads; struct bio_list writes; + struct bio_list failures; struct dm_io_client *io_client; + mempool_t *read_record_pool; /* recovery */ region_t nr_regions; int in_sync; int log_failure; + atomic_t suspend; - struct mirror *default_mirror; /* Default mirror */ + atomic_t default_mirror; /* Default mirror */ struct workqueue_struct *kmirrord_wq; struct work_struct kmirrord_work; + struct work_struct trigger_event; unsigned int nr_mirrors; struct mirror mirror[0]; @@ -362,6 +375,16 @@ static void complete_resync_work(struct region *reg, int success) struct region_hash *rh = reg->rh; rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ dispatch_bios(rh->ms, ®->delayed_bios); if (atomic_dec_and_test(&rh->recovery_in_flight)) wake_up_all(&_kmirrord_recovery_stopped); @@ -626,24 +649,101 @@ static void rh_start_recovery(struct region_hash *rh) wake(rh->ms); } +#define MIN_READ_RECORDS 20 +struct dm_raid1_read_record { + struct mirror *m; + struct dm_bio_details details; +}; + /* * Every mirror should look like this one. */ #define DEFAULT_MIRROR 0 /* - * This is yucky. We squirrel the mirror_set struct away inside - * bi_next for write buffers. This is safe since the bh + * This is yucky. We squirrel the mirror struct away inside + * bi_next for read/write buffers. This is safe since the bh * doesn't get submitted to the lower levels of block layer. */ -static struct mirror_set *bio_get_ms(struct bio *bio) +static struct mirror *bio_get_m(struct bio *bio) +{ + return (struct mirror *) bio->bi_next; +} + +static void bio_set_m(struct bio *bio, struct mirror *m) +{ + bio->bi_next = (struct bio *) m; +} + +static struct mirror *get_default_mirror(struct mirror_set *ms) { - return (struct mirror_set *) bio->bi_next; + return &ms->mirror[atomic_read(&ms->default_mirror)]; } -static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +static void set_default_mirror(struct mirror *m) { - bio->bi_next = (struct bio *) ms; + struct mirror_set *ms = m->ms; + struct mirror *m0 = &(ms->mirror[0]); + + atomic_set(&ms->default_mirror, m - m0); +} + +/* fail_mirror + * @m: mirror device to fail + * @error_type: one of the enum's, DM_RAID1_*_ERROR + * + * If errors are being handled, record the type of + * error encountered for this device. If this type + * of error has already been recorded, we can return; + * otherwise, we must signal userspace by triggering + * an event. Additionally, if the device is the + * primary device, we must choose a new primary, but + * only if the mirror is in-sync. + * + * This function must not block. + */ +static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) +{ + struct mirror_set *ms = m->ms; + struct mirror *new; + + if (!errors_handled(ms)) + return; + + /* + * error_count is used for nothing more than a + * simple way to tell if a device has encountered + * errors. + */ + atomic_inc(&m->error_count); + + if (test_and_set_bit(error_type, &m->error_type)) + return; + + if (m != get_default_mirror(ms)) + goto out; + + if (!ms->in_sync) { + /* + * Better to issue requests to same failing device + * than to risk returning corrupt data. + */ + DMERR("Primary mirror (%s) failed while out-of-sync: " + "Reads may fail.", m->dev->name); + goto out; + } + + for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) + if (!atomic_read(&new->error_count)) { + set_default_mirror(new); + break; + } + + if (unlikely(new == ms->mirror + ms->nr_mirrors)) + DMWARN("All sides of mirror have failed."); + +out: + schedule_work(&ms->trigger_event); } /*----------------------------------------------------------------- @@ -656,15 +756,32 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms) static void recovery_complete(int read_err, unsigned int write_err, void *context) { - struct region *reg = (struct region *) context; + struct region *reg = (struct region *)context; + struct mirror_set *ms = reg->rh->ms; + int m, bit = 0; - if (read_err) + if (read_err) { /* Read error means the failure of default mirror. */ DMERR_LIMIT("Unable to read primary mirror during recovery"); + fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); + } - if (write_err) + if (write_err) { DMERR_LIMIT("Write error during recovery (error = 0x%x)", write_err); + /* + * Bits correspond to devices (excluding default mirror). + * The default mirror cannot change during recovery. + */ + for (m = 0; m < ms->nr_mirrors; m++) { + if (&ms->mirror[m] == get_default_mirror(ms)) + continue; + if (test_bit(bit, &write_err)) + fail_mirror(ms->mirror + m, + DM_RAID1_SYNC_ERROR); + bit++; + } + } rh_recovery_end(reg, !(read_err || write_err)); } @@ -678,7 +795,7 @@ static int recover(struct mirror_set *ms, struct region *reg) unsigned long flags = 0; /* fill in the source */ - m = ms->default_mirror; + m = get_default_mirror(ms); from.bdev = m->dev->bdev; from.sector = m->offset + region_to_sector(reg->rh, reg->key); if (reg->key == (ms->nr_regions - 1)) { @@ -694,7 +811,7 @@ static int recover(struct mirror_set *ms, struct region *reg) /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == ms->default_mirror) + if (&ms->mirror[i] == get_default_mirror(ms)) continue; m = ms->mirror + i; @@ -748,17 +865,105 @@ static void do_recovery(struct mirror_set *ms) *---------------------------------------------------------------*/ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) { - /* FIXME: add read balancing */ - return ms->default_mirror; + struct mirror *m = get_default_mirror(ms); + + do { + if (likely(!atomic_read(&m->error_count))) + return m; + + if (m-- == ms->mirror) + m += ms->nr_mirrors; + } while (m != get_default_mirror(ms)); + + return NULL; +} + +static int default_ok(struct mirror *m) +{ + struct mirror *default_mirror = get_default_mirror(m->ms); + + return !atomic_read(&default_mirror->error_count); +} + +static int mirror_available(struct mirror_set *ms, struct bio *bio) +{ + region_t region = bio_to_region(&ms->rh, bio); + + if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) + return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + + return 0; } /* * remap a buffer to a particular mirror. */ -static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +static sector_t map_sector(struct mirror *m, struct bio *bio) +{ + return m->offset + (bio->bi_sector - m->ms->ti->begin); +} + +static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); + bio->bi_sector = map_sector(m, bio); +} + +static void map_region(struct io_region *io, struct mirror *m, + struct bio *bio) +{ + io->bdev = m->dev->bdev; + io->sector = map_sector(m, bio); + io->count = bio->bi_size >> 9; +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static void read_callback(unsigned long error, void *context) +{ + struct bio *bio = context; + struct mirror *m; + + m = bio_get_m(bio); + bio_set_m(bio, NULL); + + if (likely(!error)) { + bio_endio(bio, 0); + return; + } + + fail_mirror(m, DM_RAID1_READ_ERROR); + + if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { + DMWARN_LIMIT("Read failure on mirror device %s. " + "Trying alternative device.", + m->dev->name); + queue_bio(m->ms, bio, bio_rw(bio)); + return; + } + + DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", + m->dev->name); + bio_endio(bio, -EIO); +} + +/* Asynchronous read. */ +static void read_async_bio(struct mirror *m, struct bio *bio) +{ + struct io_region io; + struct dm_io_request io_req = { + .bi_rw = READ, + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = read_callback, + .notify.context = bio, + .client = m->ms->io_client, + }; + + map_region(&io, m, bio); + bio_set_m(bio, m); + (void) dm_io(&io_req, 1, &io, NULL); } static void do_reads(struct mirror_set *ms, struct bio_list *reads) @@ -769,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) while ((bio = bio_list_pop(reads))) { region = bio_to_region(&ms->rh, bio); + m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (rh_in_sync(&ms->rh, region, 1)) + if (likely(rh_in_sync(&ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); - else - m = ms->default_mirror; + else if (m && atomic_read(&m->error_count)) + m = NULL; - map_bio(ms, m, bio); - generic_make_request(bio); + if (likely(m)) + read_async_bio(m, bio); + else + bio_endio(bio, -EIO); } } @@ -793,15 +1001,70 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) * RECOVERING: delay the io until recovery completes * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ + +/* __bio_mark_nosync + * @ms + * @bio + * @done + * @error + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +static void __bio_mark_nosync(struct mirror_set *ms, + struct bio *bio, unsigned done, int error) +{ + unsigned long flags; + struct region_hash *rh = &ms->rh; + struct dirty_log *log = ms->rh.log; + struct region *reg; + region_t region = bio_to_region(rh, bio); + int recovering = 0; + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + ms->in_sync = 0; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) RH_DIRTY + * 2) RH_NOSYNC: was dirty, other preceeding writes failed + * 3) RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == RH_RECOVERING); + reg->state = RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + bio_endio(bio, error); + if (recovering) + complete_resync_work(reg, 0); +} + static void write_callback(unsigned long error, void *context) { - unsigned int i; - int uptodate = 1; + unsigned i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; + int uptodate = 0; + int should_wake = 0; + unsigned long flags; - ms = bio_get_ms(bio); - bio_set_ms(bio, NULL); + ms = bio_get_m(bio)->ms; + bio_set_m(bio, NULL); /* * NOTE: We don't decrement the pending count here, @@ -809,26 +1072,42 @@ static void write_callback(unsigned long error, void *context) * This way we handle both writes to SYNC and NOSYNC * regions with the same code. */ + if (likely(!error)) + goto out; + + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error)) + fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); + else + uptodate = 1; - if (error) { + if (unlikely(!uptodate)) { + DMERR("All replicated volumes dead, failing I/O"); + /* None of the writes succeeded, fail the I/O. */ + ret = -EIO; + } else if (errors_handled(ms)) { /* - * only error the io if all mirrors failed. - * FIXME: bogus + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. */ - uptodate = 0; - for (i = 0; i < ms->nr_mirrors; i++) - if (!test_bit(i, &error)) { - uptodate = 1; - break; - } + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wake(ms); + return; } - bio_endio(bio, 0); +out: + bio_endio(bio, ret); } static void do_write(struct mirror_set *ms, struct bio *bio) { unsigned int i; - struct io_region io[KCOPYD_MAX_REGIONS+1]; + struct io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { .bi_rw = WRITE, @@ -839,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; - for (i = 0; i < ms->nr_mirrors; i++) { - m = ms->mirror + i; - - io[i].bdev = m->dev->bdev; - io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); - io[i].count = bio->bi_size >> 9; - } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) + map_region(dest++, m, bio); - bio_set_ms(bio, ms); + /* + * Use default mirror because we only need it to retrieve the reference + * to the mirror set in write_callback(). + */ + bio_set_m(bio, get_default_mirror(ms)); (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); } @@ -900,43 +1178,125 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /* * Dispatch io. */ - if (unlikely(ms->log_failure)) + if (unlikely(ms->log_failure)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, &sync); + spin_unlock_irq(&ms->lock); + } else while ((bio = bio_list_pop(&sync))) - bio_endio(bio, -EIO); - else while ((bio = bio_list_pop(&sync))) - do_write(ms, bio); + do_write(ms, bio); while ((bio = bio_list_pop(&recover))) rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, ms->default_mirror, bio); + map_bio(get_default_mirror(ms), bio); generic_make_request(bio); } } +static void do_failures(struct mirror_set *ms, struct bio_list *failures) +{ + struct bio *bio; + + if (!failures->head) + return; + + if (!ms->log_failure) { + while ((bio = bio_list_pop(failures))) + __bio_mark_nosync(ms, bio, bio->bi_size, 0); + return; + } + + /* + * If the log has failed, unattempted writes are being + * put on the failures list. We can't issue those writes + * until a log has been marked, so we must store them. + * + * If a 'noflush' suspend is in progress, we can requeue + * the I/O's to the core. This give userspace a chance + * to reconfigure the mirror, at which point the core + * will reissue the writes. If the 'noflush' flag is + * not set, we have no choice but to return errors. + * + * Some writes on the failures list may have been + * submitted before the log failure and represent a + * failure to write to one of the devices. It is ok + * for us to treat them the same and requeue them + * as well. + */ + if (dm_noflush_suspending(ms->ti)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, DM_ENDIO_REQUEUE); + return; + } + + if (atomic_read(&ms->suspend)) { + while ((bio = bio_list_pop(failures))) + bio_endio(bio, -EIO); + return; + } + + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->failures, failures); + spin_unlock_irq(&ms->lock); + + wake(ms); +} + +static void trigger_event(struct work_struct *work) +{ + struct mirror_set *ms = + container_of(work, struct mirror_set, trigger_event); + + dm_table_event(ms->ti->table); +} + /*----------------------------------------------------------------- * kmirrord *---------------------------------------------------------------*/ -static void do_mirror(struct work_struct *work) +static int _do_mirror(struct work_struct *work) { struct mirror_set *ms =container_of(work, struct mirror_set, kmirrord_work); - struct bio_list reads, writes; + struct bio_list reads, writes, failures; + unsigned long flags; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); reads = ms->reads; writes = ms->writes; + failures = ms->failures; bio_list_init(&ms->reads); bio_list_init(&ms->writes); - spin_unlock(&ms->lock); + bio_list_init(&ms->failures); + spin_unlock_irqrestore(&ms->lock, flags); rh_update_states(&ms->rh); do_recovery(ms); do_reads(ms, &reads); do_writes(ms, &writes); + do_failures(ms, &failures); + + return (ms->failures.head) ? 1 : 0; } +static void do_mirror(struct work_struct *work) +{ + /* + * If _do_mirror returns 1, we give it + * another shot. This helps for cases like + * 'suspend' where we call flush_workqueue + * and expect all work to be finished. If + * a failure happens during a suspend, we + * couldn't issue a 'wake' because it would + * not be honored. Therefore, we return '1' + * from _do_mirror, and retry here. + */ + while (_do_mirror(work)) + schedule(); +} + + /*----------------------------------------------------------------- * Target functions *---------------------------------------------------------------*/ @@ -965,11 +1325,23 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->nr_mirrors = nr_mirrors; ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; - ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; + ms->log_failure = 0; + atomic_set(&ms->suspend, 0); + atomic_set(&ms->default_mirror, DEFAULT_MIRROR); + + len = sizeof(struct dm_raid1_read_record); + ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, + len); + if (!ms->read_record_pool) { + ti->error = "Error creating mirror read_record_pool"; + kfree(ms); + return NULL; + } ms->io_client = dm_io_client_create(DM_IO_PAGES); if (IS_ERR(ms->io_client)) { ti->error = "Error creating dm_io client"; + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -977,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); + mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -992,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, dm_io_client_destroy(ms->io_client); rh_exit(&ms->rh); + mempool_destroy(ms->read_record_pool); kfree(ms); } @@ -1019,6 +1393,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, } ms->mirror[mirror].ms = ms; + atomic_set(&(ms->mirror[mirror].error_count), 0); + ms->mirror[mirror].error_type = 0; ms->mirror[mirror].offset = offset; return 0; @@ -1171,6 +1547,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_free_context; } INIT_WORK(&ms->kmirrord_work, do_mirror); + INIT_WORK(&ms->trigger_event, trigger_event); r = parse_features(ms, argc, argv, &args_used); if (r) @@ -1220,14 +1597,15 @@ static void mirror_dtr(struct dm_target *ti) static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { + unsigned long flags; int should_wake = 0; struct bio_list *bl; bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock(&ms->lock); + spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock(&ms->lock); + spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wake(ms); @@ -1242,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - - map_context->ll = bio_to_region(&ms->rh, bio); + struct dm_raid1_read_record *read_record = NULL; if (rw == WRITE) { + /* Save region for mirror_end_io() handler */ + map_context->ll = bio_to_region(&ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } @@ -1255,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (r < 0 && r != -EWOULDBLOCK) return r; - if (r == -EWOULDBLOCK) /* FIXME: ugly */ - r = DM_MAPIO_SUBMITTED; - /* - * We don't want to fast track a recovery just for a read - * ahead. So we just let it silently fail. - * FIXME: get rid of this. + * If region is not in-sync queue the bio. */ - if (!r && rw == READA) - return -EIO; + if (!r || (r == -EWOULDBLOCK)) { + if (rw == READA) + return -EWOULDBLOCK; - if (!r) { - /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } + /* + * The region is in-sync and we can perform reads directly. + * Store enough information so we can retry if it fails. + */ m = choose_mirror(ms, bio->bi_sector); - if (!m) + if (unlikely(!m)) return -EIO; - map_bio(ms, m, bio); + read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); + if (likely(read_record)) { + dm_bio_record(&read_record->details, bio); + map_context->ptr = read_record; + read_record->m = m; + } + + map_bio(m, bio); + return DM_MAPIO_REMAPPED; } @@ -1285,71 +1670,173 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; - region_t region = map_context->ll; + struct mirror *m = NULL; + struct dm_bio_details *bd = NULL; + struct dm_raid1_read_record *read_record = map_context->ptr; /* * We need to dec pending if this was a write. */ - if (rw == WRITE) - rh_dec(&ms->rh, region); + if (rw == WRITE) { + rh_dec(&ms->rh, map_context->ll); + return error; + } - return 0; + if (error == -EOPNOTSUPP) + goto out; + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + goto out; + + if (unlikely(error)) { + if (!read_record) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed from %s.", + m->dev->name); + return -EIO; + } + DMERR("Mirror read failed from %s. Trying alternative device.", + m->dev->name); + + m = read_record->m; + fail_mirror(m, DM_RAID1_READ_ERROR); + + /* + * A failed read is requeued for another attempt using an intact + * mirror. + */ + if (default_ok(m) || mirror_available(ms, bio)) { + bd = &read_record->details; + + dm_bio_restore(bd, bio); + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + queue_bio(ms, bio, rw); + return 1; + } + DMERR("All replicated volumes dead, failing I/O"); + } + +out: + if (read_record) { + mempool_free(read_record, ms->read_record_pool); + map_context->ptr = NULL; + } + + return error; } -static void mirror_postsuspend(struct dm_target *ti) +static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; struct dirty_log *log = ms->rh.log; + atomic_set(&ms->suspend, 1); + + /* + * We must finish up all the work that we've + * generated (i.e. recovery work). + */ rh_stop_recovery(&ms->rh); - /* Wait for all I/O we generated to complete */ wait_event(_kmirrord_recovery_stopped, !atomic_read(&ms->rh.recovery_in_flight)); + if (log->type->presuspend && log->type->presuspend(log)) + /* FIXME: need better error handling */ + DMWARN("log presuspend failed"); + + /* + * Now that recovery is complete/stopped and the + * delayed bios are queued, we need to wait for + * the worker thread to complete. This way, + * we know that all of our I/O has been pushed. + */ + flush_workqueue(ms->kmirrord_wq); +} + +static void mirror_postsuspend(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + struct dirty_log *log = ms->rh.log; + if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ - DMWARN("log suspend failed"); + DMWARN("log postsuspend failed"); } static void mirror_resume(struct dm_target *ti) { - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct dirty_log *log = ms->rh.log; + + atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); rh_start_recovery(&ms->rh); } +/* + * device_status_char + * @m: mirror device/leg we want the status of + * + * We return one character representing the most severe error + * we have encountered. + * A => Alive - No failures + * D => Dead - A write failure occurred leaving mirror out-of-sync + * S => Sync - A sychronization failure occurred, mirror out-of-sync + * R => Read - A read failure occurred, mirror data unaffected + * + * Returns: <char> + */ +static char device_status_char(struct mirror *m) +{ + if (!atomic_read(&(m->error_count))) + return 'A'; + + return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : + (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; +} + + static int mirror_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; + struct dirty_log *log = ms->rh.log; + char buffer[ms->nr_mirrors + 1]; switch (type) { case STATUSTYPE_INFO: DMEMIT("%d ", ms->nr_mirrors); - for (m = 0; m < ms->nr_mirrors; m++) + for (m = 0; m < ms->nr_mirrors; m++) { DMEMIT("%s ", ms->mirror[m].dev->name); + buffer[m] = device_status_char(&(ms->mirror[m])); + } + buffer[m] = '\0'; - DMEMIT("%llu/%llu 0 ", - (unsigned long long)ms->rh.log->type-> - get_sync_count(ms->rh.log), - (unsigned long long)ms->nr_regions); + DMEMIT("%llu/%llu 1 %s ", + (unsigned long long)log->type->get_sync_count(ms->rh.log), + (unsigned long long)ms->nr_regions, buffer); - sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); + sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); break; case STATUSTYPE_TABLE: - sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + sz = log->type->status(ms->rh.log, type, result, maxlen); DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) DMEMIT(" %s %llu", ms->mirror[m].dev->name, - (unsigned long long)ms->mirror[m].offset); + (unsigned long long)ms->mirror[m].offset); if (ms->features & DM_RAID1_HANDLE_ERRORS) DMEMIT(" 1 handle_errors"); @@ -1360,12 +1847,13 @@ static int mirror_status(struct dm_target *ti, status_type_t type, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 3}, + .version = {1, 0, 20}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, .end_io = mirror_end_io, + .presuspend = mirror_presuspend, .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index cee16fadd9ee..ae24eab8cd81 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s) /* * Implementation of the exception hash tables. + * The lowest hash_shift bits of the chunk number are ignored, allowing + * some consecutive chunks to be grouped together. */ -static int init_exception_table(struct exception_table *et, uint32_t size) +static int init_exception_table(struct exception_table *et, uint32_t size, + unsigned hash_shift) { unsigned int i; + et->hash_shift = hash_shift; et->hash_mask = size - 1; et->table = dm_vcalloc(size, sizeof(struct list_head)); if (!et->table) @@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) { - return chunk & et->hash_mask; + return (chunk >> et->hash_shift) & et->hash_mask; } static void insert_exception(struct exception_table *eh, @@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, slot = &et->table[exception_hash(et, chunk)]; list_for_each_entry (e, slot, hash_list) - if (e->old_chunk == chunk) + if (chunk >= e->old_chunk && + chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; return NULL; @@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) mempool_free(pe, pending_pool); } +static void insert_completed_exception(struct dm_snapshot *s, + struct dm_snap_exception *new_e) +{ + struct exception_table *eh = &s->complete; + struct list_head *l; + struct dm_snap_exception *e = NULL; + + l = &eh->table[exception_hash(eh, new_e->old_chunk)]; + + /* Add immediately if this table doesn't support consecutive chunks */ + if (!eh->hash_shift) + goto out; + + /* List is ordered by old_chunk */ + list_for_each_entry_reverse(e, l, hash_list) { + /* Insert after an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk + + dm_consecutive_chunk_count(e) + 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) + + dm_consecutive_chunk_count(e) + 1)) { + dm_consecutive_chunk_count_inc(e); + free_exception(new_e); + return; + } + + /* Insert before an existing chunk? */ + if (new_e->old_chunk == (e->old_chunk - 1) && + new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { + dm_consecutive_chunk_count_inc(e); + e->old_chunk--; + e->new_chunk--; + free_exception(new_e); + return; + } + + if (new_e->old_chunk > e->old_chunk) + break; + } + +out: + list_add(&new_e->hash_list, e ? &e->hash_list : l); +} + int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) { struct dm_snap_exception *e; @@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) return -ENOMEM; e->old_chunk = old; + + /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - insert_exception(&s->complete, e); + + insert_completed_exception(s, e); + return 0; } @@ -334,16 +386,6 @@ static int calc_max_buckets(void) } /* - * Rounds a number down to a power of 2. - */ -static uint32_t round_down(uint32_t n) -{ - while (n & (n - 1)) - n &= (n - 1); - return n; -} - -/* * Allocate room for a suitable hash table. */ static int init_hash_tables(struct dm_snapshot *s) @@ -361,9 +403,9 @@ static int init_hash_tables(struct dm_snapshot *s) hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; hash_size = min(hash_size, max_buckets); - /* Round it down to a power of 2 */ - hash_size = round_down(hash_size); - if (init_exception_table(&s->complete, hash_size)) + hash_size = rounddown_pow_of_two(hash_size); + if (init_exception_table(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) return -ENOMEM; /* @@ -374,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s) if (hash_size < 64) hash_size = 64; - if (init_exception_table(&s->pending, hash_size)) { + if (init_exception_table(&s->pending, hash_size, 0)) { exit_exception_table(&s->complete, exception_cache); return -ENOMEM; } @@ -733,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) * Add a proper exception, and remove the * in-flight exception from the list. */ - insert_exception(&s->complete, e); + insert_completed_exception(s, e); out: remove_exception(&pe->e); @@ -867,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) } static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, - struct bio *bio) + struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s, e->new_chunk) + - (bio->bi_sector & s->chunk_mask); + bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_sector & s->chunk_mask); } static int snapshot_map(struct dm_target *ti, struct bio *bio, @@ -902,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, /* If the block is already remapped - use that, else remap it */ e = lookup_exception(&s->complete, chunk); if (e) { - remap_exception(s, e, bio); + remap_exception(s, e, bio, chunk); goto out_unlock; } @@ -919,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, goto out_unlock; } - remap_exception(s, &pe->e, bio); + remap_exception(s, &pe->e, bio, chunk); bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; @@ -1207,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1218,7 +1261,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index 650e0f1f51d8..93bce5d49742 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -16,19 +16,22 @@ struct exception_table { uint32_t hash_mask; + unsigned hash_shift; struct list_head *table; }; /* * The snapshot code deals with largish chunks of the disk at a - * time. Typically 64k - 256k. + * time. Typically 32k - 512k. */ -/* FIXME: can we get away with limiting these to a uint32_t ? */ typedef sector_t chunk_t; /* * An exception is used where an old chunk of data has been * replaced by a new one. + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number + * of chunks that follow contiguously. Remaining bits hold the number of the + * chunk within the device. */ struct dm_snap_exception { struct list_head hash_list; @@ -38,6 +41,49 @@ struct dm_snap_exception { }; /* + * Funtions to manipulate consecutive chunks + */ +# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# define DM_CHUNK_CONSECUTIVE_BITS 8 +# define DM_CHUNK_NUMBER_BITS 56 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +{ + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +{ + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); + + BUG_ON(!dm_consecutive_chunk_count(e)); +} + +# else +# define DM_CHUNK_CONSECUTIVE_BITS 0 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk; +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +{ + return 0; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +{ +} + +# endif + +/* * Abstraction to handle the meta/layout of exception stores (the * COW device). */ diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 969944a8aba2..4de90ab3968b 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -14,10 +14,13 @@ #include <linux/log2.h> #define DM_MSG_PREFIX "striped" +#define DM_IO_ERROR_THRESHOLD 15 struct stripe { struct dm_dev *dev; sector_t physical_start; + + atomic_t error_count; }; struct stripe_c { @@ -30,9 +33,29 @@ struct stripe_c { uint32_t chunk_shift; sector_t chunk_mask; + /* Needed for handling events */ + struct dm_target *ti; + + /* Work struct used for triggering events*/ + struct work_struct kstriped_ws; + struct stripe stripe[0]; }; +static struct workqueue_struct *kstriped; + +/* + * An event is triggered whenever a drive + * drops out of a stripe volume. + */ +static void trigger_event(struct work_struct *work) +{ + struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); + + dm_table_event(sc->ti->table); + +} + static inline struct stripe_c *alloc_context(unsigned int stripes) { size_t len; @@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, return -ENXIO; sc->stripe[stripe].physical_start = start; + return 0; } @@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } + INIT_WORK(&sc->kstriped_ws, trigger_event); + + /* Set pointer to dm target; used in trigger_event */ + sc->ti = ti; + sc->stripes = stripes; sc->stripe_width = width; ti->split_io = chunk_size; @@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) kfree(sc); return r; } + atomic_set(&(sc->stripe[i].error_count), 0); } ti->private = sc; + return 0; } @@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); + flush_workqueue(kstriped); kfree(sc); } @@ -190,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, return DM_MAPIO_REMAPPED; } +/* + * Stripe status: + * + * INFO + * #stripes [stripe_name <stripe_name>] [group word count] + * [error count 'A|D' <error count 'A|D'>] + * + * TABLE + * #stripes [stripe chunk size] + * [stripe_name physical_start <stripe_name physical_start>] + * + */ + static int stripe_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct stripe_c *sc = (struct stripe_c *) ti->private; + char buffer[sc->stripes + 1]; unsigned int sz = 0; unsigned int i; switch (type) { case STATUSTYPE_INFO: - result[0] = '\0'; + DMEMIT("%d ", sc->stripes); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%s ", sc->stripe[i].dev->name); + buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? + 'D' : 'A'; + } + buffer[i] = '\0'; + DMEMIT("1 %s", buffer); break; case STATUSTYPE_TABLE: @@ -213,13 +266,52 @@ static int stripe_status(struct dm_target *ti, return 0; } +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + unsigned i; + char major_minor[16]; + struct stripe_c *sc = ti->private; + + if (!error) + return 0; /* I/O complete */ + + if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + return error; + + if (error == -EOPNOTSUPP) + return error; + + memset(major_minor, 0, sizeof(major_minor)); + sprintf(major_minor, "%d:%d", + bio->bi_bdev->bd_disk->major, + bio->bi_bdev->bd_disk->first_minor); + + /* + * Test to see which stripe drive triggered the event + * and increment error count for all stripes on that device. + * If the error count for a given device exceeds the threshold + * value we will no longer trigger any further events. + */ + for (i = 0; i < sc->stripes; i++) + if (!strcmp(sc->stripe[i].dev->name, major_minor)) { + atomic_inc(&(sc->stripe[i].error_count)); + if (atomic_read(&(sc->stripe[i].error_count)) < + DM_IO_ERROR_THRESHOLD) + queue_work(kstriped, &sc->kstriped_ws); + } + + return error; +} + static struct target_type stripe_target = { .name = "striped", - .version= {1, 0, 2}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, + .end_io = stripe_end_io, .status = stripe_status, }; @@ -231,6 +323,13 @@ int __init dm_stripe_init(void) if (r < 0) DMWARN("target registration failed"); + kstriped = create_singlethread_workqueue("kstriped"); + if (!kstriped) { + DMERR("failed to create workqueue kstriped"); + dm_unregister_target(&stripe_target); + return -ENOMEM; + } + return r; } @@ -239,5 +338,7 @@ void dm_stripe_exit(void) if (dm_unregister_target(&stripe_target)) DMWARN("target unregistration failed"); + destroy_workqueue(kstriped); + return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 47818d8249cb..e75b1437b58b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices) { struct list_head *tmp, *next; - for (tmp = devices->next; tmp != devices; tmp = next) { + list_for_each_safe(tmp, next, devices) { struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); - next = tmp->next; kfree(dd); } } @@ -362,7 +361,7 @@ static int lookup_device(const char *path, dev_t *dev) if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd))) return r; - inode = nd.dentry->d_inode; + inode = nd.path.dentry->d_inode; if (!inode) { r = -ENOENT; goto out; @@ -376,7 +375,7 @@ static int lookup_device(const char *path, dev_t *dev) *dev = inode->i_rdev; out: - path_release(&nd); + path_put(&nd.path); return r; } @@ -476,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, int mode, struct dm_dev **result) { int r; - dev_t dev; + dev_t uninitialized_var(dev); struct dm_dev *dd; unsigned int major, minor; @@ -805,7 +804,7 @@ static int setup_indexes(struct dm_table *t) return -ENOMEM; /* set up internal nodes, bottom-up */ - for (i = t->depth - 2, total = 0; i >= 0; i--) { + for (i = t->depth - 2; i >= 0; i--) { t->index[i] = indexes; indexes += (KEYS_PER_NODE * t->counts[i]); setup_btree_index(i, t); @@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t) int dm_table_any_congested(struct dm_table *t, int bdi_bits) { - struct list_head *d, *devices; + struct dm_dev *dd; + struct list_head *devices = dm_table_get_devices(t); int r = 0; - devices = dm_table_get_devices(t); - for (d = devices->next; d != devices; d = d->next) { - struct dm_dev *dd = list_entry(d, struct dm_dev, list); + list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->bdev); r |= bdi_congested(&q->backing_dev_info, bdi_bits); } @@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) void dm_table_unplug_all(struct dm_table *t) { - struct list_head *d, *devices = dm_table_get_devices(t); + struct dm_dev *dd; + struct list_head *devices = dm_table_get_devices(t); - for (d = devices->next; d != devices; d = d->next) { - struct dm_dev *dd = list_entry(d, struct dm_dev, list); + list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->bdev); blk_unplug(q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2d24eb3208c..6617ce4af095 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,9 +71,22 @@ union map_info *dm_get_mapinfo(struct bio *bio) #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +/* + * Work processed by per-device workqueue. + */ +struct dm_wq_req { + enum { + DM_WQ_FLUSH_ALL, + DM_WQ_FLUSH_DEFERRED, + } type; + struct work_struct work; + struct mapped_device *md; + void *context; +}; + struct mapped_device { struct rw_semaphore io_lock; - struct semaphore suspend_lock; + struct mutex suspend_lock; spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; @@ -96,6 +109,11 @@ struct mapped_device { struct bio_list pushback; /* + * Processing queue (flush/barriers) + */ + struct workqueue_struct *wq; + + /* * The current mapping. */ struct dm_table *map; @@ -181,7 +199,7 @@ static void local_exit(void) DMINFO("cleaned up"); } -int (*_inits[])(void) __initdata = { +static int (*_inits[])(void) __initdata = { local_init, dm_target_init, dm_linear_init, @@ -189,7 +207,7 @@ int (*_inits[])(void) __initdata = { dm_interface_init, }; -void (*_exits[])(void) = { +static void (*_exits[])(void) = { local_exit, dm_target_exit, dm_linear_exit, @@ -982,7 +1000,7 @@ static struct mapped_device *alloc_dev(int minor) } if (!try_module_get(THIS_MODULE)) - goto bad0; + goto bad_module_get; /* get a minor number for the dev */ if (minor == DM_ANY_MINOR) @@ -990,11 +1008,11 @@ static struct mapped_device *alloc_dev(int minor) else r = specific_minor(md, minor); if (r < 0) - goto bad1; + goto bad_minor; memset(md, 0, sizeof(*md)); init_rwsem(&md->io_lock); - init_MUTEX(&md->suspend_lock); + mutex_init(&md->suspend_lock); spin_lock_init(&md->pushback_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); @@ -1006,7 +1024,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad1_free_minor; + goto bad_queue; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; @@ -1017,11 +1035,11 @@ static struct mapped_device *alloc_dev(int minor) md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); if (!md->io_pool) - goto bad2; + goto bad_io_pool; md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); if (!md->tio_pool) - goto bad3; + goto bad_tio_pool; md->bs = bioset_create(16, 16); if (!md->bs) @@ -1029,7 +1047,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk = alloc_disk(1); if (!md->disk) - goto bad4; + goto bad_disk; atomic_set(&md->pending, 0); init_waitqueue_head(&md->wait); @@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); + md->wq = create_singlethread_workqueue("kdmflush"); + if (!md->wq) + goto bad_thread; + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1053,19 +1075,21 @@ static struct mapped_device *alloc_dev(int minor) return md; - bad4: +bad_thread: + put_disk(md->disk); +bad_disk: bioset_free(md->bs); - bad_no_bioset: +bad_no_bioset: mempool_destroy(md->tio_pool); - bad3: +bad_tio_pool: mempool_destroy(md->io_pool); - bad2: +bad_io_pool: blk_cleanup_queue(md->queue); - bad1_free_minor: +bad_queue: free_minor(minor); - bad1: +bad_minor: module_put(THIS_MODULE); - bad0: +bad_module_get: kfree(md); return NULL; } @@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); bdput(md->suspended_bdev); } + destroy_workqueue(md->wq); mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); bioset_free(md->bs); @@ -1259,20 +1284,91 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); +static int dm_wait_for_completion(struct mapped_device *md) +{ + int r = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + smp_mb(); + if (!atomic_read(&md->pending)) + break; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + return r; +} + /* * Process the deferred bios */ -static void __flush_deferred_io(struct mapped_device *md, struct bio *c) +static void __flush_deferred_io(struct mapped_device *md) { - struct bio *n; + struct bio *c; - while (c) { - n = c->bi_next; - c->bi_next = NULL; + while ((c = bio_list_pop(&md->deferred))) { if (__split_bio(md, c)) bio_io_error(c); - c = n; } + + clear_bit(DMF_BLOCK_IO, &md->flags); +} + +static void __merge_pushback_list(struct mapped_device *md) +{ + unsigned long flags; + + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); +} + +static void dm_wq_work(struct work_struct *work) +{ + struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); + struct mapped_device *md = req->md; + + down_write(&md->io_lock); + switch (req->type) { + case DM_WQ_FLUSH_ALL: + __merge_pushback_list(md); + /* pass through */ + case DM_WQ_FLUSH_DEFERRED: + __flush_deferred_io(md); + break; + default: + DMERR("dm_wq_work: unrecognised work type %d", req->type); + BUG(); + } + up_write(&md->io_lock); +} + +static void dm_wq_queue(struct mapped_device *md, int type, void *context, + struct dm_wq_req *req) +{ + req->type = type; + req->md = md; + req->context = context; + INIT_WORK(&req->work, dm_wq_work); + queue_work(md->wq, &req->work); +} + +static void dm_queue_flush(struct mapped_device *md, int type, void *context) +{ + struct dm_wq_req req; + + dm_wq_queue(md, type, context, &req); + flush_workqueue(md->wq); } /* @@ -1282,7 +1378,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { int r = -EINVAL; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); /* device must be suspended */ if (!dm_suspended(md)) @@ -1297,7 +1393,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) r = __bind(md, table); out: - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } @@ -1346,17 +1442,17 @@ static void unlock_fs(struct mapped_device *md) int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; - unsigned long flags; DECLARE_WAITQUEUE(wait, current); - struct bio *def; - int r = -EINVAL; + int r = 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); - if (dm_suspended(md)) + if (dm_suspended(md)) { + r = -EINVAL; goto out_unlock; + } map = dm_get_table(md); @@ -1378,16 +1474,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) r = -ENOMEM; goto flush_and_out; } - } - /* - * Flush I/O to the device. - * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. - */ - if (do_lockfs && !noflush) { - r = lock_fs(md); - if (r) - goto out; + /* + * Flush I/O to the device. noflush supersedes do_lockfs, + * because lock_fs() needs to flush I/Os. + */ + if (do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } } /* @@ -1404,66 +1500,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_table_unplug_all(map); /* - * Then we wait for the already mapped ios to - * complete. + * Wait for the already-mapped ios to complete. */ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (!atomic_read(&md->pending) || signal_pending(current)) - break; - - io_schedule(); - } - set_current_state(TASK_RUNNING); + r = dm_wait_for_completion(md); down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); - if (noflush) { - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); - } + if (noflush) + __merge_pushback_list(md); + up_write(&md->io_lock); /* were we interrupted ? */ - r = -EINTR; - if (atomic_read(&md->pending)) { - clear_bit(DMF_BLOCK_IO, &md->flags); - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); + if (r < 0) { + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ } - up_write(&md->io_lock); dm_table_postsuspend_targets(map); set_bit(DMF_SUSPENDED, &md->flags); - r = 0; - flush_and_out: - if (r && noflush) { + if (r && noflush) /* * Because there may be already I/Os in the pushback list, * flush them before return. */ - down_write(&md->io_lock); - - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); - } + dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL); out: if (r && md->suspended_bdev) { @@ -1474,17 +1540,16 @@ out: dm_table_put(map); out_unlock: - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } int dm_resume(struct mapped_device *md) { int r = -EINVAL; - struct bio *def; struct dm_table *map = NULL; - down(&md->suspend_lock); + mutex_lock(&md->suspend_lock); if (!dm_suspended(md)) goto out; @@ -1496,12 +1561,7 @@ int dm_resume(struct mapped_device *md) if (r) goto out; - down_write(&md->io_lock); - clear_bit(DMF_BLOCK_IO, &md->flags); - - def = bio_list_get(&md->deferred); - __flush_deferred_io(md, def); - up_write(&md->io_lock); + dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); unlock_fs(md); @@ -1520,7 +1580,7 @@ int dm_resume(struct mapped_device *md) out: dm_table_put(map); - up(&md->suspend_lock); + mutex_unlock(&md->suspend_lock); return r; } diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cf2ddce34118..d107ddceefcd 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -294,7 +294,7 @@ static int run(mddev_t *mddev) } conf->nfaults = 0; - ITERATE_RDEV(mddev, rdev, tmp) + rdev_for_each(rdev, tmp, mddev) conf->rdev = rdev; mddev->array_size = mddev->size; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3dac1cfb8189..0b8511776b3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -122,7 +122,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) cnt = 0; conf->array_size = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; diff --git a/drivers/md/md.c b/drivers/md/md.c index c28a120b4161..7da6ec244e15 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -195,7 +195,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. */ -#define ITERATE_MDDEV(mddev,tmp) \ +#define for_each_mddev(mddev,tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ tmp = all_mddevs.next; \ @@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit) spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); new->reshape_position = MaxSector; + new->resync_max = MaxSector; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -310,7 +311,7 @@ static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) mdk_rdev_t * rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->desc_nr == nr) return rdev; } @@ -322,7 +323,7 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->bdev->bd_dev == dev) return rdev; } @@ -773,12 +774,16 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = md_event(sb); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; - mddev->persistent = ! sb->not_persistent; + mddev->external = 0; mddev->chunk_size = sb->chunk_size; mddev->ctime = sb->ctime; mddev->utime = sb->utime; @@ -904,7 +909,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->size = mddev->size; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; - sb->not_persistent = !mddev->persistent; + sb->not_persistent = 0; sb->utime = mddev->utime; sb->state = 0; sb->events_hi = (mddev->events>>32); @@ -938,7 +943,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { mdp_disk_t *d; int desc_nr; if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) @@ -1153,11 +1158,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; - rdev->flags = 0; + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + clear_bit(BarriersNotsupp, &rdev->flags); + if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; - mddev->persistent = 1; + mddev->external = 0; mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); @@ -1286,7 +1295,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } max_dev = 0; - ITERATE_RDEV(mddev,rdev2,tmp) + rdev_for_each(rdev2, tmp, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1295,7 +1304,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - ITERATE_RDEV(mddev,rdev2,tmp) { + rdev_for_each(rdev2, tmp, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1333,8 +1342,8 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - ITERATE_RDEV(mddev1,rdev,tmp) - ITERATE_RDEV(mddev2, rdev2, tmp2) + rdev_for_each(rdev, tmp, mddev1) + rdev_for_each(rdev2, tmp2, mddev2) if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) return 1; @@ -1401,7 +1410,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; } list_add(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); + bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); return 0; fail: @@ -1410,10 +1419,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) return err; } -static void delayed_delete(struct work_struct *ws) +static void md_delayed_delete(struct work_struct *ws) { mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); kobject_del(&rdev->kobj); + kobject_put(&rdev->kobj); } static void unbind_rdev_from_array(mdk_rdev_t * rdev) @@ -1432,7 +1442,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state" */ - INIT_WORK(&rdev->del_work, delayed_delete); + INIT_WORK(&rdev->del_work, md_delayed_delete); + kobject_get(&rdev->kobj); schedule_work(&rdev->del_work); } @@ -1441,7 +1452,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) +static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; @@ -1453,13 +1464,15 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, rdev); + err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); blkdev_put(bdev); return err; } + if (!shared) + set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } @@ -1503,7 +1516,7 @@ static void export_array(mddev_t *mddev) struct list_head *tmp; mdk_rdev_t *rdev; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -1581,17 +1594,17 @@ static void md_print_devices(void) printk("md: **********************************\n"); printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: **********************************\n"); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { if (mddev->bitmap) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - ITERATE_RDEV(mddev,rdev,tmp2) + rdev_for_each(rdev, tmp2, mddev) print_rdev(rdev); } printk("md: **********************************\n"); @@ -1610,7 +1623,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) mdk_rdev_t *rdev; struct list_head *tmp; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -1696,18 +1709,20 @@ repeat: MD_BUG(); mddev->events --; } - sync_sbs(mddev, nospares); /* * do not write anything to disk if using * nonpersistent superblocks */ if (!mddev->persistent) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + if (!mddev->external) + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); dprintk(KERN_INFO @@ -1715,7 +1730,7 @@ repeat: mdname(mddev),mddev->in_sync); bitmap_update_sb(mddev->bitmap); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); if (rdev->sb_loaded != 1) @@ -1785,7 +1800,7 @@ static ssize_t state_show(mdk_rdev_t *rdev, char *page) { char *sep = ""; - int len=0; + size_t len = 0; if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); @@ -1887,20 +1902,45 @@ static ssize_t slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; + int err; + char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; else if (e==buf || (*e && *e!= '\n')) return -EINVAL; - if (rdev->mddev->pers) - /* Cannot set slot in active array (yet) */ - return -EBUSY; - if (slot >= rdev->mddev->raid_disks) - return -ENOSPC; - rdev->raid_disk = slot; - /* assume it is working */ - rdev->flags = 0; - set_bit(In_sync, &rdev->flags); + if (rdev->mddev->pers) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. + * For now we only support removing + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ + if (slot != -1) + return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ + if (rdev->mddev->pers->hot_add_disk == NULL) + return -EINVAL; + err = rdev->mddev->pers-> + hot_remove_disk(rdev->mddev, rdev->raid_disk); + if (err) + return err; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; + rdev->raid_disk = slot; + /* assume it is working */ + clear_bit(Faulty, &rdev->flags); + clear_bit(WriteMostly, &rdev->flags); + set_bit(In_sync, &rdev->flags); + } return len; } @@ -1923,6 +1963,10 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; + if (rdev->size && rdev->mddev->external) + /* Must set offset before size, so overlap checks + * can be sane */ + return -EBUSY; rdev->data_offset = offset; return len; } @@ -1936,16 +1980,69 @@ rdev_size_show(mdk_rdev_t *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->size); } +static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +{ + /* check if two start/length pairs overlap */ + if (s1+l1 <= s2) + return 0; + if (s2+l2 <= s1) + return 0; + return 1; +} + static ssize_t rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) { char *e; unsigned long long size = simple_strtoull(buf, &e, 10); + unsigned long long oldsize = rdev->size; if (e==buf || (*e && *e != '\n')) return -EINVAL; if (rdev->mddev->pers) return -EBUSY; rdev->size = size; + if (size > oldsize && rdev->mddev->external) { + /* need to check that all other rdevs with the same ->bdev + * do not overlap. We need to unlock the mddev to avoid + * a deadlock. We have already changed rdev->size, and if + * we have to change it back, we will have the lock again. + */ + mddev_t *mddev; + int overlap = 0; + struct list_head *tmp, *tmp2; + + mddev_unlock(rdev->mddev); + for_each_mddev(mddev, tmp) { + mdk_rdev_t *rdev2; + + mddev_lock(mddev); + rdev_for_each(rdev2, tmp2, mddev) + if (test_bit(AllReserved, &rdev2->flags) || + (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->size, + rdev2->data_offset, rdev2->size))) { + overlap = 1; + break; + } + mddev_unlock(mddev); + if (overlap) { + mddev_put(mddev); + break; + } + } + mddev_lock(rdev->mddev); + if (overlap) { + /* Someone else could have slipped in a size + * change here, but doing so is just silly. + * We put oldsize back because we *know* it is + * safe, and trust userspace not to race with + * itself + */ + rdev->size = oldsize; + return -EBUSY; + } + } if (size < rdev->mddev->size || rdev->mddev->size == 0) rdev->mddev->size = size; return len; @@ -1980,12 +2077,18 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + int rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - return entry->store(rdev, page, length); + rv = mddev_lock(rdev->mddev); + if (!rv) { + rv = entry->store(rdev, page, length); + mddev_unlock(rdev->mddev); + } + return rv; } static void rdev_free(struct kobject *ko) @@ -2029,7 +2132,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi if ((err = alloc_disk_sb(rdev))) goto abort_free; - err = lock_rdev(rdev, newdev); + err = lock_rdev(rdev, newdev, super_format == -2); if (err) goto abort_free; @@ -2099,7 +2202,7 @@ static void analyze_sbs(mddev_t * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2120,7 +2223,7 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (rdev != freshest) if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { @@ -2215,7 +2318,7 @@ level_show(mddev_t *mddev, char *page) static ssize_t level_store(mddev_t *mddev, const char *buf, size_t len) { - int rv = len; + ssize_t rv = len; if (mddev->pers) return -EBUSY; if (len == 0) @@ -2425,6 +2528,8 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; + else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + st = write_pending; else if (mddev->safemode) st = active_idle; else @@ -2455,11 +2560,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->active) > 1) - return -EBUSY; - err = do_md_stop(mddev, 0); - } + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); break; case inactive: /* stopping an active array */ @@ -2467,7 +2570,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (atomic_read(&mddev->active) > 1) return -EBUSY; err = do_md_stop(mddev, 2); - } + } else + err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ @@ -2495,9 +2599,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) restart_array(mddev); spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { - mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - } + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, + &mddev->flags); + } + err = 0; + } else + err = -EBUSY; spin_unlock_irq(&mddev->write_lock); } else { mddev->ro = 0; @@ -2508,7 +2618,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->external) + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2574,7 +2685,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) if (err < 0) goto out; } - } else + } else if (mddev->external) + rdev = md_import_device(dev, -2, -1); + else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) @@ -2659,7 +2772,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); /* Metdata version. - * This is either 'none' for arrays with externally managed metadata, + * This is one of + * 'none' for arrays with no metadata (good luck...) + * 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t @@ -2668,6 +2783,8 @@ metadata_show(mddev_t *mddev, char *page) if (mddev->persistent) return sprintf(page, "%d.%d\n", mddev->major_version, mddev->minor_version); + else if (mddev->external) + return sprintf(page, "external:%s\n", mddev->metadata_type); else return sprintf(page, "none\n"); } @@ -2682,6 +2799,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) if (cmd_match(buf, "none")) { mddev->persistent = 0; + mddev->external = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + if (strncmp(buf, "external:", 9) == 0) { + size_t namelen = len-9; + if (namelen >= sizeof(mddev->metadata_type)) + namelen = sizeof(mddev->metadata_type)-1; + strncpy(mddev->metadata_type, buf+9, namelen); + mddev->metadata_type[namelen] = 0; + if (namelen && mddev->metadata_type[namelen-1] == '\n') + mddev->metadata_type[--namelen] = 0; + mddev->persistent = 0; + mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; return len; @@ -2698,6 +2830,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; + mddev->external = 0; return len; } @@ -2865,6 +2998,43 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t +max_sync_show(mddev_t *mddev, char *page) +{ + if (mddev->resync_max == MaxSector) + return sprintf(page, "max\n"); + else + return sprintf(page, "%llu\n", + (unsigned long long)mddev->resync_max); +} +static ssize_t +max_sync_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (strncmp(buf, "max", 3) == 0) + mddev->resync_max = MaxSector; + else { + char *ep; + unsigned long long max = simple_strtoull(buf, &ep, 10); + if (ep == buf || (*ep != 0 && *ep != '\n')) + return -EINVAL; + if (max < mddev->resync_max && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + /* Must be a multiple of chunk_size */ + if (mddev->chunk_size) { + if (max & (sector_t)((mddev->chunk_size>>9)-1)) + return -EINVAL; + } + mddev->resync_max = max; + } + wake_up(&mddev->recovery_wait); + return len; +} + +static struct md_sysfs_entry md_max_sync = +__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); + +static ssize_t suspend_lo_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); @@ -2974,6 +3144,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_max_sync.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, &md_bitmap.attr, @@ -3118,8 +3289,11 @@ static int do_md_run(mddev_t * mddev) /* * Analyze all RAID superblock(s) */ - if (!mddev->raid_disks) + if (!mddev->raid_disks) { + if (!mddev->persistent) + return -EINVAL; analyze_sbs(mddev); + } chunk_size = mddev->chunk_size; @@ -3143,7 +3317,7 @@ static int do_md_run(mddev_t * mddev) } /* devices must have minimum size of one chunk */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { @@ -3170,7 +3344,7 @@ static int do_md_run(mddev_t * mddev) * the only valid external interface is through the md * device. */ - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3236,8 +3410,8 @@ static int do_md_run(mddev_t * mddev) mdk_rdev_t *rdev2; struct list_head *tmp2; int warned = 0; - ITERATE_RDEV(mddev, rdev, tmp) { - ITERATE_RDEV(mddev, rdev2, tmp2) { + rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev2, tmp2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3297,7 +3471,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3330,7 +3504,7 @@ static int do_md_run(mddev_t * mddev) if (mddev->degraded && !mddev->sync_thread) { struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -3507,14 +3681,14 @@ static int do_md_stop(mddev_t * mddev, int mode) } mddev->bitmap_offset = 0; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) if (rdev->raid_disk >= 0) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } - /* make sure all delayed_delete calls have finished */ + /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); export_array(mddev); @@ -3523,7 +3697,10 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -3546,7 +3723,7 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -3589,7 +3766,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - ITERATE_RDEV_PENDING(rdev,tmp) + rdev_for_each_list(rdev, tmp, pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -3632,7 +3809,8 @@ static void autorun_devices(int part) mddev_unlock(mddev); } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { + mddev->persistent = 1; + rdev_for_each_list(rdev, tmp, candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -3643,7 +3821,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - ITERATE_RDEV_GENERIC(candidates,rdev,tmp) + rdev_for_each_list(rdev, tmp, candidates) export_rdev(rdev); mddev_put(mddev); } @@ -3673,7 +3851,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) struct list_head *tmp; nr=working=active=failed=spare=0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -3919,8 +4097,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->raid_disk = -1; - rdev->flags = 0; - if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC)) set_bit(In_sync, &rdev->flags); @@ -4165,13 +4341,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) else mddev->recovery_cp = 0; mddev->persistent = ! info->not_persistent; + mddev->external = 0; mddev->layout = info->layout; mddev->chunk_size = info->chunk_size; mddev->max_disks = MD_SB_DISKS; - mddev->flags = 0; + if (mddev->persistent) + mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -4213,7 +4391,7 @@ static int update_size(mddev_t *mddev, unsigned long size) */ if (mddev->sync_thread) return -EBUSY; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { sector_t avail; avail = rdev->size * 2; @@ -4471,9 +4649,10 @@ static int md_ioctl(struct inode *inode, struct file *file, */ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ - if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY - && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE - && cmd != GET_BITMAP_FILE) { + if ((!mddev->raid_disks && !mddev->external) + && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE + && cmd != GET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } @@ -4757,7 +4936,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "unused devices: "); - ITERATE_RDEV_PENDING(rdev,tmp) { + rdev_for_each_list(rdev, tmp, pending_raid_disks) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -4953,7 +5132,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } size = 0; - ITERATE_RDEV(mddev,rdev,tmp2) { + rdev_for_each(rdev, tmp2, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -4982,7 +5161,10 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->major_version, mddev->minor_version); } - } else + } else if (mddev->external) + seq_printf(seq, " super external:%s", + mddev->metadata_type); + else seq_printf(seq, " super non-persistent"); if (mddev->pers) { @@ -5015,8 +5197,7 @@ static int md_seq_show(struct seq_file *seq, void *v) chunk_kb ? "KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); - seq_path(seq, bitmap->file->f_path.mnt, - bitmap->file->f_path.dentry," \t\n"); + seq_path(seq, &bitmap->file->f_path, " \t\n"); } seq_printf(seq, "\n"); @@ -5106,7 +5287,7 @@ static int is_mddev_idle(mddev_t *mddev) long curr_events; idle = 1; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - @@ -5283,7 +5464,7 @@ void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip; } - ITERATE_MDDEV(mddev2,tmp) { + for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; if (mddev2->curr_resync && @@ -5333,7 +5514,7 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; j = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5381,8 +5562,16 @@ void md_do_sync(mddev_t *mddev) sector_t sectors; skipped = 0; + if (j >= mddev->resync_max) { + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + wait_event(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } + if (kthread_should_stop()) + goto interrupted; sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_ERR, &mddev->recovery); goto out; @@ -5424,15 +5613,9 @@ void md_do_sync(mddev_t *mddev) } - if (kthread_should_stop()) { - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } + if (kthread_should_stop()) + goto interrupted; + /* * this loop exits only if either when we are slower than @@ -5484,7 +5667,7 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -5496,9 +5679,22 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; + mddev->resync_max = MaxSector; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); + return; + + interrupted: + /* + * got a signal, exit. + */ + printk(KERN_INFO + "md: md_do_sync() got signal ... exiting\n"); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto out; + } EXPORT_SYMBOL_GPL(md_do_sync); @@ -5509,8 +5705,9 @@ static int remove_and_add_spares(mddev_t *mddev) struct list_head *rtmp; int spares = 0; - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk >= 0 && + !mddev->external && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { @@ -5524,7 +5721,7 @@ static int remove_and_add_spares(mddev_t *mddev) } if (mddev->degraded) { - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { rdev->recovery_offset = 0; @@ -5589,7 +5786,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->flags || + (mddev->flags && !mddev->external) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5605,7 +5802,8 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + if (mddev->persistent) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; @@ -5637,7 +5835,7 @@ void md_check_recovery(mddev_t *mddev) * information must be scrapped */ if (!mddev->degraded) - ITERATE_RDEV(mddev,rdev,rtmp) + rdev_for_each(rdev, rtmp, mddev) rdev->saved_raid_disk = -1; mddev->recovery = 0; @@ -5714,7 +5912,7 @@ static int md_notify_reboot(struct notifier_block *this, printk(KERN_INFO "md: stopping all md devices.\n"); - ITERATE_MDDEV(mddev,tmp) + for_each_mddev(mddev, tmp) if (mddev_trylock(mddev)) { do_md_stop (mddev, 1); mddev_unlock(mddev); @@ -5848,7 +6046,7 @@ static __exit void md_exit(void) unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); remove_proc_entry("mdstat", NULL); - ITERATE_MDDEV(mddev,tmp) { + for_each_mddev(mddev, tmp) { struct gendisk *disk = mddev->gendisk; if (!disk) continue; diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index adef299908cf..b61d5767aae7 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -1,13 +1,10 @@ -#ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $" -/* ----------------------------------------------------------------------- * +/* -*- linux-c -*- ------------------------------------------------------- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -26,100 +23,98 @@ static uint8_t gfmul(uint8_t a, uint8_t b) { - uint8_t v = 0; - - while ( b ) { - if ( b & 1 ) v ^= a; - a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); - b >>= 1; - } - return v; + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; } static uint8_t gfpow(uint8_t a, int b) { - uint8_t v = 1; - - b %= 255; - if ( b < 0 ) - b += 255; - - while ( b ) { - if ( b & 1 ) v = gfmul(v,a); - a = gfmul(a,a); - b >>= 1; - } - return v; + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; } int main(int argc, char *argv[]) { - int i, j, k; - uint8_t v; - uint8_t exptbl[256], invtbl[256]; - - printf("#include \"raid6.h\"\n"); - - /* Compute multiplication table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfmul[256][256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i++ ) { - printf("\t{\n"); - for ( j = 0 ; j < 256 ; j += 8 ) { - printf("\t\t"); - for ( k = 0 ; k < 8 ; k++ ) { - printf("0x%02x, ", gfmul(i,j+k)); - } - printf("\n"); - } - printf("\t},\n"); - } - printf("};\n"); - - /* Compute power-of-2 table (exponent) */ - v = 1; - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexp[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - exptbl[i+j] = v; - printf("0x%02x, ", v); - v = gfmul(v,2); - if ( v == 1 ) v = 0; /* For entry 255, not a real entry */ - } - printf("\n"); - } - printf("};\n"); - - /* Compute inverse table x^-1 == x^254 */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfinv[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - invtbl[i+j] = v = gfpow(i+j,254); - printf("0x%02x, ", v); - } - printf("\n"); - } - printf("};\n"); - - /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexi[256] =\n" - "{\n"); - for ( i = 0 ; i < 256 ; i += 8 ) { - printf("\t"); - for ( j = 0 ; j < 8 ; j++ ) { - printf("0x%02x, ", invtbl[exptbl[i+j]^1]); - } - printf("\n"); - } - printf("};\n\n"); - - return 0; + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include \"raid6.h\"\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + + return 0; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index eb631ebed686..3f299d835a2b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -436,7 +436,7 @@ static int multipath_run (mddev_t *mddev) } conf->working_disks = 0; - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8e591708d1f..818b48284096 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -72,11 +72,11 @@ static int create_strip_zones (mddev_t *mddev) */ conf->nr_strip_zones = 0; - ITERATE_RDEV(mddev,rdev1,tmp1) { + rdev_for_each(rdev1, tmp1, mddev) { printk("raid0: looking at %s\n", bdevname(rdev1->bdev,b)); c = 0; - ITERATE_RDEV(mddev,rdev2,tmp2) { + rdev_for_each(rdev2, tmp2, mddev) { printk("raid0: comparing %s(%llu)", bdevname(rdev1->bdev,b), (unsigned long long)rdev1->size); @@ -124,7 +124,7 @@ static int create_strip_zones (mddev_t *mddev) cnt = 0; smallest = NULL; zone->dev = conf->devlist; - ITERATE_RDEV(mddev, rdev1, tmp1) { + rdev_for_each(rdev1, tmp1, mddev) { int j = rdev1->raid_disk; if (j < 0 || j >= mddev->raid_disks) { @@ -293,7 +293,7 @@ static int raid0_run (mddev_t *mddev) /* calculate array device size */ mddev->array_size = 0; - ITERATE_RDEV(mddev,rdev,tmp) + rdev_for_each(rdev, tmp, mddev) mddev->array_size += rdev->size; printk("raid0 : md_size is %llu blocks.\n", diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4a69c416e045..5c7fef091cec 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1684,6 +1684,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); raise_barrier(conf); conf->next_resync = sector_nr; @@ -1766,6 +1767,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ nr_sectors = 0; sync_blocks = 0; do { @@ -1884,7 +1887,7 @@ static int run(mddev_t *mddev) if (!conf->r1bio_pool) goto out_no_mem; - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5cdcc9386200..017f58113c33 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1657,6 +1657,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return (max_sector - sector_nr) + sectors_skipped; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + /* make sure whole request will fit in a chunk - if chunks * are meaningful */ @@ -1670,6 +1673,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -2021,7 +2026,7 @@ static int run(mddev_t *mddev) goto out_free_conf; } - ITERATE_RDEV(mddev, rdev, tmp) { + rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e8c8157b02fc..2d6f1a51359c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3159,7 +3159,8 @@ static void raid5_activate_delayed(raid5_conf_t *conf) atomic_inc(&conf->preread_active_stripes); list_add_tail(&sh->lru, &conf->handle_list); } - } + } else + blk_plug_device(conf->mddev->queue); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3549,7 +3550,8 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } finish_wait(&conf->wait_for_overlap, &w); - handle_stripe(sh, NULL); + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -3698,6 +3700,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += conf->chunk_size>>9; + if (sector_nr >= mddev->resync_max) { + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes) == 0); + mddev->reshape_position = conf->expand_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) + || kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } return conf->chunk_size>>9; } @@ -3734,6 +3755,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); + /* No need to check resync_max as we never do more than one + * stripe, and as resync_max will always be on a chunk boundary, + * if the check in md_do_sync didn't fire, there is no chance + * of overstepping resync_max here + */ + /* if there is too many failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -3753,6 +3780,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { @@ -3864,7 +3894,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d (mddev_t *mddev) +static void raid5d(mddev_t *mddev) { struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); @@ -3889,12 +3919,6 @@ static void raid5d (mddev_t *mddev) activate_bit_delay(conf); } - if (list_empty(&conf->handle_list) && - atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && - !blk_queue_plugged(mddev->queue) && - !list_empty(&conf->delayed_list)) - raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { int ok; spin_unlock_irq(&conf->device_lock); @@ -4108,7 +4132,7 @@ static int run(mddev_t *mddev) pr_debug("raid5: run(%s) called.\n", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= conf->raid_disks || raid_disk < 0) @@ -4521,7 +4545,7 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; @@ -4543,7 +4567,7 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. */ - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev)) { diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 0d5cd57accd7..559cc41b2585 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -1,12 +1,10 @@ /* -*- linux-c -*- ------------------------------------------------------- * * - * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. * * ----------------------------------------------------------------------- */ @@ -30,67 +28,87 @@ char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE]; char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; -void makedata(void) +static void makedata(void) { int i, j; - for ( i = 0 ; i < NDISKS ; i++ ) { - for ( j = 0 ; j < PAGE_SIZE ; j++ ) { + for (i = 0; i < NDISKS; i++) { + for (j = 0; j < PAGE_SIZE; j++) data[i][j] = rand(); - } + dataptrs[i] = data[i]; } } +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i < NDISKS-2 && j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra && !errb) ? "OK" : + !erra ? "ERRB" : + !errb ? "ERRA" : "ERRAB"); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + int main(int argc, char *argv[]) { - const struct raid6_calls * const * algo; + const struct raid6_calls *const *algo; int i, j; - int erra, errb; + int err = 0; makedata(); - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)->valid || (*algo)->valid() ) { + for (algo = raid6_algos; *algo; algo++) { + if (!(*algo)->valid || (*algo)->valid()) { raid6_call = **algo; /* Nuke syndromes */ memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs); - - for ( i = 0 ; i < NDISKS-1 ; i++ ) { - for ( j = i+1 ; j < NDISKS ; j++ ) { - memset(recovi, 0xf0, PAGE_SIZE); - memset(recovj, 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); - - erra = memcmp(data[i], recovi, PAGE_SIZE); - errb = memcmp(data[j], recovj, PAGE_SIZE); - - if ( i < NDISKS-2 && j == NDISKS-1 ) { - /* We don't implement the DQ failure scenario, since it's - equivalent to a RAID-5 failure (XOR, then recompute Q) */ - } else { - printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", - raid6_call.name, - i, (i==NDISKS-2)?'P':'D', - j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D', - (!erra && !errb) ? "OK" : - !erra ? "ERRB" : - !errb ? "ERRA" : - "ERRAB"); - } - - dataptrs[i] = data[i]; - dataptrs[j] = data[j]; - } - } + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); } printf("\n"); } @@ -99,5 +117,8 @@ int main(int argc, char *argv[]) /* Pick the best algorithm test */ raid6_select_algo(); - return 0; + if (err) + printf("\n*** ERRORS FOUND ***\n"); + + return err; } |