Diffstat (limited to 'drivers/md')
32 files changed, 2119 insertions, 1206 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf869ed03eed..4540ade6b6b5 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -2,6 +2,8 @@ # Block device driver configuration # +if BLOCK + menu "Multi-device support (RAID and LVM)" config MD @@ -136,16 +138,16 @@ config MD_RAID456 If unsure, say Y. config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array (experimental)" - depends on MD_RAID456 && EXPERIMENTAL + bool "Support adding drives to a raid-5 array" + depends on MD_RAID456 + default y ---help--- A RAID-5 set can be expanded by adding extra drives. This requires "restriping" the array which means (almost) every block must be written to a different place. This option allows such restriping to be done while the array - is online. However it is still EXPERIMENTAL code. It should - work, but please be sure that you have backups. + is online. You will need mdadm version 2.4.1 or later to use this feature safely. During the early stage of reshape there is @@ -162,6 +164,8 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. + If unsure, say Y. + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD @@ -199,10 +203,19 @@ config BLK_DEV_DM If unsure, say N. +config DM_DEBUG + boolean "Device mapper debugging support" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Enable this for messages that may help debug device-mapper problems. + + If unsure, say N. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM && EXPERIMENTAL select CRYPTO + select CRYPTO_CBC ---help--- This device-mapper target allows you to create a device that transparently encrypts the data on it. You'll need to activate @@ -251,3 +264,4 @@ config DM_MULTIPATH_EMC endmenu +endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ecc56765d949..5432d07c074d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -212,8 +212,8 @@ char *file_path(struct file *file, char *buf, int count) if (!buf) return NULL; - d = file->f_dentry; - v = file->f_vfsmnt; + d = file->f_path.dentry; + v = file->f_path.mnt; buf = d_path(d, v, buf, count); @@ -349,7 +349,7 @@ static struct page *read_page(struct file *file, unsigned long index, unsigned long count) { struct page *page = NULL; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct buffer_head *bh; sector_t block; @@ -536,7 +536,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " "-- forcing full recovery\n", bmname(bitmap), events, (unsigned long long) bitmap->mddev->events); - sb->state |= BITMAP_STALE; + sb->state |= cpu_to_le32(BITMAP_STALE); } success: /* assign fields using values from superblock */ @@ -544,11 +544,11 @@ success: bitmap->daemon_sleep = daemon_sleep; bitmap->daemon_lastrun = jiffies; bitmap->max_write_behind = write_behind; - bitmap->flags |= sb->state; + bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) bitmap->flags |= BITMAP_HOSTENDIAN; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (sb->state & BITMAP_STALE) + if (sb->state & cpu_to_le32(BITMAP_STALE)) bitmap->events_cleared = bitmap->mddev->events; err = 0; out: @@ -578,9 +578,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); switch (op) { - 
case MASK_SET: sb->state |= bits; + case MASK_SET: sb->state |= cpu_to_le32(bits); break; - case MASK_UNSET: sb->state &= ~bits; + case MASK_UNSET: sb->state &= cpu_to_le32(~bits); break; default: BUG(); } @@ -613,6 +613,7 @@ static inline unsigned long file_page_offset(unsigned long chunk) static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { + if (file_page_index(chunk) >= bitmap->file_pages) return NULL; return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; } @@ -661,7 +662,7 @@ static void bitmap_file_put(struct bitmap *bitmap) bitmap_file_unmap(bitmap); if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; invalidate_inode_pages(inode->i_mapping); fput(file); } @@ -739,6 +740,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) } page = filemap_get_page(bitmap, chunk); + if (!page) return; bit = file_page_offset(chunk); /* set the bit */ @@ -1322,6 +1324,18 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n } +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + bitmap_set_memory_bits(bitmap, sec, 1); + bitmap_file_set_bit(bitmap, sec); + } +} + /* * flush out any pending updates */ @@ -1399,7 +1413,7 @@ int bitmap_create(mddev_t *mddev) int err; sector_t start; - BUG_ON(sizeof(bitmap_super_t) != 256); + BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ return 0; @@ -1430,8 +1444,7 @@ int bitmap_create(mddev_t *mddev) if (err) goto error; - bitmap->chunkshift = find_first_bit(&bitmap->chunksize, - sizeof(bitmap->chunksize)); + bitmap->chunkshift = ffz(~bitmap->chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h index bbf4615f0e30..da4349649f7f 100644 --- a/drivers/md/dm-bio-list.h +++ b/drivers/md/dm-bio-list.h @@ -44,6 +44,20 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) bl->tail = bl2->tail; } +static inline void bio_list_merge_head(struct bio_list *bl, + struct bio_list *bl2) +{ + if (!bl2->head) + return; + + if (bl->head) + bl2->tail->bi_next = bl->head; + else + bl->tail = bl2->tail; + + bl->head = bl2->head; +} + static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bdbd34993a80..4c2471ee054a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,6 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> + * Copyright (C) 2006 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
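The bitmap.c hunks above stop OR-ing host-order flag constants straight into the on-disk superblock's state word and instead convert with cpu_to_le32()/le32_to_cpu() around every access. A minimal userspace sketch of the same pattern, using glibc's htole32()/le32toh() in place of the kernel macros; the struct layout and the flag value below are illustrative stand-ins, not the real bitmap superblock:

	#define _DEFAULT_SOURCE
	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative stand-in for an on-disk superblock: the field is stored
	 * little-endian regardless of host byte order. */
	struct disk_sb {
		uint32_t state;		/* little-endian on disk */
	};

	#define BITMAP_STALE 0x0002	/* illustrative flag value */

	int main(void)
	{
		struct disk_sb sb = { .state = htole32(0) };

		/* Wrong on big-endian hosts: OR-ing a host-order constant into a
		 * little-endian field sets the wrong bit:
		 *     sb.state |= BITMAP_STALE;
		 * Correct: convert the constant first, as the patch does with
		 * cpu_to_le32(). */
		sb.state |= htole32(BITMAP_STALE);

		/* Reading back: convert to host order before testing bits. */
		if (le32toh(sb.state) & BITMAP_STALE)
			printf("stale flag set, on-disk word = 0x%08x\n",
			       (unsigned)sb.state);

		return 0;
	}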
*/ @@ -15,24 +16,28 @@ #include <linux/slab.h> #include <linux/crypto.h> #include <linux/workqueue.h> +#include <linux/backing-dev.h> #include <asm/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> +#include <asm/unaligned.h> #include "dm.h" #define DM_MSG_PREFIX "crypt" +#define MESG_STR(x) x, sizeof(x) /* * per bio private data */ struct crypt_io { struct dm_target *target; - struct bio *bio; + struct bio *base_bio; struct bio *first_clone; struct work_struct work; atomic_t pending; int error; + int post_process; }; /* @@ -63,6 +68,7 @@ struct crypt_iv_operations { * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. */ +enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; struct crypt_config { struct dm_dev *dev; sector_t start; @@ -73,28 +79,33 @@ struct crypt_config { */ mempool_t *io_pool; mempool_t *page_pool; + struct bio_set *bs; /* * crypto related data */ struct crypt_iv_operations *iv_gen_ops; char *iv_mode; - struct crypto_cipher *iv_gen_private; + union { + struct crypto_cipher *essiv_tfm; + int benbi_shift; + } iv_gen_private; sector_t iv_offset; unsigned int iv_size; char cipher[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME]; struct crypto_blkcipher *tfm; + unsigned long flags; unsigned int key_size; u8 key[0]; }; -#define MIN_IOS 256 +#define MIN_IOS 16 #define MIN_POOL_PAGES 32 #define MIN_BIO_PAGES 8 -static kmem_cache_t *_crypt_io_pool; +static struct kmem_cache *_crypt_io_pool; /* * Different IV generation algorithms: @@ -106,6 +117,9 @@ static kmem_cache_t *_crypt_io_pool; * encrypted with the bulk cipher using a salt as key. The salt * should be derived from the bulk cipher's key via hashing. * + * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 + * (needed for LRW-32-AES and possible other narrow block modes) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -184,21 +198,61 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } kfree(salt); - cc->iv_gen_private = essiv_tfm; + cc->iv_gen_private.essiv_tfm = essiv_tfm; return 0; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { - crypto_free_cipher(cc->iv_gen_private); - cc->iv_gen_private = NULL; + crypto_free_cipher(cc->iv_gen_private.essiv_tfm); + cc->iv_gen_private.essiv_tfm = NULL; } static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private, iv, iv); + crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); + return 0; +} + +static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); + int log = ilog2(bs); + + /* we need to calculate how far we must shift the sector count + * to get the cipher block count, we use this shift in _gen */ + + if (1 << log != bs) { + ti->error = "cypher blocksize is not a power of 2"; + return -EINVAL; + } + + if (log > 9) { + ti->error = "cypher blocksize is > 512"; + return -EINVAL; + } + + cc->iv_gen_private.benbi_shift = 9 - log; + + return 0; +} + +static void crypt_iv_benbi_dtr(struct crypt_config *cc) +{ +} + +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +{ + __be64 val; + + memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ + + val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); + 
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); + return 0; } @@ -212,13 +266,18 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { .generator = crypt_iv_essiv_gen }; +static struct crypt_iv_operations crypt_iv_benbi_ops = { + .ctr = crypt_iv_benbi_ctr, + .dtr = crypt_iv_benbi_dtr, + .generator = crypt_iv_benbi_gen +}; static int crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, struct scatterlist *in, unsigned int length, int write, sector_t sector) { - u8 iv[cc->iv_size]; + u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64)))); struct blkcipher_desc desc = { .tfm = cc->tfm, .info = iv, @@ -306,6 +365,14 @@ static int crypt_convert(struct crypt_config *cc, return r; } + static void dm_crypt_bio_destructor(struct bio *bio) + { + struct crypt_io *io = bio->bi_private; + struct crypt_config *cc = io->target->private; + + bio_free(bio, cc->bs); + } + /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations @@ -315,34 +382,33 @@ static struct bio * crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, struct bio *base_bio, unsigned int *bio_vec_idx) { - struct bio *bio; + struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned int i; - /* - * Use __GFP_NOMEMALLOC to tell the VM to act less aggressively and - * to fail earlier. This is not necessary but increases throughput. - * FIXME: Is this really intelligent? - */ - if (base_bio) - bio = bio_clone(base_bio, GFP_NOIO|__GFP_NOMEMALLOC); - else - bio = bio_alloc(GFP_NOIO|__GFP_NOMEMALLOC, nr_iovecs); - if (!bio) + if (base_bio) { + clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs); + __bio_clone(clone, base_bio); + } else + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + + if (!clone) return NULL; + clone->bi_destructor = dm_crypt_bio_destructor; + /* if the last bio was not complete, continue where that one ended */ - bio->bi_idx = *bio_vec_idx; - bio->bi_vcnt = *bio_vec_idx; - bio->bi_size = 0; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + clone->bi_idx = *bio_vec_idx; + clone->bi_vcnt = *bio_vec_idx; + clone->bi_size = 0; + clone->bi_flags &= ~(1 << BIO_SEG_VALID); - /* bio->bi_idx pages have already been allocated */ - size -= bio->bi_idx * PAGE_SIZE; + /* clone->bi_idx pages have already been allocated */ + size -= clone->bi_idx * PAGE_SIZE; - for(i = bio->bi_idx; i < nr_iovecs; i++) { - struct bio_vec *bv = bio_iovec_idx(bio, i); + for (i = clone->bi_idx; i < nr_iovecs; i++) { + struct bio_vec *bv = bio_iovec_idx(clone, i); bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); if (!bv->bv_page) @@ -353,7 +419,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * return a partially allocated bio, the caller will then try * to allocate additional bios while submitting this partial bio */ - if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1)) + if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1)) gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; bv->bv_offset = 0; @@ -362,13 +428,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, else bv->bv_len = size; - bio->bi_size += bv->bv_len; - bio->bi_vcnt++; + clone->bi_size += bv->bv_len; + clone->bi_vcnt++; size -= bv->bv_len; } - if (!bio->bi_size) { - bio_put(bio); + if (!clone->bi_size) { + bio_put(clone); return NULL; } @@ -376,13 +442,13 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * Remember the last bio_vec allocated to be able 
* to correctly continue after the splitting. */ - *bio_vec_idx = bio->bi_vcnt; + *bio_vec_idx = clone->bi_vcnt; - return bio; + return clone; } static void crypt_free_buffer_pages(struct crypt_config *cc, - struct bio *bio, unsigned int bytes) + struct bio *clone, unsigned int bytes) { unsigned int i, start, end; struct bio_vec *bv; @@ -396,19 +462,19 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, * A fix to the bi_idx issue in the kernel is in the works, so * we will hopefully be able to revert to the cleaner solution soon. */ - i = bio->bi_vcnt - 1; - bv = bio_iovec_idx(bio, i); - end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size; + i = clone->bi_vcnt - 1; + bv = bio_iovec_idx(clone, i); + end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - clone->bi_size; start = end - bytes; start >>= PAGE_SHIFT; - if (!bio->bi_size) - end = bio->bi_vcnt; + if (!clone->bi_size) + end = clone->bi_vcnt; else end >>= PAGE_SHIFT; - for(i = start; i < end; i++) { - bv = bio_iovec_idx(bio, i); + for (i = start; i < end; i++) { + bv = bio_iovec_idx(clone, i); BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; @@ -432,7 +498,7 @@ static void dec_pending(struct crypt_io *io, int error) if (io->first_clone) bio_put(io->first_clone); - bio_endio(io->bio, io->bio->bi_size, io->error); + bio_endio(io->base_bio, io->base_bio->bi_size, io->error); mempool_free(io, cc->io_pool); } @@ -441,29 +507,179 @@ static void dec_pending(struct crypt_io *io, int error) * kcryptd: * * Needed because it would be very unwise to do decryption in an - * interrupt context, so bios returning from read requests get - * queued here. + * interrupt context. */ static struct workqueue_struct *_kcryptd_workqueue; +static void kcryptd_do_work(struct work_struct *work); -static void kcryptd_do_work(void *data) +static void kcryptd_queue_io(struct crypt_io *io) { - struct crypt_io *io = (struct crypt_io *) data; - struct crypt_config *cc = (struct crypt_config *) io->target->private; + INIT_WORK(&io->work, kcryptd_do_work); + queue_work(_kcryptd_workqueue, &io->work); +} + +static int crypt_endio(struct bio *clone, unsigned int done, int error) +{ + struct crypt_io *io = clone->bi_private; + struct crypt_config *cc = io->target->private; + unsigned read_io = bio_data_dir(clone) == READ; + + /* + * free the processed pages, even if + * it's only a partially completed write + */ + if (!read_io) + crypt_free_buffer_pages(cc, clone, done); + + /* keep going - not finished yet */ + if (unlikely(clone->bi_size)) + return 1; + + if (!read_io) + goto out; + + if (unlikely(!bio_flagged(clone, BIO_UPTODATE))) { + error = -EIO; + goto out; + } + + bio_put(clone); + io->post_process = 1; + kcryptd_queue_io(io); + return 0; + +out: + bio_put(clone); + dec_pending(io, error); + return error; +} + +static void clone_init(struct crypt_io *io, struct bio *clone) +{ + struct crypt_config *cc = io->target->private; + + clone->bi_private = io; + clone->bi_end_io = crypt_endio; + clone->bi_bdev = cc->dev->bdev; + clone->bi_rw = io->base_bio->bi_rw; +} + +static void process_read(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; + sector_t sector = base_bio->bi_sector - io->target->begin; + + atomic_inc(&io->pending); + + /* + * The block layer might modify the bvec array, so always + * copy the required bvecs because we need the original + * one in order to decrypt the whole bio data *afterwards*. 
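The new "benbi" IV generator added to dm-crypt above turns the 512-byte sector number into a big-endian, 1-based count of cipher blocks (benbi_shift = 9 - log2(cipher block size)) and stores it in the last eight bytes of the IV. A standalone sketch of that arithmetic; the 16-byte IV size and the helper name are assumptions for illustration, and the byte loop stands in for the kernel's put_unaligned() of a __be64:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Build a benbi-style IV: zero the IV, then place the 1-based big-endian
	 * cipher-block count derived from the sector number in the last 8 bytes. */
	static void benbi_iv(uint8_t *iv, size_t iv_size, uint64_t sector,
			     unsigned cipher_block_size)
	{
		unsigned log = 0, shift;
		uint64_t count;
		int i;

		/* log2(block size); the ctr above requires a power of 2 <= 512 */
		while ((1u << log) < cipher_block_size)
			log++;
		shift = 9 - log;		/* sectors are 512 bytes = 2^9 */

		count = (sector << shift) + 1;	/* block count starts at 1 */

		memset(iv, 0, iv_size);
		for (i = 0; i < 8; i++)		/* big-endian, byte by byte */
			iv[iv_size - 1 - i] = (uint8_t)(count >> (8 * i));
	}

	int main(void)
	{
		uint8_t iv[16];			/* e.g. AES block size */
		unsigned i;

		benbi_iv(iv, sizeof(iv), 5, 16);	/* sector 5, 16-byte blocks */
		for (i = 0; i < sizeof(iv); i++)
			printf("%02x", iv[i]);
		putchar('\n');
		return 0;
	}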
+ */ + clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + clone_init(io, clone); + clone->bi_destructor = dm_crypt_bio_destructor; + clone->bi_idx = 0; + clone->bi_vcnt = bio_segments(base_bio); + clone->bi_size = base_bio->bi_size; + clone->bi_sector = cc->start + sector; + memcpy(clone->bi_io_vec, bio_iovec(base_bio), + sizeof(struct bio_vec) * clone->bi_vcnt); + + generic_make_request(clone); +} + +static void process_write(struct crypt_io *io) +{ + struct crypt_config *cc = io->target->private; + struct bio *base_bio = io->base_bio; + struct bio *clone; struct convert_context ctx; - int r; + unsigned remaining = base_bio->bi_size; + sector_t sector = base_bio->bi_sector - io->target->begin; + unsigned bvec_idx = 0; - crypt_convert_init(cc, &ctx, io->bio, io->bio, - io->bio->bi_sector - io->target->begin, 0); - r = crypt_convert(cc, &ctx); + atomic_inc(&io->pending); + + crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); + + /* + * The allocated buffers can be smaller than the whole bio, + * so repeat the whole process until all the data can be handled. + */ + while (remaining) { + clone = crypt_alloc_buffer(cc, base_bio->bi_size, + io->first_clone, &bvec_idx); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; + } + + ctx.bio_out = clone; + + if (unlikely(crypt_convert(cc, &ctx) < 0)) { + crypt_free_buffer_pages(cc, clone, clone->bi_size); + bio_put(clone); + dec_pending(io, -EIO); + return; + } - dec_pending(io, r); + clone_init(io, clone); + clone->bi_sector = cc->start + sector; + + if (!io->first_clone) { + /* + * hold a reference to the first clone, because it + * holds the bio_vec array and that can't be freed + * before all other clones are released + */ + bio_get(clone); + io->first_clone = clone; + } + + remaining -= clone->bi_size; + sector += bio_sectors(clone); + + /* prevent bio_put of first_clone */ + if (remaining) + atomic_inc(&io->pending); + + generic_make_request(clone); + + /* out of memory -> run queues */ + if (remaining) + congestion_wait(bio_data_dir(clone), HZ/100); + } } -static void kcryptd_queue_io(struct crypt_io *io) +static void process_read_endio(struct crypt_io *io) { - INIT_WORK(&io->work, kcryptd_do_work, io); - queue_work(_kcryptd_workqueue, &io->work); + struct crypt_config *cc = io->target->private; + struct convert_context ctx; + + crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, + io->base_bio->bi_sector - io->target->begin, 0); + + dec_pending(io, crypt_convert(cc, &ctx)); +} + +static void kcryptd_do_work(struct work_struct *work) +{ + struct crypt_io *io = container_of(work, struct crypt_io, work); + + if (io->post_process) + process_read_endio(io); + else if (bio_data_dir(io->base_bio) == READ) + process_read(io); + else + process_write(io); } /* @@ -477,7 +693,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) buffer[2] = '\0'; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { buffer[0] = *hex++; buffer[1] = *hex++; @@ -500,13 +716,38 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) { unsigned int i; - for(i = 0; i < size; i++) { + for (i = 0; i < size; i++) { sprintf(hex, "%02x", *key); hex += 2; key++; } } +static int crypt_set_key(struct crypt_config *cc, char *key) +{ + unsigned key_size = strlen(key) >> 1; + + if (cc->key_size && cc->key_size != key_size) + return -EINVAL; + + cc->key_size = key_size; /* initial settings */ + + if ((!key_size && strcmp(key, "-")) || + 
(key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + return -EINVAL; + + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + return 0; +} + +static int crypt_wipe_key(struct crypt_config *cc) +{ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + memset(&cc->key, 0, cc->key_size * sizeof(u8)); + return 0; +} + /* * Construct an encryption mapping: * <cipher> <key> <iv_offset> <dev_path> <start> @@ -539,16 +780,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) key_size = strlen(argv[1]) >> 1; - cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); if (cc == NULL) { ti->error = "Cannot allocate transparent encryption context"; return -ENOMEM; } - cc->key_size = key_size; - if ((!key_size && strcmp(argv[1], "-") != 0) || - (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { + if (crypt_set_key(cc, argv[1])) { ti->error = "Error decoding key"; goto bad1; } @@ -581,7 +820,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->tfm = tfm; /* - * Choose ivmode. Valid modes: "plain", "essiv:<esshash>". + * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". * See comments at iv code */ @@ -591,6 +830,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = &crypt_iv_plain_ops; else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; + else if (strcmp(ivmode, "benbi") == 0) + cc->iv_gen_ops = &crypt_iv_benbi_ops; else { ti->error = "Invalid IV mode"; goto bad2; @@ -626,6 +867,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad4; } + cc->bs = bioset_create(MIN_IOS, MIN_IOS, 4); + if (!cc->bs) { + ti->error = "Cannot allocate crypt bioset"; + goto bad_bs; + } + if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { ti->error = "Error setting key"; goto bad5; @@ -665,6 +912,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return 0; bad5: + bioset_free(cc->bs); +bad_bs: mempool_destroy(cc->page_pool); bad4: mempool_destroy(cc->io_pool); @@ -684,6 +933,7 @@ static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = (struct crypt_config *) ti->private; + bioset_free(cc->bs); mempool_destroy(cc->page_pool); mempool_destroy(cc->io_pool); @@ -698,155 +948,27 @@ static void crypt_dtr(struct dm_target *ti) kfree(cc); } -static int crypt_endio(struct bio *bio, unsigned int done, int error) -{ - struct crypt_io *io = (struct crypt_io *) bio->bi_private; - struct crypt_config *cc = (struct crypt_config *) io->target->private; - - if (bio_data_dir(bio) == WRITE) { - /* - * free the processed pages, even if - * it's only a partially completed write - */ - crypt_free_buffer_pages(cc, bio, done); - } - - if (bio->bi_size) - return 1; - - bio_put(bio); - - /* - * successful reads are decrypted by the worker thread - */ - if ((bio_data_dir(bio) == READ) - && bio_flagged(bio, BIO_UPTODATE)) { - kcryptd_queue_io(io); - return 0; - } - - dec_pending(io, error); - return error; -} - -static inline struct bio * -crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio, - sector_t sector, unsigned int *bvec_idx, - struct convert_context *ctx) -{ - struct bio *clone; - - if (bio_data_dir(bio) == WRITE) { - clone = crypt_alloc_buffer(cc, bio->bi_size, - io->first_clone, bvec_idx); - if (clone) { - ctx->bio_out = clone; - if (crypt_convert(cc, ctx) < 0) { - crypt_free_buffer_pages(cc, clone, - clone->bi_size); - 
bio_put(clone); - return NULL; - } - } - } else { - /* - * The block layer might modify the bvec array, so always - * copy the required bvecs because we need the original - * one in order to decrypt the whole bio data *afterwards*. - */ - clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - if (clone) { - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(bio); - clone->bi_size = bio->bi_size; - memcpy(clone->bi_io_vec, bio_iovec(bio), - sizeof(struct bio_vec) * clone->bi_vcnt); - } - } - - if (!clone) - return NULL; - - clone->bi_private = io; - clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; - clone->bi_sector = cc->start + sector; - clone->bi_rw = bio->bi_rw; - - return clone; -} - static int crypt_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct crypt_config *cc = (struct crypt_config *) ti->private; - struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO); - struct convert_context ctx; - struct bio *clone; - unsigned int remaining = bio->bi_size; - sector_t sector = bio->bi_sector - ti->begin; - unsigned int bvec_idx = 0; + struct crypt_config *cc = ti->private; + struct crypt_io *io; + io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; - io->bio = bio; + io->base_bio = bio; io->first_clone = NULL; - io->error = 0; - atomic_set(&io->pending, 1); /* hold a reference */ - - if (bio_data_dir(bio) == WRITE) - crypt_convert_init(cc, &ctx, NULL, bio, sector, 1); + io->error = io->post_process = 0; + atomic_set(&io->pending, 0); + kcryptd_queue_io(io); - /* - * The allocated buffers can be smaller than the whole bio, - * so repeat the whole process until all the data can be handled. - */ - while (remaining) { - clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx); - if (!clone) - goto cleanup; - - if (!io->first_clone) { - /* - * hold a reference to the first clone, because it - * holds the bio_vec array and that can't be freed - * before all other clones are released - */ - bio_get(clone); - io->first_clone = clone; - } - atomic_inc(&io->pending); - - remaining -= clone->bi_size; - sector += bio_sectors(clone); - - generic_make_request(clone); - - /* out of memory -> run queues */ - if (remaining) - blk_congestion_wait(bio_data_dir(clone), HZ/100); - } - - /* drop reference, clones could have returned before we reach this */ - dec_pending(io, 0); - return 0; - -cleanup: - if (io->first_clone) { - dec_pending(io, -ENOMEM); - return 0; - } - - /* if no bio has been dispatched yet, we can directly return the error */ - mempool_free(io, cc->io_pool); - return -ENOMEM; + return DM_MAPIO_SUBMITTED; } static int crypt_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { struct crypt_config *cc = (struct crypt_config *) ti->private; - const char *cipher; - const char *chainmode = NULL; unsigned int sz = 0; switch (type) { @@ -855,14 +977,11 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - cipher = crypto_blkcipher_name(cc->tfm); - - chainmode = cc->chainmode; - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode); + DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, + cc->iv_mode); else - DMEMIT("%s-%s ", cipher, chainmode); + DMEMIT("%s-%s ", cc->cipher, cc->chainmode); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -883,14 +1002,71 @@ static int crypt_status(struct dm_target *ti, status_type_t type, return 0; } +static void crypt_postsuspend(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; 
+ + set_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +static int crypt_preresume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + if (!test_bit(DM_CRYPT_KEY_VALID, &cc->flags)) { + DMERR("aborting resume - crypt key is not set."); + return -EAGAIN; + } + + return 0; +} + +static void crypt_resume(struct dm_target *ti) +{ + struct crypt_config *cc = ti->private; + + clear_bit(DM_CRYPT_SUSPENDED, &cc->flags); +} + +/* Message interface + * key set <key> + * key wipe + */ +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct crypt_config *cc = ti->private; + + if (argc < 2) + goto error; + + if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { + DMWARN("not suspended during key manipulation."); + return -EINVAL; + } + if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) + return crypt_set_key(cc, argv[2]); + if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) + return crypt_wipe_key(cc); + } + +error: + DMWARN("unrecognised message received."); + return -EINVAL; +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 1, 0}, + .version= {1, 3, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, .map = crypt_map, .status = crypt_status, + .postsuspend = crypt_postsuspend, + .preresume = crypt_preresume, + .resume = crypt_resume, + .message = crypt_message, }; static int __init dm_crypt_init(void) diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index 2a374ccb30dd..265c467854da 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -40,7 +40,7 @@ static inline void free_bio(struct bio *bio) static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) { - struct path *path = bio->bi_private; + struct dm_path *path = bio->bi_private; if (bio->bi_size) return 1; @@ -61,7 +61,7 @@ static int emc_endio(struct bio *bio, unsigned int bytes_done, int error) return 0; } -static struct bio *get_failover_bio(struct path *path, unsigned data_size) +static struct bio *get_failover_bio(struct dm_path *path, unsigned data_size) { struct bio *bio; struct page *page; @@ -96,7 +96,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size) } static struct request *get_failover_req(struct emc_handler *h, - struct bio *bio, struct path *path) + struct bio *bio, struct dm_path *path) { struct request *rq; struct block_device *bdev = bio->bi_bdev; @@ -126,13 +126,14 @@ static struct request *get_failover_req(struct emc_handler *h, memset(&rq->cmd, 0, BLK_MAX_CDB); rq->timeout = EMC_FAILOVER_TIMEOUT; - rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE); + rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE; return rq; } static struct request *emc_trespass_get(struct emc_handler *h, - struct path *path) + struct dm_path *path) { struct bio *bio; struct request *rq; @@ -190,7 +191,7 @@ static struct request *emc_trespass_get(struct emc_handler *h, } static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, - struct path *path) + struct dm_path *path) { struct request *rq; struct request_queue *q = bdev_get_queue(path->dev->bdev); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index d12379b5cdb5..99cdffa7fbfe 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #define DM_MSG_PREFIX "snapshots" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ 
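The new message interface above ("key set <key>" / "key wipe") reuses crypt_set_key(), which accepts either "-" (no key) or a hex string twice the key size long, decoded two characters at a time by crypt_decode_key(). A userspace sketch of that decode step, with simplified error handling and an illustrative helper name:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Decode a hex key string of exactly 2 * size characters into size bytes,
	 * mirroring the two-characters-at-a-time approach of crypt_decode_key(). */
	static int decode_key(uint8_t *key, const char *hex, size_t size)
	{
		char buf[3] = { 0 };
		size_t i;

		if (strlen(hex) != size * 2)
			return -1;

		for (i = 0; i < size; i++) {
			char *end;

			buf[0] = hex[2 * i];
			buf[1] = hex[2 * i + 1];
			key[i] = (uint8_t)strtoul(buf, &end, 16);
			if (*end)		/* non-hex character */
				return -1;
		}
		return 0;
	}

	int main(void)
	{
		uint8_t key[4];

		if (decode_key(key, "deadbeef", sizeof(key)) == 0)
			printf("%02x %02x %02x %02x\n",
			       key[0], key[1], key[2], key[3]);
		return 0;
	}

As the crypt_message() handler above enforces, key manipulation is only honoured while the target is suspended; otherwise it warns and returns -EINVAL.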
/*----------------------------------------------------------------- * Persistent snapshots, by persistent we mean that the snapshot @@ -150,6 +151,7 @@ static int alloc_area(struct pstore *ps) static void free_area(struct pstore *ps) { vfree(ps->area); + ps->area = NULL; } /* @@ -198,48 +200,79 @@ static int read_header(struct pstore *ps, int *new_snapshot) int r; struct disk_header *dh; chunk_t chunk_size; + int chunk_size_supplied = 1; - r = chunk_io(ps, 0, READ); + /* + * Use default chunk size (or hardsect_size, if larger) if none supplied + */ + if (!ps->snap->chunk_size) { + ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_hardsect_size(ps->snap->cow->bdev) >> 9); + ps->snap->chunk_mask = ps->snap->chunk_size - 1; + ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; + chunk_size_supplied = 0; + } + + r = dm_io_get(sectors_to_pages(ps->snap->chunk_size)); if (r) return r; + r = alloc_area(ps); + if (r) + goto bad1; + + r = chunk_io(ps, 0, READ); + if (r) + goto bad2; + dh = (struct disk_header *) ps->area; if (le32_to_cpu(dh->magic) == 0) { *new_snapshot = 1; + return 0; + } - } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { - *new_snapshot = 0; - ps->valid = le32_to_cpu(dh->valid); - ps->version = le32_to_cpu(dh->version); - chunk_size = le32_to_cpu(dh->chunk_size); - if (ps->snap->chunk_size != chunk_size) { - DMWARN("chunk size %llu in device metadata overrides " - "table chunk size of %llu.", - (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); - - /* We had a bogus chunk_size. Fix stuff up. */ - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); - free_area(ps); - - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; - - r = alloc_area(ps); - if (r) - return r; - - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - } - } else { - DMWARN("Invalid/corrupt snapshot"); + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); r = -ENXIO; + goto bad2; } + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) + return 0; + + DMWARN("chunk size %llu in device metadata overrides " + "table chunk size of %llu.", + (unsigned long long)chunk_size, + (unsigned long long)ps->snap->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. 
*/ + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); + free_area(ps); + + ps->snap->chunk_size = chunk_size; + ps->snap->chunk_mask = chunk_size - 1; + ps->snap->chunk_shift = ffs(chunk_size) - 1; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + r = alloc_area(ps); + if (r) + goto bad1; + + return 0; + +bad2: + free_area(ps); +bad1: + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); return r; } @@ -263,42 +296,29 @@ static int write_header(struct pstore *ps) */ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) { - if (index >= ps->exceptions_per_area) - return NULL; + BUG_ON(index >= ps->exceptions_per_area); return ((struct disk_exception *) ps->area) + index; } -static int read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) +static void read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ result->old_chunk = le64_to_cpu(e->old_chunk); result->new_chunk = le64_to_cpu(e->new_chunk); - - return 0; } -static int write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) +static void write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) { - struct disk_exception *e; - - e = get_exception(ps, index); - if (!e) - return -EINVAL; + struct disk_exception *e = get_exception(ps, index); /* copy it */ e->old_chunk = cpu_to_le64(de->old_chunk); e->new_chunk = cpu_to_le64(de->new_chunk); - - return 0; } /* @@ -316,10 +336,7 @@ static int insert_exceptions(struct pstore *ps, int *full) *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - r = read_exception(ps, i, &de); - - if (r) - return r; + read_exception(ps, i, &de); /* * If the new_chunk is pointing at the start of @@ -519,6 +536,16 @@ static void persistent_commit(struct exception_store *store, if (r) ps->valid = 0; + /* + * Have we completely filled the current area ? + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } + for (i = 0; i < ps->callback_count; i++) { cb = ps->callbacks + i; cb->callback(cb->context, r == 0 ? 1 : 0); @@ -526,16 +553,6 @@ static void persistent_commit(struct exception_store *store, ps->callback_count = 0; } - - /* - * Have we completely filled the current area ? 
- */ - if (ps->current_committed == ps->exceptions_per_area) { - ps->current_committed = 0; - r = zero_area(ps, ps->current_area + 1); - if (r) - ps->valid = 0; - } } static void persistent_drop(struct exception_store *store) @@ -547,32 +564,22 @@ static void persistent_drop(struct exception_store *store) DMWARN("write header failed"); } -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +int dm_create_persistent(struct exception_store *store) { - int r; struct pstore *ps; - r = dm_io_get(sectors_to_pages(chunk_size)); - if (r) - return r; - /* allocate the pstore */ ps = kmalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) { - r = -ENOMEM; - goto bad; - } + if (!ps) + return -ENOMEM; ps->snap = store->snap; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; ps->next_free = 2; /* skipping the header and first area */ ps->current_committed = 0; - r = alloc_area(ps); - if (r) - goto bad; - ps->callback_count = 0; atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; @@ -586,13 +593,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) store->context = ps; return 0; - - bad: - dm_io_put(sectors_to_pages(chunk_size)); - if (ps && ps->area) - free_area(ps); - kfree(ps); - return r; } /*----------------------------------------------------------------- @@ -642,18 +642,16 @@ static void transient_fraction_full(struct exception_store *store, *denominator = get_dev_size(store->snap->cow->bdev); } -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize) +int dm_create_transient(struct exception_store *store) { struct transient_c *tc; - memset(store, 0, sizeof(*store)); store->destroy = transient_destroy; store->read_metadata = transient_read_metadata; store->prepare_exception = transient_prepare; store->commit_exception = transient_commit; + store->drop_snapshot = NULL; store->fraction_full = transient_fraction_full; - store->snap = s; tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); if (!tc) diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h index 15f5629e231a..32eff28e4adc 100644 --- a/drivers/md/dm-hw-handler.h +++ b/drivers/md/dm-hw-handler.h @@ -32,7 +32,7 @@ struct hw_handler_type { void (*destroy) (struct hw_handler *hwh); void (*pg_init) (struct hw_handler *hwh, unsigned bypassed, - struct path *path); + struct dm_path *path); unsigned (*error) (struct hw_handler *hwh, struct bio *bio); int (*status) (struct hw_handler *hwh, status_type_t type, char *result, unsigned int maxlen); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index da663d2ff552..4eb73d395213 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -92,12 +92,12 @@ void dm_io_put(unsigned int num_pages) *---------------------------------------------------------------*/ static inline void bio_set_region(struct bio *bio, unsigned region) { - bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; + bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; } static inline unsigned bio_get_region(struct bio *bio) { - return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; + return bio->bi_io_vec[bio->bi_max_vecs].bv_len; } /*----------------------------------------------------------------- @@ -136,6 +136,7 @@ static int endio(struct bio *bio, unsigned int done, int error) zero_fill_bio(bio); dec_count(io, bio_get_region(bio), error); + bio->bi_max_vecs++; bio_put(bio); return 0; @@ -250,16 +251,18 @@ static void do_region(int rw, unsigned int region, struct io_region *where, while (remaining) { /* - * 
Allocate a suitably sized bio, we add an extra - * bvec for bio_get/set_region(). + * Allocate a suitably sized-bio: we add an extra + * bvec for bio_get/set_region() and decrement bi_max_vecs + * to hide it from bio_add_page(). */ - num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; + num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2; bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; bio->bi_private = io; bio->bi_destructor = dm_bio_destructor; + bio->bi_max_vecs--; bio_set_region(bio, region); /* @@ -302,7 +305,7 @@ static void dispatch_io(int rw, unsigned int num_regions, } /* - * Drop the extra refence that we were holding to avoid + * Drop the extra reference that we were holding to avoid * the io being completed too early. */ dec_count(io, 0, 0); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d13bb15a8a02..cd6a184536a1 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -606,9 +606,14 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) return __get_name_cell(param->name); md = dm_get_md(huge_decode_dev(param->dev)); - if (md) - mdptr = dm_get_mdptr(md); + if (!md) + goto out; + mdptr = dm_get_mdptr(md); + if (!mdptr) + dm_put(md); + +out: return mdptr; } @@ -760,7 +765,7 @@ out: static int do_suspend(struct dm_ioctl *param) { int r = 0; - int do_lockfs = 1; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct mapped_device *md; md = find_device(param); @@ -768,10 +773,12 @@ static int do_suspend(struct dm_ioctl *param) return -ENXIO; if (param->flags & DM_SKIP_LOCKFS_FLAG) - do_lockfs = 0; + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended(md)) - r = dm_suspend(md, do_lockfs); + r = dm_suspend(md, suspend_flags); if (!r) r = __dev_status(md, param); @@ -783,7 +790,7 @@ static int do_suspend(struct dm_ioctl *param) static int do_resume(struct dm_ioctl *param) { int r = 0; - int do_lockfs = 1; + unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct hash_cell *hc; struct mapped_device *md; struct dm_table *new_map; @@ -809,9 +816,11 @@ static int do_resume(struct dm_ioctl *param) if (new_map) { /* Suspend if it isn't already suspended */ if (param->flags & DM_SKIP_LOCKFS_FLAG) - do_lockfs = 0; + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; if (!dm_suspended(md)) - dm_suspend(md, do_lockfs); + dm_suspend(md, suspend_flags); r = dm_swap_table(md, new_map); if (r) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 47b3c62bbdb8..17753d80ad22 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -77,7 +77,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = lc->dev->bdev; bio->bi_sector = lc->start + (bio->bi_sector - ti->begin); - return 1; + return DM_MAPIO_REMAPPED; } static int linear_status(struct dm_target *ti, status_type_t type, @@ -98,14 +98,31 @@ static int linear_status(struct dm_target *ti, status_type_t type, return 0; } +static int linear_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + struct block_device *bdev = lc->dev->bdev; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + + fake_file.f_mode = lc->dev->mode; + 
fake_file.f_path.dentry = &fake_dentry; + fake_dentry.d_inode = bdev->bd_inode; + + return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 1}, + .version= {1, 0, 2}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, .map = linear_map, .status = linear_status, + .ioctl = linear_ioctl, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 64b764bd02cc..6a9261351848 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -466,6 +466,7 @@ static int disk_resume(struct dirty_log *log) /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); + lc->sync_search = 0; /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; @@ -480,6 +481,13 @@ static uint32_t core_get_region_size(struct dirty_log *log) return lc->region_size; } +static int core_resume(struct dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + lc->sync_search = 0; + return 0; +} + static int core_is_clean(struct dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; @@ -549,16 +557,19 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region) return 1; } -static void core_complete_resync_work(struct dirty_log *log, region_t region, - int success) +static void core_set_region_sync(struct dirty_log *log, region_t region, + int in_sync) { struct log_c *lc = (struct log_c *) log->context; log_clear_bit(lc, lc->recovering_bits, region); - if (success) { + if (in_sync) { log_set_bit(lc, lc->sync_bits, region); lc->sync_count++; - } + } else if (log_test_bit(lc->sync_bits, region)) { + lc->sync_count--; + log_clear_bit(lc, lc->sync_bits, region); + } } static region_t core_get_sync_count(struct dirty_log *log) @@ -618,6 +629,7 @@ static struct dirty_log_type _core_type = { .module = THIS_MODULE, .ctr = core_ctr, .dtr = core_dtr, + .resume = core_resume, .get_region_size = core_get_region_size, .is_clean = core_is_clean, .in_sync = core_in_sync, @@ -625,7 +637,7 @@ static struct dirty_log_type _core_type = { .mark_region = core_mark_region, .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, + .set_region_sync = core_set_region_sync, .get_sync_count = core_get_sync_count, .status = core_status, }; @@ -644,7 +656,7 @@ static struct dirty_log_type _disk_type = { .mark_region = core_mark_region, .clear_region = core_clear_region, .get_resync_work = core_get_resync_work, - .complete_resync_work = core_complete_resync_work, + .set_region_sync = core_set_region_sync, .get_sync_count = core_get_sync_count, .status = disk_status, }; diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h index 5ae5309ebf28..86a301c8daf1 100644 --- a/drivers/md/dm-log.h +++ b/drivers/md/dm-log.h @@ -90,12 +90,12 @@ struct dirty_log_type { int (*get_resync_work)(struct dirty_log *log, region_t *region); /* - * This notifies the log that the resync of an area has - * been completed. The log should then mark this region - * as CLEAN. + * This notifies the log that the resync status of a region + * has changed. It also clears the region from the recovering + * list (if present). 
*/ - void (*complete_resync_work)(struct dirty_log *log, - region_t region, int success); + void (*set_region_sync)(struct dirty_log *log, + region_t region, int in_sync); /* * Returns the number of regions that are in sync. diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 93f701ea87bc..3aa013506967 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -31,7 +31,7 @@ struct pgpath { struct priority_group *pg; /* Owning PG */ unsigned fail_count; /* Cumulative failure count */ - struct path path; + struct dm_path path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -101,11 +101,11 @@ typedef int (*action_fn) (struct pgpath *pgpath); #define MIN_IOS 256 /* Mempool size */ -static kmem_cache_t *_mpio_cache; +static struct kmem_cache *_mpio_cache; struct workqueue_struct *kmultipathd; -static void process_queued_ios(void *data); -static void trigger_event(void *data); +static void process_queued_ios(struct work_struct *work); +static void trigger_event(struct work_struct *work); /*----------------------------------------------- @@ -114,12 +114,10 @@ static void trigger_event(void *data); static struct pgpath *alloc_pgpath(void) { - struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL); + struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - memset(pgpath, 0, sizeof(*pgpath)); + if (pgpath) pgpath->path.is_active = 1; - } return pgpath; } @@ -133,12 +131,10 @@ static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; - pg = kmalloc(sizeof(*pg), GFP_KERNEL); - if (!pg) - return NULL; + pg = kzalloc(sizeof(*pg), GFP_KERNEL); - memset(pg, 0, sizeof(*pg)); - INIT_LIST_HEAD(&pg->pgpaths); + if (pg) + INIT_LIST_HEAD(&pg->pgpaths); return pg; } @@ -168,23 +164,24 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(void) +static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; - m = kmalloc(sizeof(*m), GFP_KERNEL); + m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { - memset(m, 0, sizeof(*m)); INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); m->queue_io = 1; - INIT_WORK(&m->process_queued_ios, process_queued_ios, m); - INIT_WORK(&m->trigger_event, trigger_event, m); + INIT_WORK(&m->process_queued_ios, process_queued_ios); + INIT_WORK(&m->trigger_event, trigger_event); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); return NULL; } + m->ti = ti; + ti->private = m; } return m; @@ -232,7 +229,7 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) { - struct path *path; + struct dm_path *path; path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); if (!path) @@ -285,10 +282,27 @@ failed: m->current_pg = NULL; } +/* + * Check whether bios must be queued in the device-mapper core rather + * than here in the target. + * + * m->lock must be held on entry. + * + * If m->queue_if_no_path and m->saved_queue_if_no_path hold the + * same value then we are not between multipath_presuspend() + * and multipath_resume() calls and we have no need to check + * for the DMF_NOFLUSH_SUSPENDING flag. 
+ */ +static int __must_push_back(struct multipath *m) +{ + return (m->queue_if_no_path != m->saved_queue_if_no_path && + dm_noflush_suspending(m->ti)); +} + static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, unsigned was_queued) { - int r = 1; + int r = DM_MAPIO_REMAPPED; unsigned long flags; struct pgpath *pgpath; @@ -313,11 +327,13 @@ static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio, !m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; - r = 0; - } else if (!pgpath) - r = -EIO; /* Failed */ - else + r = DM_MAPIO_SUBMITTED; + } else if (pgpath) bio->bi_bdev = pgpath->path.dev->bdev; + else if (__must_push_back(m)) + r = DM_MAPIO_REQUEUE; + else + r = -EIO; /* Failed */ mpio->pgpath = pgpath; @@ -375,16 +391,19 @@ static void dispatch_queued_ios(struct multipath *m) r = map_io(m, bio, mpio, 1); if (r < 0) bio_endio(bio, bio->bi_size, r); - else if (r == 1) + else if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); + else if (r == DM_MAPIO_REQUEUE) + bio_endio(bio, bio->bi_size, -EIO); bio = next; } } -static void process_queued_ios(void *data) +static void process_queued_ios(struct work_struct *work) { - struct multipath *m = (struct multipath *) data; + struct multipath *m = + container_of(work, struct multipath, process_queued_ios); struct hw_handler *hwh = &m->hw_handler; struct pgpath *pgpath = NULL; unsigned init_required = 0, must_queue = 1; @@ -424,9 +443,10 @@ out: * An event is triggered whenever a path is taken out of use. * Includes path failure and PG bypass. */ -static void trigger_event(void *data) +static void trigger_event(struct work_struct *work) { - struct multipath *m = (struct multipath *) data; + struct multipath *m = + container_of(work, struct multipath, trigger_event); dm_table_event(m->ti->table); } @@ -557,8 +577,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, } static struct priority_group *parse_priority_group(struct arg_set *as, - struct multipath *m, - struct dm_target *ti) + struct multipath *m) { static struct param _params[] = { {1, 1024, "invalid number of paths"}, @@ -568,6 +587,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, int r; unsigned i, nr_selector_args, nr_params; struct priority_group *pg; + struct dm_target *ti = m->ti; if (as->argc < 2) { as->argc = 0; @@ -624,12 +644,12 @@ static struct priority_group *parse_priority_group(struct arg_set *as, return NULL; } -static int parse_hw_handler(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_hw_handler(struct arg_set *as, struct multipath *m) { int r; struct hw_handler_type *hwht; unsigned hw_argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1024, "invalid number of hardware handler args"}, @@ -661,11 +681,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m, return 0; } -static int parse_features(struct arg_set *as, struct multipath *m, - struct dm_target *ti) +static int parse_features(struct arg_set *as, struct multipath *m) { int r; unsigned argc; + struct dm_target *ti = m->ti; static struct param _params[] = { {0, 1, "invalid number of feature args"}, @@ -704,19 +724,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, as.argc = argc; as.argv = argv; - m = alloc_multipath(); + m = alloc_multipath(ti); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; } - m->ti = ti; - - r = parse_features(&as, m, ti); + r = parse_features(&as, m); 
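The __must_push_back() check and the new DM_MAPIO_* return codes above let multipath hand a bio back to the dm core for later requeueing, rather than failing it, when every path is gone during a no-flush suspend. A simplified standalone model of that decision; the enum names, values, and struct fields are illustrative stand-ins, not the kernel's definitions:

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative stand-ins for the map return codes used by the patch. */
	enum map_result {
		MAP_SUBMITTED,	/* target queued the bio internally */
		MAP_REMAPPED,	/* bio redirected, caller submits it */
		MAP_REQUEUE,	/* push back to the dm core for requeue */
		MAP_ERROR,	/* fail the bio with -EIO */
	};

	struct mpath_state {
		bool has_valid_path;
		bool queue_if_no_path;		/* current policy */
		bool saved_queue_if_no_path;	/* policy saved at presuspend */
		bool noflush_suspending;	/* suspending without flush */
	};

	/* Push back only between presuspend and resume (the policies differ)
	 * while a no-flush suspend is in progress -- the logic of
	 * __must_push_back(). */
	static bool must_push_back(const struct mpath_state *m)
	{
		return m->queue_if_no_path != m->saved_queue_if_no_path &&
		       m->noflush_suspending;
	}

	static enum map_result map_io(const struct mpath_state *m)
	{
		if (m->has_valid_path)
			return MAP_REMAPPED;
		if (m->queue_if_no_path)
			return MAP_SUBMITTED;	/* hold on the target's queue */
		if (must_push_back(m))
			return MAP_REQUEUE;
		return MAP_ERROR;
	}

	int main(void)
	{
		struct mpath_state m = {
			.has_valid_path = false,
			.queue_if_no_path = false,
			.saved_queue_if_no_path = true,
			.noflush_suspending = true,
		};

		printf("result = %d (2 means requeue)\n", map_io(&m));
		return 0;
	}

This pairs with the DM_NOFLUSH_FLAG handling added to do_suspend()/do_resume() in dm-ioctl.c above: pushed-back bios can be re-mapped after resume instead of being flushed or failed during the suspend.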
if (r) goto bad; - r = parse_hw_handler(&as, m, ti); + r = parse_hw_handler(&as, m); if (r) goto bad; @@ -732,7 +750,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, while (as.argc) { struct priority_group *pg; - pg = parse_priority_group(&as, m, ti); + pg = parse_priority_group(&as, m); if (!pg) { r = -EINVAL; goto bad; @@ -752,8 +770,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } - ti->private = m; - return 0; bad: @@ -788,7 +804,7 @@ static int multipath_map(struct dm_target *ti, struct bio *bio, map_context->ptr = mpio; bio->bi_rw |= (1 << BIO_RW_FAILFAST); r = map_io(m, bio, mpio, 0); - if (r < 0) + if (r < 0 || r == DM_MAPIO_REQUEUE) mempool_free(mpio, m->mpio_pool); return r; @@ -962,7 +978,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) /* * pg_init must call this when it has completed its initialisation */ -void dm_pg_init_complete(struct path *path, unsigned err_flags) +void dm_pg_init_complete(struct dm_path *path, unsigned err_flags) { struct pgpath *pgpath = path_to_pgpath(path); struct priority_group *pg = pgpath->pg; @@ -1012,7 +1028,10 @@ static int do_end_io(struct multipath *m, struct bio *bio, spin_lock_irqsave(&m->lock, flags); if (!m->nr_valid_paths) { - if (!m->queue_if_no_path) { + if (__must_push_back(m)) { + spin_unlock_irqrestore(&m->lock, flags); + return DM_ENDIO_REQUEUE; + } else if (!m->queue_if_no_path) { spin_unlock_irqrestore(&m->lock, flags); return -EIO; } else { @@ -1047,7 +1066,7 @@ static int do_end_io(struct multipath *m, struct bio *bio, queue_work(kmultipathd, &m->process_queued_ios); spin_unlock_irqrestore(&m->lock, flags); - return 1; /* io not complete */ + return DM_ENDIO_INCOMPLETE; /* io not complete */ } static int multipath_end_io(struct dm_target *ti, struct bio *bio, @@ -1065,7 +1084,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path); } - if (r <= 0) + if (r != DM_ENDIO_INCOMPLETE) mempool_free(mpio, m->mpio_pool); return r; @@ -1266,12 +1285,47 @@ error: return -EINVAL; } +static int multipath_ioctl(struct dm_target *ti, struct inode *inode, + struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct multipath *m = (struct multipath *) ti->private; + struct block_device *bdev = NULL; + unsigned long flags; + struct file fake_file = {}; + struct dentry fake_dentry = {}; + int r = 0; + + fake_file.f_path.dentry = &fake_dentry; + + spin_lock_irqsave(&m->lock, flags); + + if (!m->current_pgpath) + __choose_pgpath(m); + + if (m->current_pgpath) { + bdev = m->current_pgpath->path.dev->bdev; + fake_dentry.d_inode = bdev->bd_inode; + fake_file.f_mode = m->current_pgpath->path.dev->mode; + } + + if (m->queue_io) + r = -EAGAIN; + else if (!bdev) + r = -EIO; + + spin_unlock_irqrestore(&m->lock, flags); + + return r ? 
: blkdev_driver_ioctl(bdev->bd_inode, &fake_file, + bdev->bd_disk, cmd, arg); +} + /*----------------------------------------------------------------- * Module setup *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 4}, + .version = {1, 0, 5}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, @@ -1281,6 +1335,7 @@ static struct target_type multipath_target = { .resume = multipath_resume, .status = multipath_status, .message = multipath_message, + .ioctl = multipath_ioctl, }; static int __init dm_multipath_init(void) diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h index 8a4bf2b6d52e..b9cdcbb3ed59 100644 --- a/drivers/md/dm-mpath.h +++ b/drivers/md/dm-mpath.h @@ -11,7 +11,7 @@ struct dm_dev; -struct path { +struct dm_path { struct dm_dev *dev; /* Read-only */ unsigned is_active; /* Read-only */ @@ -20,6 +20,6 @@ struct path { }; /* Callback for hwh_pg_init_fn to use when complete */ -void dm_pg_init_complete(struct path *path, unsigned err_flags); +void dm_pg_init_complete(struct dm_path *path, unsigned err_flags); #endif diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 732d06a84f85..27357b85d73d 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -44,7 +44,7 @@ struct path_selector_type { * Add an opaque path object, along with some selector specific * path args (eg, path priority). */ - int (*add_path) (struct path_selector *ps, struct path *path, + int (*add_path) (struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error); /* @@ -55,27 +55,27 @@ struct path_selector_type { * calling the function again. 0 means don't call it again unless * the path fails. */ - struct path *(*select_path) (struct path_selector *ps, + struct dm_path *(*select_path) (struct path_selector *ps, unsigned *repeat_count); /* * Notify the selector that a path has failed. */ - void (*fail_path) (struct path_selector *ps, struct path *p); + void (*fail_path) (struct path_selector *ps, struct dm_path *p); /* * Ask selector to reinstate a path. 
*/ - int (*reinstate_path) (struct path_selector *ps, struct path *p); + int (*reinstate_path) (struct path_selector *ps, struct dm_path *p); /* * Table content based on parameters added in ps_add_path_fn * or path selector status */ - int (*status) (struct path_selector *ps, struct path *path, + int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct path *path); + int (*end_io) (struct path_selector *ps, struct dm_path *path); }; /* Register a path selector */ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c54de989eb00..23a642619bed 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -24,6 +24,7 @@ static struct workqueue_struct *_kmirrord_wq; static struct work_struct _kmirrord_work; +static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); static inline void wake(void) { @@ -83,6 +84,7 @@ struct region_hash { struct list_head *buckets; spinlock_t region_lock; + atomic_t recovery_in_flight; struct semaphore recovery_count; struct list_head clean_regions; struct list_head quiesced_regions; @@ -191,6 +193,7 @@ static int rh_init(struct region_hash *rh, struct mirror_set *ms, spin_lock_init(&rh->region_lock); sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); INIT_LIST_HEAD(&rh->clean_regions); INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); @@ -341,6 +344,17 @@ static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) } } +static void complete_resync_work(struct region *reg, int success) +{ + struct region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + dispatch_bios(rh->ms, &reg->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + wake_up_all(&_kmirrord_recovery_stopped); + up(&rh->recovery_count); +} + static void rh_update_states(struct region_hash *rh) { struct region *reg, *next; @@ -380,9 +394,7 @@ static void rh_update_states(struct region_hash *rh) */ list_for_each_entry_safe (reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); - rh->log->type->complete_resync_work(rh->log, reg->key, 1); - dispatch_bios(rh->ms, &reg->delayed_bios); - up(&rh->recovery_count); + complete_resync_work(reg, 1); mempool_free(reg, rh->region_pool); } @@ -502,11 +514,21 @@ static int __rh_recovery_prepare(struct region_hash *rh) static void rh_recovery_prepare(struct region_hash *rh) { - while (!down_trylock(&rh->recovery_count)) + /* Extra reference to avoid race with rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); up(&rh->recovery_count); break; } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + wake_up_all(&_kmirrord_recovery_stopped); } /* @@ -868,7 +890,7 @@ static void do_mirror(struct mirror_set *ms) do_writes(ms, &writes); } -static void do_work(void *ignored) +static void do_work(struct work_struct *ignored) { struct mirror_set *ms; @@ -1122,7 +1144,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (rw == WRITE) { queue_bio(ms, bio, rw); - return 0; + return DM_MAPIO_SUBMITTED; } r = ms->rh.log->type->in_sync(ms->rh.log, @@ -1131,7 +1153,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, return r; if (r == -EWOULDBLOCK) /* FIXME: ugly */ - r = 0; + r = 
DM_MAPIO_SUBMITTED; /* * We don't want to fast track a recovery just for a read @@ -1144,7 +1166,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, if (!r) { /* Pass this io over to the daemon */ queue_bio(ms, bio, rw); - return 0; + return DM_MAPIO_SUBMITTED; } m = choose_mirror(ms, bio->bi_sector); @@ -1152,7 +1174,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, return -EIO; map_bio(ms, m, bio); - return 1; + return DM_MAPIO_REMAPPED; } static int mirror_end_io(struct dm_target *ti, struct bio *bio, @@ -1177,6 +1199,11 @@ static void mirror_postsuspend(struct dm_target *ti) struct dirty_log *log = ms->rh.log; rh_stop_recovery(&ms->rh); + + /* Wait for all I/O we generated to complete */ + wait_event(_kmirrord_recovery_stopped, + !atomic_read(&ms->rh.recovery_in_flight)); + if (log->type->suspend && log->type->suspend(log)) /* FIXME: need better error handling */ DMWARN("log suspend failed"); @@ -1213,9 +1240,9 @@ static int mirror_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - DMEMIT("%d ", ms->nr_mirrors); + DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) - DMEMIT("%s %llu ", ms->mirror[m].dev->name, + DMEMIT(" %s %llu", ms->mirror[m].dev->name, (unsigned long long)ms->mirror[m].offset); } @@ -1249,7 +1276,7 @@ static int __init dm_mirror_init(void) dm_dirty_log_exit(); return r; } - INIT_WORK(&_kmirrord_work, do_work, NULL); + INIT_WORK(&_kmirrord_work, do_work); r = dm_register_target(&mirror_target); if (r < 0) { diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index c5a16c550122..a348a97b65af 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -21,7 +21,7 @@ *---------------------------------------------------------------*/ struct path_info { struct list_head list; - struct path *path; + struct dm_path *path; unsigned repeat_count; }; @@ -80,7 +80,7 @@ static void rr_destroy(struct path_selector *ps) ps->context = NULL; } -static int rr_status(struct path_selector *ps, struct path *path, +static int rr_status(struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen) { struct path_info *pi; @@ -106,7 +106,7 @@ static int rr_status(struct path_selector *ps, struct path *path, * Called during initialisation to register each path with an * optional repeat_count. 
*/ -static int rr_add_path(struct path_selector *ps, struct path *path, +static int rr_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { struct selector *s = (struct selector *) ps->context; @@ -136,12 +136,12 @@ static int rr_add_path(struct path_selector *ps, struct path *path, path->pscontext = pi; - list_add(&pi->list, &s->valid_paths); + list_add_tail(&pi->list, &s->valid_paths); return 0; } -static void rr_fail_path(struct path_selector *ps, struct path *p) +static void rr_fail_path(struct path_selector *ps, struct dm_path *p) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = p->pscontext; @@ -149,7 +149,7 @@ static void rr_fail_path(struct path_selector *ps, struct path *p) list_move(&pi->list, &s->invalid_paths); } -static int rr_reinstate_path(struct path_selector *ps, struct path *p) +static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = p->pscontext; @@ -159,7 +159,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct path *p) return 0; } -static struct path *rr_select_path(struct path_selector *ps, +static struct dm_path *rr_select_path(struct path_selector *ps, unsigned *repeat_count) { struct selector *s = (struct selector *) ps->context; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1d0fafda0f76..0821a2b68a73 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -39,6 +39,9 @@ */ #define SNAPSHOT_PAGES 256 +static struct workqueue_struct *ksnapd; +static void flush_queued_bios(struct work_struct *work); + struct pending_exception { struct exception e; @@ -56,7 +59,7 @@ struct pending_exception { /* * The primary pending_exception is the one that holds - * the sibling_count and the list of origin_bios for a + * the ref_count and the list of origin_bios for a * group of pending_exceptions. It is always last to get freed. * These fields get set up when writing to the origin. */ @@ -69,7 +72,7 @@ struct pending_exception { * the sibling concerned and not pe->primary_pe->snap->lock unless * they are the same. 
*/ - atomic_t sibling_count; + atomic_t ref_count; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -85,8 +88,8 @@ struct pending_exception { * Hash table mapping origin volumes to lists of snapshots and * a lock to protect it */ -static kmem_cache_t *exception_cache; -static kmem_cache_t *pending_cache; +static struct kmem_cache *exception_cache; +static struct kmem_cache *pending_cache; static mempool_t *pending_pool; /* @@ -225,7 +228,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size) return 0; } -static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) { struct list_head *slot; struct exception *ex, *next; @@ -387,15 +390,46 @@ static inline ulong round_up(ulong n, ulong size) return (n + size) & ~size; } -static void read_snapshot_metadata(struct dm_snapshot *s) +static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, + char **error) { - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); + unsigned long chunk_size; + char *value; + + chunk_size = simple_strtoul(chunk_size_arg, &value, 10); + if (*chunk_size_arg == '\0' || *value != '\0') { + *error = "Invalid chunk size"; + return -EINVAL; + } - dm_table_event(s->table); + if (!chunk_size) { + s->chunk_size = s->chunk_mask = s->chunk_shift = 0; + return 0; } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } + + /* Validate the chunk size against the device block size */ + if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->chunk_shift = ffs(chunk_size) - 1; + + return 0; } /* @@ -404,15 +438,12 @@ static void read_snapshot_metadata(struct dm_snapshot *s) static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dm_snapshot *s; - unsigned long chunk_size; int r = -EINVAL; char persistent; char *origin_path; char *cow_path; - char *value; - int blocksize; - if (argc < 4) { + if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; goto bad1; @@ -428,13 +459,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad1; } - chunk_size = simple_strtoul(argv[3], &value, 10); - if (chunk_size == 0 || value == NULL) { - ti->error = "Invalid chunk size"; - r = -EINVAL; - goto bad1; - } - s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) { ti->error = "Cannot allocate snapshot context private " @@ -457,36 +481,17 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad2; } - /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. 
- */ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Validate the chunk size against the device block size */ - blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; - if (chunk_size % (blocksize >> 9)) { - ti->error = "Chunk size is not a multiple of device blocksize"; - r = -EINVAL; - goto bad3; - } - - /* Check chunk_size is a power of 2 */ - if (chunk_size & (chunk_size - 1)) { - ti->error = "Chunk size is not a power of 2"; - r = -EINVAL; + r = set_chunk_size(s, argv[3], &ti->error); + if (r) goto bad3; - } - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; s->type = persistent; - s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); + spin_lock_init(&s->pe_lock); s->table = ti->table; /* Allocate hash table for COW data */ @@ -496,16 +501,12 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad3; } - /* - * Check the persistent flag - done here because we need the iobuf - * to check the LV header - */ s->store.snap = s; if (persistent == 'P') - r = dm_create_persistent(&s->store, chunk_size); + r = dm_create_persistent(&s->store); else - r = dm_create_transient(&s->store, s, blocksize); + r = dm_create_transient(&s->store); if (r) { ti->error = "Couldn't create exception store"; @@ -520,7 +521,14 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Metadata must only be loaded into one table at once */ - read_snapshot_metadata(s); + r = s->store.read_metadata(&s->store); + if (r) { + ti->error = "Failed to read snapshot metadata"; + goto bad6; + } + + bio_list_init(&s->queued_bios); + INIT_WORK(&s->queued_bios_work, flush_queued_bios); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -556,21 +564,28 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) return r; } +static void __free_exceptions(struct dm_snapshot *s) +{ + kcopyd_client_destroy(s->kcopyd_client); + s->kcopyd_client = NULL; + + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + s->store.destroy(&s->store); +} + static void snapshot_dtr(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + flush_workqueue(ksnapd); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); - kcopyd_client_destroy(s->kcopyd_client); - - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); - - /* Deallocate memory used */ - s->store.destroy(&s->store); + __free_exceptions(s); dm_put_device(ti, s->origin); dm_put_device(ti, s->cow); @@ -593,6 +608,20 @@ static void flush_bios(struct bio *bio) } } +static void flush_queued_bios(struct work_struct *work) +{ + struct dm_snapshot *s = + container_of(work, struct dm_snapshot, queued_bios_work); + struct bio *queued_bios; + unsigned long flags; + + spin_lock_irqsave(&s->pe_lock, flags); + queued_bios = bio_list_get(&s->queued_bios); + spin_unlock_irqrestore(&s->pe_lock, flags); + + flush_bios(queued_bios); +} + /* * Error a list of buffers. 
*/ @@ -608,28 +637,7 @@ static void error_bios(struct bio *bio) } } -static inline void error_snapshot_bios(struct pending_exception *pe) -{ - error_bios(bio_list_get(&pe->snapshot_bios)); -} - -static struct bio *__flush_bios(struct pending_exception *pe) -{ - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - - if (pe->primary_pe && - atomic_dec_and_test(&pe->primary_pe->sibling_count)) - return bio_list_get(&pe->primary_pe->origin_bios); - - return NULL; -} - -static void __invalidate_snapshot(struct dm_snapshot *s, - struct pending_exception *pe, int err) +static void __invalidate_snapshot(struct dm_snapshot *s, int err) { if (!s->valid) return; @@ -639,9 +647,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s, else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (pe) - remove_exception(&pe->e); - if (s->store.drop_snapshot) s->store.drop_snapshot(&s->store); @@ -650,78 +655,95 @@ static void __invalidate_snapshot(struct dm_snapshot *s, dm_table_event(s->table); } +static void get_pending_exception(struct pending_exception *pe) +{ + atomic_inc(&pe->ref_count); +} + +static struct bio *put_pending_exception(struct pending_exception *pe) +{ + struct pending_exception *primary_pe; + struct bio *origin_bios = NULL; + + primary_pe = pe->primary_pe; + + /* + * If this pe is involved in a write to the origin and + * it is the last sibling to complete then release + * the bios for the original write to the origin. + */ + if (primary_pe && + atomic_dec_and_test(&primary_pe->ref_count)) + origin_bios = bio_list_get(&primary_pe->origin_bios); + + /* + * Free the pe if it's not linked to an origin write or if + * it's not itself a primary pe. + */ + if (!primary_pe || primary_pe != pe) + free_pending_exception(pe); + + /* + * Free the primary pe if nothing references it. + */ + if (primary_pe && !atomic_read(&primary_pe->ref_count)) + free_pending_exception(primary_pe); + + return origin_bios; +} + static void pending_complete(struct pending_exception *pe, int success) { struct exception *e; - struct pending_exception *primary_pe; struct dm_snapshot *s = pe->snap; - struct bio *flush = NULL; + struct bio *origin_bios = NULL; + struct bio *snapshot_bios = NULL; + int error = 0; if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); - __invalidate_snapshot(s, pe, -EIO); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -EIO); + error = 1; goto out; } e = alloc_exception(); if (!e) { down_write(&s->lock); - __invalidate_snapshot(s, pe, -ENOMEM); - flush = __flush_bios(pe); - up_write(&s->lock); - - error_snapshot_bios(pe); + __invalidate_snapshot(s, -ENOMEM); + error = 1; goto out; } *e = pe->e; - /* - * Add a proper exception, and remove the - * in-flight exception from the list. - */ down_write(&s->lock); if (!s->valid) { - flush = __flush_bios(pe); - up_write(&s->lock); - free_exception(e); - - error_snapshot_bios(pe); + error = 1; goto out; } + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
+ */ insert_exception(&s->complete, e); + + out: remove_exception(&pe->e); - flush = __flush_bios(pe); + snapshot_bios = bio_list_get(&pe->snapshot_bios); + origin_bios = put_pending_exception(pe); up_write(&s->lock); /* Submit any pending write bios */ - flush_bios(bio_list_get(&pe->snapshot_bios)); - - out: - primary_pe = pe->primary_pe; - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->sibling_count)) - free_pending_exception(primary_pe); + if (error) + error_bios(snapshot_bios); + else + flush_bios(snapshot_bios); - if (flush) - flush_bios(flush); + flush_bios(origin_bios); } static void commit_callback(void *context, int success) @@ -822,7 +844,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->primary_pe = NULL; - atomic_set(&pe->sibling_count, 1); + atomic_set(&pe->ref_count, 0); pe->snap = s; pe->started = 0; @@ -831,6 +853,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) return NULL; } + get_pending_exception(pe); insert_exception(&s->pending, &pe->e); out: @@ -850,8 +873,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, { struct exception *e; struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - int copy_needed = 0; - int r = 1; + int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct pending_exception *pe = NULL; @@ -865,32 +887,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, if (unlikely(bio_barrier(bio))) return -EOPNOTSUPP; + /* FIXME: should only take write lock if we need + * to copy an exception */ + down_write(&s->lock); + + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio); + goto out_unlock; + } + /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are * writeable. */ if (bio_rw(bio) == WRITE) { - - /* FIXME: should only take write lock if we need - * to copy an exception */ - down_write(&s->lock); - - if (!s->valid) { - r = -EIO; - goto out_unlock; - } - - /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); - if (e) { - remap_exception(s, e, bio); - goto out_unlock; - } - pe = __find_pending_exception(s, bio); if (!pe) { - __invalidate_snapshot(s, pe, -ENOMEM); + __invalidate_snapshot(s, -ENOMEM); r = -EIO; goto out_unlock; } @@ -898,45 +919,27 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, remap_exception(s, &pe->e, bio); bio_list_add(&pe->snapshot_bios, bio); + r = DM_MAPIO_SUBMITTED; + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - copy_needed = 1; - } - - r = 0; - - out_unlock: - up_write(&s->lock); - - if (copy_needed) + up_write(&s->lock); start_copy(pe); - } else { + goto out; + } + } else /* * FIXME: this read path scares me because we * always use the origin when we have a pending * exception. However I can't think of a * situation where this is wrong - ejt. 
*/ + bio->bi_bdev = s->origin->bdev; - /* Do reads */ - down_read(&s->lock); - - if (!s->valid) { - up_read(&s->lock); - return -EIO; - } - - /* See if it it has been remapped */ - e = lookup_exception(&s->complete, chunk); - if (e) - remap_exception(s, e, bio); - else - bio->bi_bdev = s->origin->bdev; - - up_read(&s->lock); - } - + out_unlock: + up_write(&s->lock); + out: return r; } @@ -994,7 +997,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, *---------------------------------------------------------------*/ static int __origin_write(struct list_head *snapshots, struct bio *bio) { - int r = 1, first = 0; + int r = DM_MAPIO_REMAPPED, first = 0; struct dm_snapshot *snap; struct exception *e; struct pending_exception *pe, *next_pe, *primary_pe = NULL; @@ -1025,7 +1028,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * is already remapped in this snapshot * and trigger an exception if not. * - * sibling_count is initialised to 1 so pending_complete() + * ref_count is initialised to 1 so pending_complete() * won't destroy the primary_pe while we're inside this loop. */ e = lookup_exception(&snap->complete, chunk); @@ -1034,7 +1037,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) pe = __find_pending_exception(snap, bio); if (!pe) { - __invalidate_snapshot(snap, pe, ENOMEM); + __invalidate_snapshot(snap, -ENOMEM); goto next_snapshot; } @@ -1052,12 +1055,12 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) bio_list_add(&primary_pe->origin_bios, bio); - r = 0; + r = DM_MAPIO_SUBMITTED; } if (!pe->primary_pe) { - atomic_inc(&primary_pe->sibling_count); pe->primary_pe = primary_pe; + get_pending_exception(primary_pe); } if (!pe->started) { @@ -1070,20 +1073,20 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) } if (!primary_pe) - goto out; + return r; /* * If this is the first time we're processing this chunk and - * sibling_count is now 1 it means all the pending exceptions + * ref_count is now 1 it means all the pending exceptions * got completed while we were in the loop above, so it falls to * us here to remove the primary_pe and submit any origin_bios. */ - if (first && atomic_dec_and_test(&primary_pe->sibling_count)) { + if (first && atomic_dec_and_test(&primary_pe->ref_count)) { flush_bios(bio_list_get(&primary_pe->origin_bios)); free_pending_exception(primary_pe); /* If we got here, pe_queue is necessarily empty. */ - goto out; + return r; } /* @@ -1092,7 +1095,6 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) list_for_each_entry_safe(pe, next_pe, &pe_queue, list) start_copy(pe); - out: return r; } @@ -1102,7 +1104,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) static int do_origin(struct dm_dev *origin, struct bio *bio) { struct origin *o; - int r = 1; + int r = DM_MAPIO_REMAPPED; down_read(&_origins_lock); o = __lookup_origin(origin->bdev); @@ -1159,7 +1161,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, return -EOPNOTSUPP; /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1; + return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; } #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) @@ -1205,7 +1207,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1216,7 +1218,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -1275,8 +1277,17 @@ static int __init dm_snapshot_init(void) goto bad5; } + ksnapd = create_singlethread_workqueue("ksnapd"); + if (!ksnapd) { + DMERR("Failed to create ksnapd workqueue."); + r = -ENOMEM; + goto bad6; + } + return 0; + bad6: + mempool_destroy(pending_pool); bad5: kmem_cache_destroy(pending_cache); bad4: @@ -1294,6 +1305,8 @@ static void __exit dm_snapshot_exit(void) { int r; + destroy_workqueue(ksnapd); + r = dm_unregister_target(&snapshot_target); if (r) DMERR("snapshot unregister failed %d", r); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index fdec1e2dc871..15fa2ae6cdc2 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h @@ -10,7 +10,9 @@ #define DM_SNAPSHOT_H #include "dm.h" +#include "dm-bio-list.h" #include <linux/blkdev.h> +#include <linux/workqueue.h> struct exception_table { uint32_t hash_mask; @@ -112,10 +114,20 @@ struct dm_snapshot { struct exception_table pending; struct exception_table complete; + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + /* The on disk metadata handler */ struct exception_store store; struct kcopyd_client *kcopyd_client; + + /* Queue of snapshot writes for ksnapd to flush */ + struct bio_list queued_bios; + struct work_struct queued_bios_work; }; /* @@ -128,10 +140,9 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); * Constructor and destructor for the default persistent * store. */ -int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); +int dm_create_persistent(struct exception_store *store); -int dm_create_transient(struct exception_store *store, - struct dm_snapshot *s, int blocksize); +int dm_create_transient(struct exception_store *store); /* * Return the number of sectors in the device. 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 6c29fcecd892..51f5e0760012 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -186,7 +186,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = sc->stripe[stripe].dev->bdev; bio->bi_sector = sc->stripe[stripe].physical_start + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); - return 1; + return DM_MAPIO_REMAPPED; } static int stripe_status(struct dm_target *ti, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 75fe9493e6af..05befa91807a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -522,56 +522,61 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, return 0; } - -int dm_get_device(struct dm_target *ti, const char *path, sector_t start, - sector_t len, int mode, struct dm_dev **result) +void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) { - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - if (!r) { - request_queue_t *q = bdev_get_queue((*result)->bdev); - struct io_restrictions *rs = &ti->limits; - - /* - * Combine the device limits low. - * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ + request_queue_t *q = bdev_get_queue(bdev); + struct io_restrictions *rs = &ti->limits; + + /* + * Combine the device limits low. + * + * FIXME: if we move an io_restriction struct + * into q this would just be a call to + * combine_restrictions_low() + */ + rs->max_sectors = + min_not_zero(rs->max_sectors, q->max_sectors); + + /* FIXME: Device-Mapper on top of RAID-0 breaks because DM + * currently doesn't honor MD's merge_bvec_fn routine. + * In this case, we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. A better fix is in the + * works, but add this for the time being so it will at + * least operate correctly. + */ + if (q->merge_bvec_fn) rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, + (unsigned int) (PAGE_SIZE >> 9)); - /* FIXME: Device-Mapper on top of RAID-0 breaks because DM - * currently doesn't honor MD's merge_bvec_fn routine. - * In this case, we'll force DM to use PAGE_SIZE or - * smaller I/O, just to be safe. A better fix is in the - * works, but add this for the time being so it will at - * least operate correctly. 
- */ - if (q->merge_bvec_fn) - rs->max_sectors = - min_not_zero(rs->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + rs->max_phys_segments = + min_not_zero(rs->max_phys_segments, + q->max_phys_segments); - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + rs->max_hw_segments = + min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->max_segment_size = + min_not_zero(rs->max_segment_size, q->max_segment_size); - rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + rs->seg_boundary_mask = + min_not_zero(rs->seg_boundary_mask, + q->seg_boundary_mask); - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); +} +EXPORT_SYMBOL_GPL(dm_set_device_limits); - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); - } +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r = __table_get_device(ti->table, ti, path, + start, len, mode, result); + + if (!r) + dm_set_device_limits(ti, (*result)->bdev); return r; } @@ -939,9 +944,20 @@ void dm_table_postsuspend_targets(struct dm_table *t) return suspend_targets(t, 1); } -void dm_table_resume_targets(struct dm_table *t) +int dm_table_resume_targets(struct dm_table *t) { - int i; + int i, r = 0; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (!ti->type->preresume) + continue; + + r = ti->type->preresume(ti); + if (r) + return r; + } for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = t->targets + i; @@ -949,6 +965,8 @@ void dm_table_resume_targets(struct dm_table *t) if (ti->type->resume) ti->type->resume(ti); } + + return 0; } int dm_table_any_congested(struct dm_table *t, int bdi_bits) @@ -983,6 +1001,11 @@ int dm_table_flush_all(struct dm_table *t) { struct list_head *d, *devices = dm_table_get_devices(t); int ret = 0; + unsigned i; + + for (i = 0; i < t->num_targets; i++) + if (t->targets[i].type->flush) + t->targets[i].type->flush(&t->targets[i]); for (d = devices->next; d != devices; d = d->next) { struct dm_dev *dd = list_entry(d, struct dm_dev, list); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index ea569f7348d2..f314d7dc9c26 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -46,7 +46,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio, bio_endio(bio, bio->bi_size, 0); /* accepted bio, don't make new request */ - return 0; + return DM_MAPIO_SUBMITTED; } static struct target_type zero_target = { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c99bf9f01759..fe7c56e10435 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/blktrace_api.h> +#include <linux/smp_lock.h> #define DM_MSG_PREFIX "core" @@ -67,10 +68,12 @@ union map_info *dm_get_mapinfo(struct bio *bio) #define DMF_FROZEN 2 #define DMF_FREEING 3 #define DMF_DELETING 4 +#define DMF_NOFLUSH_SUSPENDING 5 struct mapped_device { struct rw_semaphore io_lock; struct semaphore suspend_lock; + spinlock_t pushback_lock; rwlock_t map_lock; atomic_t holders; atomic_t open_count; @@ -88,7 +91,8 @@ struct mapped_device { */ atomic_t pending; wait_queue_head_t 
wait; - struct bio_list deferred; + struct bio_list deferred; + struct bio_list pushback; /* * The current mapping. @@ -101,6 +105,8 @@ struct mapped_device { mempool_t *io_pool; mempool_t *tio_pool; + struct bio_set *bs; + /* * Event handling. */ @@ -118,19 +124,13 @@ struct mapped_device { }; #define MIN_IOS 256 -static kmem_cache_t *_io_cache; -static kmem_cache_t *_tio_cache; - -static struct bio_set *dm_set; +static struct kmem_cache *_io_cache; +static struct kmem_cache *_tio_cache; static int __init local_init(void) { int r; - dm_set = bioset_create(16, 16, 4); - if (!dm_set) - return -ENOMEM; - /* allocate a slab for the dm_ios */ _io_cache = kmem_cache_create("dm_io", sizeof(struct dm_io), 0, 0, NULL, NULL); @@ -164,8 +164,6 @@ static void local_exit(void) kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); - bioset_free(dm_set); - if (unregister_blkdev(_major, _name) < 0) DMERR("unregister_blkdev failed"); @@ -288,6 +286,45 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md; + struct dm_table *map; + struct dm_target *tgt; + int r = -ENOTTY; + + /* We don't really need this lock, but we do need 'inode'. */ + unlock_kernel(); + + md = inode->i_bdev->bd_disk->private_data; + + map = dm_get_table(md); + + if (!map || !dm_table_get_size(map)) + goto out; + + /* We only support devices that have a single target */ + if (dm_table_get_num_targets(map) != 1) + goto out; + + tgt = dm_table_get_target(map, 0); + + if (dm_suspended(md)) { + r = -EAGAIN; + goto out; + } + + if (tgt->type->ioctl) + r = tgt->type->ioctl(tgt, inode, file, cmd, arg); + +out: + dm_table_put(map); + + lock_kernel(); + return r; +} + static inline struct dm_io *alloc_io(struct mapped_device *md) { return mempool_alloc(md->io_pool, GFP_NOIO); @@ -410,23 +447,50 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) * you this clearly demarcated crap. *---------------------------------------------------------------*/ +static int __noflush_suspending(struct mapped_device *md) +{ + return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); +} + /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ static void dec_pending(struct dm_io *io, int error) { - if (error) + unsigned long flags; + + /* Push-back supersedes any I/O errors */ + if (error && !(io->error > 0 && __noflush_suspending(io->md))) io->error = error; if (atomic_dec_and_test(&io->io_count)) { + if (io->error == DM_ENDIO_REQUEUE) { + /* + * Target requested pushing back the I/O. + * This must be handled before the sleeper on + * suspend queue merges the pushback list. + */ + spin_lock_irqsave(&io->md->pushback_lock, flags); + if (__noflush_suspending(io->md)) + bio_list_add(&io->md->pushback, io->bio); + else + /* noflush suspend was interrupted. 
*/ + io->error = -EIO; + spin_unlock_irqrestore(&io->md->pushback_lock, flags); + } + if (end_io_acct(io)) /* nudge anyone waiting on suspend queue */ wake_up(&io->md->wait); - blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE); + if (io->error != DM_ENDIO_REQUEUE) { + blk_add_trace_bio(io->md->queue, io->bio, + BLK_TA_COMPLETE); + + bio_endio(io->bio, io->bio->bi_size, io->error); + } - bio_endio(io->bio, io->bio->bi_size, io->error); free_io(io->md, io); } } @@ -435,7 +499,7 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) { int r = 0; struct target_io *tio = bio->bi_private; - struct dm_io *io = tio->io; + struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; if (bio->bi_size) @@ -446,17 +510,30 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) if (endio) { r = endio(tio->ti, bio, error, &tio->info); - if (r < 0) + if (r < 0 || r == DM_ENDIO_REQUEUE) + /* + * error and requeue request are handled + * in dec_pending(). + */ error = r; - - else if (r > 0) - /* the target wants another shot at the io */ + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the io */ return 1; + else if (r) { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } } - free_tio(io->md, tio); - dec_pending(io, error); + dec_pending(tio->io, error); + + /* + * Store md for cleanup instead of tio which is about to get freed. + */ + bio->bi_private = md->bs; + bio_put(bio); + free_tio(md, tio); return r; } @@ -485,6 +562,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, { int r; sector_t sector; + struct mapped_device *md; /* * Sanity checks. @@ -502,7 +580,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, atomic_inc(&tio->io->io_count); sector = clone->bi_sector; r = ti->type->map(ti, clone, &tio->info); - if (r > 0) { + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, @@ -510,14 +588,19 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, clone->bi_sector); generic_make_request(clone); - } - - else if (r < 0) { - /* error the io and bail out */ - struct dm_io *io = tio->io; - free_tio(tio->io->md, tio); - dec_pending(io, r); + } else if (r < 0 || r == DM_MAPIO_REQUEUE) { + /* error the io and bail out, or requeue it if needed */ + md = tio->io->md; + dec_pending(tio->io, r); + /* + * Store bio_set for cleanup. 
+ */ + clone->bi_private = md->bs; bio_put(clone); + free_tio(md, tio); + } else if (r) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); } } @@ -533,7 +616,9 @@ struct clone_info { static void dm_bio_destructor(struct bio *bio) { - bio_free(bio, dm_set); + struct bio_set *bs = bio->bi_private; + + bio_free(bio, bs); } /* @@ -541,12 +626,12 @@ static void dm_bio_destructor(struct bio *bio) */ static struct bio *split_bvec(struct bio *bio, sector_t sector, unsigned short idx, unsigned int offset, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; struct bio_vec *bv = bio->bi_io_vec + idx; - clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); + clone = bio_alloc_bioset(GFP_NOIO, 1, bs); clone->bi_destructor = dm_bio_destructor; *clone->bi_io_vec = *bv; @@ -566,11 +651,13 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, */ static struct bio *clone_bio(struct bio *bio, sector_t sector, unsigned short idx, unsigned short bv_count, - unsigned int len) + unsigned int len, struct bio_set *bs) { struct bio *clone; - clone = bio_clone(bio, GFP_NOIO); + clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); + __bio_clone(clone, bio); + clone->bi_destructor = dm_bio_destructor; clone->bi_sector = sector; clone->bi_idx = idx; clone->bi_vcnt = idx + bv_count; @@ -601,7 +688,8 @@ static void __clone_and_map(struct clone_info *ci) * the remaining io with a single clone. */ clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count); + bio->bi_vcnt - ci->idx, ci->sector_count, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector_count = 0; @@ -624,7 +712,8 @@ static void __clone_and_map(struct clone_info *ci) len += bv_len; } - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); + clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, + ci->md->bs); __map_bio(ti, clone, tio); ci->sector += len; @@ -653,7 +742,8 @@ static void __clone_and_map(struct clone_info *ci) len = min(remaining, max); clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len); + bv->bv_offset + offset, len, + ci->md->bs); __map_bio(ti, clone, tio); @@ -896,6 +986,7 @@ static struct mapped_device *alloc_dev(int minor) memset(md, 0, sizeof(*md)); init_rwsem(&md->io_lock); init_MUTEX(&md->suspend_lock); + spin_lock_init(&md->pushback_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -903,7 +994,7 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad1; + goto bad1_free_minor; md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; @@ -914,13 +1005,17 @@ static struct mapped_device *alloc_dev(int minor) md->queue->issue_flush_fn = dm_flush_all; md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); - if (!md->io_pool) - goto bad2; + if (!md->io_pool) + goto bad2; md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); if (!md->tio_pool) goto bad3; + md->bs = bioset_create(16, 16, 4); + if (!md->bs) + goto bad_no_bioset; + md->disk = alloc_disk(1); if (!md->disk) goto bad4; @@ -948,11 +1043,14 @@ static struct mapped_device *alloc_dev(int minor) return md; bad4: + bioset_free(md->bs); + bad_no_bioset: mempool_destroy(md->tio_pool); bad3: mempool_destroy(md->io_pool); bad2: blk_cleanup_queue(md->queue); + bad1_free_minor: free_minor(minor); bad1: module_put(THIS_MODULE); @@ -971,6 +1069,7 @@ static void free_dev(struct mapped_device *md) } 
mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); + bioset_free(md->bs); del_gendisk(md->disk); free_minor(minor); @@ -1215,20 +1314,30 @@ static void unlock_fs(struct mapped_device *md) * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ -int dm_suspend(struct mapped_device *md, int do_lockfs) +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; + unsigned long flags; DECLARE_WAITQUEUE(wait, current); struct bio *def; int r = -EINVAL; + int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; + int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; down(&md->suspend_lock); if (dm_suspended(md)) - goto out; + goto out_unlock; map = dm_get_table(md); + /* + * DMF_NOFLUSH_SUSPENDING must be set before presuspend. + * This flag is cleared before dm_suspend returns. + */ + if (noflush) + set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); @@ -1236,11 +1345,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) if (!md->suspended_bdev) { DMWARN("bdget failed in dm_suspend"); r = -ENOMEM; - goto out; + goto flush_and_out; } - /* Flush I/O to the device. */ - if (do_lockfs) { + /* + * Flush I/O to the device. + * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. + */ + if (do_lockfs && !noflush) { r = lock_fs(md); if (r) goto out; @@ -1276,6 +1388,14 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) down_write(&md->io_lock); remove_wait_queue(&md->wait, &wait); + if (noflush) { + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); + } + /* were we interrupted ? */ r = -EINTR; if (atomic_read(&md->pending)) { @@ -1284,7 +1404,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) __flush_deferred_io(md, def); up_write(&md->io_lock); unlock_fs(md); - goto out; + goto out; /* pushback list is already flushed, so skip flush */ } up_write(&md->io_lock); @@ -1294,6 +1414,25 @@ int dm_suspend(struct mapped_device *md, int do_lockfs) r = 0; +flush_and_out: + if (r && noflush) { + /* + * Because there may be already I/Os in the pushback list, + * flush them before return. 
+ */ + down_write(&md->io_lock); + + spin_lock_irqsave(&md->pushback_lock, flags); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + bio_list_merge_head(&md->deferred, &md->pushback); + bio_list_init(&md->pushback); + spin_unlock_irqrestore(&md->pushback_lock, flags); + + def = bio_list_get(&md->deferred); + __flush_deferred_io(md, def); + up_write(&md->io_lock); + } + out: if (r && md->suspended_bdev) { bdput(md->suspended_bdev); @@ -1301,6 +1440,8 @@ out: } dm_table_put(map); + +out_unlock: up(&md->suspend_lock); return r; } @@ -1319,7 +1460,9 @@ int dm_resume(struct mapped_device *md) if (!map || !dm_table_get_size(map)) goto out; - dm_table_resume_targets(map); + r = dm_table_resume_targets(map); + if (r) + goto out; down_write(&md->io_lock); clear_bit(DMF_BLOCK_IO, &md->flags); @@ -1337,6 +1480,8 @@ int dm_resume(struct mapped_device *md) dm_table_unplug_all(map); + kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); + r = 0; out: @@ -1374,9 +1519,21 @@ int dm_suspended(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_noflush_suspending(struct dm_target *ti) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + int r = __noflush_suspending(md); + + dm_put(md); + + return r; +} +EXPORT_SYMBOL_GPL(dm_noflush_suspending); + static struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, + .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .owner = THIS_MODULE }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 3c03c0ecab7e..2f796b1436b2 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -21,6 +21,11 @@ #define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) +#ifdef CONFIG_DM_DEBUG +# define DMDEBUG(f, arg...) printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg) +#else +# define DMDEBUG(f, arg...) do {} while (0) +#endif #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) @@ -28,6 +33,25 @@ #define SECTOR_SHIFT 9 /* + * Definitions of return values from target end_io function. + */ +#define DM_ENDIO_INCOMPLETE 1 +#define DM_ENDIO_REQUEUE 2 + +/* + * Definitions of return values from target map function. + */ +#define DM_MAPIO_SUBMITTED 0 +#define DM_MAPIO_REMAPPED 1 +#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE + +/* + * Suspend feature flags + */ +#define DM_SUSPEND_LOCKFS_FLAG (1 << 0) +#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) + +/* * List of devices that a metadevice uses and should open/close. 
*/ struct dm_dev { @@ -52,7 +76,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); -void dm_table_resume_targets(struct dm_table *t); +int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); void dm_table_unplug_all(struct dm_table *t); int dm_table_flush_all(struct dm_table *t); diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index f1db6eff4857..b46f6c575f7e 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -203,7 +203,7 @@ struct kcopyd_job { /* FIXME: this should scale with the number of pages */ #define MIN_JOBS 512 -static kmem_cache_t *_job_cache; +static struct kmem_cache *_job_cache; static mempool_t *_job_pool; /* @@ -417,7 +417,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) /* * kcopyd does this every time it's woken up. */ -static void do_work(void *ignored) +static void do_work(struct work_struct *ignored) { /* * The order that these are called is *very* important. @@ -628,7 +628,7 @@ static int kcopyd_init(void) } kcopyd_clients++; - INIT_WORK(&_kcopyd_work, do_work, NULL); + INIT_WORK(&_kcopyd_work, do_work); mutex_unlock(&kcopyd_init_lock); return 0; } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b99c19c7eb22..c625ddb8833d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -111,6 +111,19 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int linear_congested(void *data, int bits) +{ + mddev_t *mddev = data; + linear_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(conf->disks[i].rdev->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; @@ -269,6 +282,8 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; mddev->queue->issue_flush_fn = linear_issue_flush; + mddev->queue->backing_dev_info.congested_fn = linear_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8dbab2ef3885..21e2a7b08841 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -39,10 +39,10 @@ #include <linux/raid/bitmap.h> #include <linux/sysctl.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ -#include <linux/suspend.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/ctype.h> +#include <linux/freezer.h> #include <linux/init.h> @@ -389,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error) if (bio->bi_size) return 1; - if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk("md: super_written gets error=%d, uptodate=%d\n", + error, test_bit(BIO_UPTODATE, &bio->bi_flags)); + WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); md_error(mddev, rdev); + } if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); @@ -970,12 +974,13 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) * version 1 superblock */ -static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +static __le32 
calc_sb_1_csum(struct mdp_superblock_1 * sb) { - unsigned int disk_csum, csum; + __le32 disk_csum; + u32 csum; unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; - unsigned int *isuper = (unsigned int*)sb; + __le32 *isuper = (__le32*)sb; int i; disk_csum = sb->sb_csum; @@ -985,7 +990,7 @@ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) newcsum += le32_to_cpu(*isuper++); if (size == 2) - newcsum += le16_to_cpu(*(unsigned short*) isuper); + newcsum += le16_to_cpu(*(__le16*) isuper); csum = (newcsum & 0xffffffff) + (newcsum >> 32); sb->sb_csum = disk_csum; @@ -1102,7 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (le32_to_cpu(sb->chunksize)) rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); - if (le32_to_cpu(sb->size) > rdev->size*2) + if (le64_to_cpu(sb->size) > rdev->size*2) return -EINVAL; return ret; } @@ -1224,7 +1229,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); - sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); + sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->size<<1); @@ -1408,7 +1413,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); @@ -1418,7 +1423,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) if (err) { printk(KERN_ERR "md: could not bd_claim %s.\n", bdevname(bdev, b)); - blkdev_put_partition(bdev); + blkdev_put(bdev); return err; } rdev->bdev = bdev; @@ -1432,7 +1437,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) if (!bdev) MD_BUG(); bd_release(bdev); - blkdev_put_partition(bdev); + blkdev_put(bdev); } void md_autodetect_dev(dev_t dev); @@ -1587,7 +1592,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) } } -void md_update_sb(mddev_t * mddev) +static void md_update_sb(mddev_t * mddev, int force_change) { int err; struct list_head *tmp; @@ -1598,7 +1603,18 @@ void md_update_sb(mddev_t * mddev) repeat: spin_lock_irq(&mddev->write_lock); - if (mddev->degraded && mddev->sb_dirty == 3) + set_bit(MD_CHANGE_PENDING, &mddev->flags); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + if (force_change) + nospares = 0; + if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array @@ -1608,20 +1624,14 @@ repeat: * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then. 
*/ - mddev->sb_dirty = 1; + nospares = 0; sync_req = mddev->in_sync; mddev->utime = get_seconds(); - if (mddev->sb_dirty == 3) - /* just a clean<-> dirty transition, possibly leave spares alone, - * though if events isn't the right even/odd, we will have to do - * spares after all - */ - nospares = 1; /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ - if (mddev->sb_dirty == 3 + if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) && (mddev->events & 1)) mddev->events--; @@ -1652,7 +1662,6 @@ repeat: MD_BUG(); mddev->events --; } - mddev->sb_dirty = 2; sync_sbs(mddev, nospares); /* @@ -1660,7 +1669,7 @@ repeat: * nonpersistent superblocks */ if (!mddev->persistent) { - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; @@ -1697,20 +1706,20 @@ repeat: break; } md_super_wait(mddev); - /* if there was a failure, sb_dirty was set to 1, and we re-write super */ + /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { + if (mddev->in_sync != sync_req || + test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ spin_unlock_irq(&mddev->write_lock); goto repeat; } - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); } -EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or my not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -1783,7 +1792,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) else { mddev_t *mddev = rdev->mddev; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); err = 0; } @@ -1994,6 +2003,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; rdev->sb_events = 0; @@ -2426,7 +2436,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) spin_lock_irq(&mddev->write_lock); if (atomic_read(&mddev->writes_pending) == 0) { mddev->in_sync = 1; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } spin_unlock_irq(&mddev->write_lock); } else { @@ -2438,7 +2448,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - mddev->sb_dirty = 0; + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2520,6 +2530,36 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t +bitmap_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *end; + unsigned long chunk, end_chunk; + + if (!mddev->bitmap) + goto out; + /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ + while (*buf) { + chunk = end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + if (*end == '-') { /* range */ + buf = end + 1; + end_chunk = simple_strtoul(buf, &end, 0); + if (buf == end) break; + } + if (*end && !isspace(*end)) break; + bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); + buf = end; + while (isspace(*buf)) buf++; + } + bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ +out: + return len; +} + +static struct md_sysfs_entry md_bitmap = +__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); + +static ssize_t size_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->size); @@ -2543,7 +2583,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { err = update_size(mddev, size); - md_update_sb(mddev); + md_update_sb(mddev, 1); } else { if (mddev->size == 0 || mddev->size > size) @@ -2839,6 +2879,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_completed.attr, &md_suspend_lo.attr, &md_suspend_hi.attr, + &md_bitmap.attr, NULL, }; static struct attribute_group md_redundancy_group = { @@ -3111,8 +3152,8 @@ static int do_md_run(mddev_t * mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); set_capacity(disk, mddev->array_size<<1); @@ -3159,6 +3200,7 @@ static int do_md_run(mddev_t * mddev) mddev->changed = 1; md_new_event(mddev); + kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); return 0; } @@ -3272,13 +3314,17 @@ static int do_md_stop(mddev_t * mddev, int mode) module_put(mddev->pers->owner); mddev->pers = NULL; + + set_capacity(disk, 0); + mddev->changed = 1; + if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync || mddev->sb_dirty) { + if (!mddev->in_sync || mddev->flags) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; - md_update_sb(mddev); + md_update_sb(mddev, 1); } if (mode == 1) set_disk_ro(disk, 1); @@ -3291,7 +3337,7 @@ static int do_md_stop(mddev_t * mddev, int mode) if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; - struct gendisk *disk; + printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -3316,10 +3362,6 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->raid_disks = 0; mddev->recovery_cp = 0; - disk = mddev->gendisk; - if (disk) - set_capacity(disk, 0); - mddev->changed = 1; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); @@ -3329,6 +3371,7 @@ out: return err; } +#ifndef MODULE static void autorun_array(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -3374,6 +3417,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { + int unit; dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, @@ -3393,16 +3437,19 @@ static void autorun_devices(int part) * mostly sane superblocks. It's time to allocate the * mddev. 
*/ - if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { + if (part) { + dev = MKDEV(mdp_major, + rdev0->preferred_minor << MdpMinorShift); + unit = MINOR(dev) >> MdpMinorShift; + } else { + dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); + unit = MINOR(dev); + } + if (rdev0->preferred_minor != unit) { printk(KERN_INFO "md: unit number in %s is bad: %d\n", bdevname(rdev0->bdev, b), rdev0->preferred_minor); break; } - if (part) - dev = MKDEV(mdp_major, - rdev0->preferred_minor << MdpMinorShift); - else - dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); md_probe(dev, NULL, NULL); mddev = mddev_find(dev); @@ -3439,67 +3486,7 @@ static void autorun_devices(int part) } printk(KERN_INFO "md: ... autorun DONE.\n"); } - -/* - * import RAID devices based on one partition - * if possible, the array gets run as well. - */ - -static int autostart_array(dev_t startdev) -{ - char b[BDEVNAME_SIZE]; - int err = -EINVAL, i; - mdp_super_t *sb = NULL; - mdk_rdev_t *start_rdev = NULL, *rdev; - - start_rdev = md_import_device(startdev, 0, 0); - if (IS_ERR(start_rdev)) - return err; - - - /* NOTE: this can only work for 0.90.0 superblocks */ - sb = (mdp_super_t*)page_address(start_rdev->sb_page); - if (sb->major_version != 0 || - sb->minor_version != 90 ) { - printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); - export_rdev(start_rdev); - return err; - } - - if (test_bit(Faulty, &start_rdev->flags)) { - printk(KERN_WARNING - "md: can not autostart based on faulty %s!\n", - bdevname(start_rdev->bdev,b)); - export_rdev(start_rdev); - return err; - } - list_add(&start_rdev->same_set, &pending_raid_disks); - - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc = sb->disks + i; - dev_t dev = MKDEV(desc->major, desc->minor); - - if (!dev) - continue; - if (dev == startdev) - continue; - if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) - continue; - rdev = md_import_device(dev, 0, 0); - if (IS_ERR(rdev)) - continue; - - list_add(&rdev->same_set, &pending_raid_disks); - } - - /* - * possibly return codes - */ - autorun_devices(0); - return 0; - -} - +#endif /* !MODULE */ static int get_version(void __user * arg) { @@ -3737,6 +3724,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); + md_update_sb(mddev, 1); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return err; @@ -3808,7 +3796,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) goto busy; kick_rdev_from_array(rdev); - md_update_sb(mddev); + md_update_sb(mddev, 1); md_new_event(mddev); return 0; @@ -3867,6 +3855,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) goto abort_export; @@ -3885,7 +3874,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev); + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the @@ -4016,7 +4005,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->max_disks = MD_SB_DISKS; - mddev->sb_dirty = 1; + mddev->flags = 0; + set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; @@ -4059,11 +4049,8 @@ static int update_size(mddev_t *mddev, unsigned long size) return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - if (rdev->sb_offset > rdev->data_offset) - avail = (rdev->sb_offset*2) - rdev->data_offset; - else 
- avail = get_capacity(rdev->bdev->bd_disk) - - rdev->data_offset; + avail = rdev->size * 2; + if (fit && (size == 0 || size > avail/2)) size = avail/2; if (avail < ((sector_t)size << 1)) @@ -4185,7 +4172,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_offset = 0; } } - md_update_sb(mddev); + md_update_sb(mddev, 1); return rv; } @@ -4259,27 +4246,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto abort; } - - if (cmd == START_ARRAY) { - /* START_ARRAY doesn't need to lock the array as autostart_array - * does the locking, and it could even be a different array - */ - static int cnt = 3; - if (cnt > 0 ) { - printk(KERN_WARNING - "md: %s(pid %d) used deprecated START_ARRAY ioctl. " - "This will not be supported beyond July 2006\n", - current->comm, current->pid); - cnt--; - } - err = autostart_array(new_decode_dev(arg)); - if (err) { - printk(KERN_WARNING "md: autostart failed!\n"); - goto abort; - } - goto done; - } - err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -4460,7 +4426,7 @@ static int md_open(struct inode *inode, struct file *file) mddev_t *mddev = inode->i_bdev->bd_disk->private_data; int err; - if ((err = mddev_lock(mddev))) + if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) goto out; err = 0; @@ -4476,8 +4442,7 @@ static int md_release(struct inode *inode, struct file * file) { mddev_t *mddev = inode->i_bdev->bd_disk->private_data; - if (!mddev) - BUG(); + BUG_ON(!mddev); mddev_put(mddev); return 0; @@ -4524,6 +4489,7 @@ static int md_thread(void * arg) * many dirty RAID5 blocks. */ + current->flags |= PF_NOFREEZE; allow_signal(SIGKILL); while (!kthread_should_stop()) { @@ -4540,7 +4506,6 @@ static int md_thread(void * arg) test_bit(THREAD_WAKEUP, &thread->flags) || kthread_should_stop(), thread->timeout); - try_to_freeze(); clear_bit(THREAD_WAKEUP, &thread->flags); @@ -4687,9 +4652,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" : - (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery")), - per_milli/10, per_milli % 10, + (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? + "check" : + (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? + "resync" : "recovery"))), + per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -4882,8 +4849,8 @@ static int md_seq_show(struct seq_file *seq, void *v) chunk_kb ? 
"KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); - seq_path(seq, bitmap->file->f_vfsmnt, - bitmap->file->f_dentry," \t\n"); + seq_path(seq, bitmap->file->f_path.mnt, + bitmap->file->f_path.dentry," \t\n"); } seq_printf(seq, "\n"); @@ -4948,6 +4915,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait) } static struct file_operations md_seq_fops = { + .owner = THIS_MODULE, .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -5042,12 +5010,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); } - wait_event(mddev->sb_wait, mddev->sb_dirty==0); + wait_event(mddev->sb_wait, mddev->flags==0); } void md_write_end(mddev_t *mddev) @@ -5078,6 +5046,7 @@ void md_do_sync(mddev_t *mddev) int skipped = 0; struct list_head *rtmp; mdk_rdev_t *rdev; + char *desc; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -5085,6 +5054,18 @@ void md_do_sync(mddev_t *mddev) if (mddev->ro) /* never try to sync a read-only array */ return; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + desc = "data-check"; + else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + desc = "requested-resync"; + else + desc = "resync"; + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + desc = "reshape"; + else + desc = "recovery"; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -5128,10 +5109,10 @@ void md_do_sync(mddev_t *mddev) prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying resync of %s" - " until %s has finished resync (they" + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" " share one or more physical units)\n", - mdname(mddev), mdname(mddev2)); + desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); schedule(); finish_wait(&resync_wait, &wq); @@ -5167,12 +5148,12 @@ void md_do_sync(mddev_t *mddev) j = rdev->recovery_offset; } - printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" - " %d KB/sec/disc.\n", speed_min(mddev)); + printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ speed:" + " %d KB/sec/disk.\n", speed_min(mddev)); printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for reconstruction.\n", - speed_max(mddev)); + "(but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); is_mddev_idle(mddev); /* this also initializes IO event counters */ @@ -5198,8 +5179,8 @@ void md_do_sync(mddev_t *mddev) if (j>2) { printk(KERN_INFO - "md: resuming recovery of %s from checkpoint.\n", - mdname(mddev)); + "md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); mddev->curr_resync = j; } @@ -5282,7 +5263,7 @@ void md_do_sync(mddev_t *mddev) } } } - printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); + printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); /* * this also signals 'finished resyncing' to md_stop */ @@ -5295,15 +5276,14 @@ void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, 
&skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && - test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { printk(KERN_INFO - "md: checkpointing recovery of %s.\n", - mdname(mddev)); + "md: checkpointing %s of %s.\n", + desc, mdname(mddev)); mddev->recovery_cp = mddev->curr_resync; } } else @@ -5317,9 +5297,9 @@ void md_do_sync(mddev_t *mddev) !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; - mddev->sb_dirty = 1; } } + set_bit(MD_CHANGE_DEVS, &mddev->flags); skip: mddev->curr_resync = 0; @@ -5374,7 +5354,7 @@ void md_check_recovery(mddev_t *mddev) } if ( ! ( - mddev->sb_dirty || + mddev->flags || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->safemode == 1) || @@ -5390,14 +5370,14 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - mddev->sb_dirty = 3; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->flags) + md_update_sb(mddev, 0); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && @@ -5416,7 +5396,7 @@ void md_check_recovery(mddev_t *mddev) /* activate any spares */ mddev->pers->spare_active(mddev); } - md_update_sb(mddev); + md_update_sb(mddev, 1); /* if array is no-longer degraded, then any saved_raid_disk * information must be scrapped @@ -5556,22 +5536,15 @@ static void md_geninit(void) static int __init md_init(void) { - printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," - " MD_SB_DISKS=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, - MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); - printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, - BITMAP_MINOR); - if (register_blkdev(MAJOR_NR, "md")) return -1; if ((mdp_major=register_blkdev(0, "mdp"))<=0) { unregister_blkdev(MAJOR_NR, "md"); return -1; } - blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, + blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + md_probe, NULL, NULL); + blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); register_reboot_notifier(&md_notifier); @@ -5623,15 +5596,15 @@ static void autostart_arrays(int part) autorun_devices(part); } -#endif +#endif /* !MODULE */ static __exit void md_exit(void) { mddev_t *mddev; struct list_head *tmp; - blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); - blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); + blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); unregister_blkdev(MAJOR_NR,"md"); unregister_blkdev(mdp_major, "mdp"); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 1cc9de44ce86..14da37fee37b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -228,6 +228,28 @@ static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, rcu_read_unlock(); return ret; } +static int multipath_congested(void *data, int 
bits) +{ + mddev_t *mddev = data; + multipath_conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks ; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + /* Just like multipath_map, we just check the + * first available device + */ + break; + } + } + rcu_read_unlock(); + return ret; +} /* * Careful, this can execute in IRQ contexts as well! @@ -253,8 +275,9 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) char b[BDEVNAME_SIZE]; clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); conf->working_disks--; + mddev->degraded++; printk(KERN_ALERT "multipath: IO failure on %s," " disabling IO path. \n Operation continuing" " on %d IO paths.\n", @@ -314,6 +337,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; + mddev->degraded--; rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); @@ -470,7 +494,6 @@ static int multipath_run (mddev_t *mddev) } conf->raid_disks = mddev->raid_disks; - mddev->sb_dirty = 1; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -480,7 +503,7 @@ static int multipath_run (mddev_t *mddev) mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks = conf->working_disks; + mddev->degraded = conf->raid_disks - conf->working_disks; conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); @@ -510,6 +533,8 @@ static int multipath_run (mddev_t *mddev) mddev->queue->unplug_fn = multipath_unplug; mddev->queue->issue_flush_fn = multipath_issue_flush; + mddev->queue->backing_dev_info.congested_fn = multipath_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cb8c6317e4e5..dfe32149ad3a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -60,6 +60,21 @@ static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid0_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t **devlist = conf->strip_zone[0].dev; + int i, ret = 0; + + for (i = 0; i < mddev->raid_disks && !ret ; i++) { + request_queue_t *q = bdev_get_queue(devlist[i]->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + return ret; +} + static int create_strip_zones (mddev_t *mddev) { @@ -236,6 +251,8 @@ static int create_strip_zones (mddev_t *mddev) mddev->queue->unplug_fn = raid0_unplug; mddev->queue->issue_flush_fn = raid0_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid0_congested; + mddev->queue->backing_dev_info.congested_data = mddev; printk("raid0: done.\n"); return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3b4d69c05623..b3c5e12f081d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { + if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher 
@@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid1_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<<BDI_write_congested)) || 1) + ret |= bdi_congested(&q->backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -929,7 +955,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -953,26 +979,27 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -984,7 +1011,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1023,10 +1050,11 @@ static int raid1_spare_active(mddev_t *mddev) mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1368,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. 
*/ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... + */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; @@ -1460,86 +1577,14 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... 
- */ - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { - atomic_add(s, &rdev->corrected_errors); - printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b)); - } - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1884,15 +1929,11 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1900,11 +1941,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1915,8 +1951,16 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + conf->fullsync = 1; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1948,6 +1992,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->issue_flush_fn = raid1_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; @@ -2035,7 +2081,7 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - + unsigned long flags; int d, d2; /* Cannot change chunk_size, layout, or level */ @@ -2094,7 +2140,9 
@@ static int raid1_reshape(mddev_t *mddev) kfree(conf->poolinfo); conf->poolinfo = newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 016ddb831c9b..7492d6033ac6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -648,6 +648,26 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid10_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks && ret == 0; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -921,7 +941,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, " %d far-copies", conf->far_copies); } seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && @@ -941,7 +961,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. * The test should really be more sophisticated than @@ -950,20 +970,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -976,7 +997,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -1034,10 +1055,11 @@ static int raid10_spare_active(mddev_t *mddev) tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1350,9 +1372,119 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) * * 1. 
Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ +static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + mdk_rdev_t*rdev; + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + int d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + md_error(mddev, conf->mirrors[dn].rdev); + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + sl = start; + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else + printk(KERN_INFO + "raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; @@ -1413,105 +1545,12 @@ static void raid10d(mddev_t *mddev) * This is all done synchronously while the array is * frozen. 
*/ - int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int sl = r10_bio->read_slot; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rcu_read_lock(); - do { - int d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - success = sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - if (success) - break; - } - sl++; - if (sl == conf->copies) - sl = 0; - } while (!success && sl != r10_bio->read_slot); - rcu_read_unlock(); - - if (success) { - int start = sl; - /* write it back and re-read */ - rcu_read_lock(); - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - sl = start; - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else - printk(KERN_INFO "raid10:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect+rdev->data_offset), bdevname(rdev->bdev, b)); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = mddev->ro ? IO_BLOCKED : NULL; @@ -2018,8 +2057,6 @@ static int run(mddev_t *mddev) mddev->queue->max_sectors = (PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2042,7 +2079,7 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; if (!disk->rdev || - !test_bit(In_sync, &rdev->flags)) { + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; } @@ -2077,6 +2114,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid10_unplug; mddev->queue->issue_flush_fn = raid10_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... 
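Note on the raid1/raid10 hunks above: the inline read-error correction loops are pulled out of raid1d()/raid10d() into dedicated fix_read_error() helpers. The retry order is: try each remaining in-sync mirror in turn until the sector range can be read, then walk back towards the originally failing disk, rewriting the range and reading it back, and failing any device whose I/O still errors. Below is a minimal userspace sketch of that retry order only; NDEVS, read_dev() and write_dev() are invented stand-ins for the mirror set, and the real helpers additionally split the range into PAGE_SIZE pieces and do the rewrites and verify reads as two separate passes.

/*
 * Minimal userspace sketch (not kernel code) of the retry order used by
 * the new fix_read_error() helpers in raid1.c/raid10.c.
 */
#include <stdio.h>
#include <stdbool.h>

#define NDEVS 3

static int dev0_failed_once;

static bool read_dev(int d)
{
	if (d == 0 && !dev0_failed_once) {	/* device 0 fails its first read only */
		dev0_failed_once = 1;
		return false;
	}
	return true;
}

static bool write_dev(int d)
{
	(void)d;
	return true;				/* writes always succeed in this sketch */
}

int main(void)
{
	int failed = 0;				/* mirror that reported the read error */
	int good = -1;
	int d = failed;

	/* try each mirror in turn, starting with the one that failed */
	do {
		if (read_dev(d)) {
			good = d;
			break;
		}
		d = (d + 1) % NDEVS;
	} while (d != failed);

	if (good < 0) {
		puts("no readable copy, would call md_error() on the array");
		return 1;
	}

	/* walk back towards the failed mirror, rewriting and re-reading */
	d = good;
	while (d != failed) {
		d = (d + NDEVS - 1) % NDEVS;
		if (write_dev(d) && read_dev(d))
			printf("device %d: read error corrected\n", d);
		else
			printf("device %d: still failing, would call md_error()\n", d);
	}
	return 0;
}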
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 450066007160..377f8bc9b78b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -134,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); } } } @@ -348,7 +350,7 @@ static int grow_one_stripe(raid5_conf_t *conf) static int grow_stripes(raid5_conf_t *conf, int num) { - kmem_cache_t *sc; + struct kmem_cache *sc; int devs = conf->raid_disks; sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); @@ -397,7 +399,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) LIST_HEAD(newstripes); struct disk_info *ndisks; int err = 0; - kmem_cache_t *sc; + struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) @@ -542,35 +544,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } if (uptodate) { -#if 0 - struct bio *bio; - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - /* we can return a buffer if we bypassed the cache or - * if the top buffer is not in highmem. If there are - * multiple buffers, leave the extra work to - * handle_stripe - */ - buffer = sh->bh_read[i]; - if (buffer && - (!PageHighMem(buffer->b_page) - || buffer->b_page == bh->b_page ) - ) { - sh->bh_read[i] = buffer->b_reqnext; - buffer->b_reqnext = NULL; - } else - buffer = NULL; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (sh->bh_page[i]==bh->b_page) - set_buffer_uptodate(bh); - if (buffer) { - if (buffer->b_page != bh->b_page) - memcpy(buffer->b_data, bh->b_data, bh->b_size); - buffer->b_end_io(buffer, 1); - } -#else set_bit(R5_UPTODATE, &sh->dev[i].flags); -#endif if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", @@ -616,14 +590,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); -#if 0 - /* must restore b_page before unlocking buffer... 
*/ - if (sh->bh_page[i] != bh->b_page) { - bh->b_page = sh->bh_page[i]; - bh->b_data = page_address(bh->b_page); - clear_buffer_uptodate(bh); - } -#endif clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -636,7 +602,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; - unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -654,7 +619,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, return 0; } - spin_lock_irqsave(&conf->device_lock, flags); if (!uptodate) md_error(conf->mddev, conf->disks[i].rdev); @@ -662,8 +626,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + release_stripe(sh); return 0; } @@ -696,12 +659,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) PRINTK("raid5: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { - mddev->sb_dirty = 1; - if (test_bit(In_sync, &rdev->flags)) { - conf->working_disks--; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->failed_disks++; - clear_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery was running, make sure it aborts. */ @@ -711,7 +674,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) printk (KERN_ALERT "raid5: Disk failure on %s, disabling device." " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -824,7 +787,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, static sector_t compute_blocknr(struct stripe_head *sh, int i) { raid5_conf_t *conf = sh->raid_conf; - int raid_disks = sh->disks, data_disks = raid_disks - 1; + int raid_disks = sh->disks; + int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; @@ -860,7 +824,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) } break; case 6: - data_disks = raid_disks - 2; if (i == raid6_next_disk(sh->pd_idx, raid_disks)) return 0; /* It is the Q disk */ switch (conf->algorithm) { @@ -1108,7 +1071,7 @@ static void compute_parity6(struct stripe_head *sh, int method) if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (sh->dev[i].written) BUG(); + BUG_ON(sh->dev[i].written); sh->dev[i].written = chosen; } break; @@ -1353,12 +1316,13 @@ static int page_is_zero(struct page *p) static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x = stripe; int pd_idx, dd_idx; - int chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; - raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk - + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); + int chunk_offset = sector_div(stripe, sectors_per_chunk); + + raid5_compute_sector(stripe * (disks - conf->max_degraded) + *sectors_per_chunk + chunk_offset, + disks, disks - conf->max_degraded, + &dd_idx, &pd_idx, conf); return pd_idx; } 
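Note on the surrounding raid5.c hunks: compute_blocknr() and stripe_to_pdidx() now derive the number of data disks from conf->max_degraded instead of hard-coding raid_disks - 1 (with a separate raid_disks - 2 case for RAID-6), so one formula serves both levels. The sketch below shows the sector decomposition this relies on, with made-up array parameters; it mirrors the data_disks = raid_disks - max_degraded arithmetic but leaves out the parity-layout (algorithm) adjustment that raid5_compute_sector() applies afterwards.

/*
 * Sketch of the geometry arithmetic unified by the max_degraded change.
 * max_degraded is 1 for RAID-5 and 2 for RAID-6; the sample values are
 * invented, not taken from the patch.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int raid_disks = 6, max_degraded = 2;	/* a RAID-6 example */
	unsigned int data_disks = raid_disks - max_degraded;
	unsigned int sectors_per_chunk = 64 * 2;	/* 64KiB chunks, in 512B sectors */
	uint64_t r_sector = 123456;			/* array-relative sector */

	/* which chunk along the virtual device, and the offset inside it */
	uint64_t chunk_number = r_sector / sectors_per_chunk;
	unsigned int chunk_offset = r_sector % sectors_per_chunk;

	/* which data disk inside the stripe, and which stripe */
	unsigned int dd_idx = chunk_number % data_disks;
	uint64_t stripe = chunk_number / data_disks;

	printf("sector %llu -> stripe %llu, data disk %u, offset %u\n",
	       (unsigned long long)r_sector, (unsigned long long)stripe,
	       dd_idx, chunk_offset);
	return 0;
}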
@@ -1619,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -1645,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i]!=bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags) @@ -1659,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh) /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 -|| sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -1869,7 +1818,9 @@ static void handle_stripe5(struct stripe_head *sh) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2197,15 +2148,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); @@ -2224,9 +2166,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 - || sh->bh_page[i] != bh->b_page -#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -2422,7 +2361,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } for (i=disks; i-- ;) { int rw; @@ -2597,6 +2538,198 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid5_congested(void *data, int bits) +{ + mddev_t *mddev = data; + raid5_conf_t *conf = mddev_to_conf(mddev); + + /* No difference between reads and writes. Just check + * how busy the stripe_cache is + */ + if (conf->inactive_blocked) + return 1; + if (conf->quiesce) + return 1; + if (list_empty_careful(&conf->inactive_list)) + return 1; + + return 0; +} + +/* We want read requests to align with chunks where possible, + * but write requests don't need to. 
+ */ +static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec) +{ + mddev_t *mddev = q->queuedata; + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + int max; + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + if (bio_data_dir(bio)) + return biovec->bv_len; /* always allow writes to be mergeable */ + + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; + if (max < 0) max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + else + return max; +} + + +static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +{ + sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int bio_sectors = bio->bi_size >> 9; + + return chunk_sectors >= + ((sector & (chunk_sectors - 1)) + bio_sectors); +} + +/* + * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) + * later sampled by raid5d. + */ +static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +{ + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + bi->bi_next = conf->retry_read_aligned_list; + conf->retry_read_aligned_list = bi; + + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(conf->mddev->thread); +} + + +static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +{ + struct bio *bi; + + bi = conf->retry_read_aligned; + if (bi) { + conf->retry_read_aligned = NULL; + return bi; + } + bi = conf->retry_read_aligned_list; + if(bi) { + conf->retry_read_aligned = bi->bi_next; + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* biased count of active stripes */ + bi->bi_hw_segments = 0; /* count of processed stripes */ + } + + return bi; +} + + +/* + * The "raid5_align_endio" should check if the read succeeded and if it + * did, call bio_endio on the original bio (having bio_put the new bio + * first). + * If the read failed.. + */ +static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) +{ + struct bio* raid_bi = bi->bi_private; + mddev_t *mddev; + raid5_conf_t *conf; + int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + mdk_rdev_t *rdev; + + if (bi->bi_size) + return 1; + bio_put(bi); + + mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; + conf = mddev_to_conf(mddev); + rdev = (void*)raid_bi->bi_next; + raid_bi->bi_next = NULL; + + rdev_dec_pending(rdev, conf->mddev); + + if (!error && uptodate) { + bio_endio(raid_bi, bytes, 0); + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return 0; + } + + + PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); + + add_bio_to_retry(raid_bi, conf); + return 0; +} + +static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) +{ + mddev_t *mddev = q->queuedata; + raid5_conf_t *conf = mddev_to_conf(mddev); + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - conf->max_degraded; + unsigned int dd_idx, pd_idx; + struct bio* align_bi; + mdk_rdev_t *rdev; + + if (!in_chunk_boundary(mddev, raid_bio)) { + printk("chunk_aligned_read : non aligned\n"); + return 0; + } + /* + * use bio_clone to make a copy of the bio + */ + align_bi = bio_clone(raid_bio, GFP_NOIO); + if (!align_bi) + return 0; + /* + * set bi_end_io to a new function, and set bi_private to the + * original bio. 
+ */ + align_bi->bi_end_io = raid5_align_endio; + align_bi->bi_private = raid_bio; + /* + * compute position + */ + align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, + raid_disks, + data_disks, + &dd_idx, + &pd_idx, + conf); + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + raid_bio->bi_next = (void*)rdev; + align_bi->bi_bdev = rdev->bdev; + align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + align_bi->bi_sector += rdev->data_offset; + + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + + generic_make_request(align_bi); + return 1; + } else { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } +} + + static int make_request(request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -2618,6 +2751,11 @@ static int make_request(request_queue_t *q, struct bio * bi) disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); + if (bio_data_dir(bi) == READ && + mddev->reshape_position == MaxSector && + chunk_aligned_read(q,bi)) + return 0; + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -2725,7 +2863,9 @@ static int make_request(request_queue_t *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 0 : -EIO); } return 0; } @@ -2781,9 +2921,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); mddev->reshape_position = conf->expand_progress; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + wait_event(mddev->sb_wait, mddev->flags == 0 || kthread_should_stop()); spin_lock_irq(&conf->device_lock); conf->expand_lo = mddev->reshape_position; @@ -2936,6 +3076,74 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return STRIPE_SECTORS; } +static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +{ + /* We may not be able to submit a whole bio at once as there + * may not be enough stripe_heads available. + * We cannot pre-allocate enough stripe_heads as we may need + * more than exist in the cache (if we allow ever large chunks). + * So we do one stripe head at a time and record in + * ->bi_hw_segments how many have been done. + * + * We *know* that this entire raid_bio is in one chunk, so + * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
+ */ + struct stripe_head *sh; + int dd_idx, pd_idx; + sector_t sector, logical_sector, last_sector; + int scnt = 0; + int remaining; + int handled = 0; + + logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector( logical_sector, + conf->raid_disks, + conf->raid_disks - conf->max_degraded, + &dd_idx, + &pd_idx, + conf); + last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS, scnt++) { + + if (scnt < raid_bio->bi_hw_segments) + /* already done this stripe */ + continue; + + sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + + if (!sh) { + /* failed to get a stripe - must wait */ + raid_bio->bi_hw_segments = scnt; + conf->retry_read_aligned = raid_bio; + return handled; + } + + set_bit(R5_ReadError, &sh->dev[dd_idx].flags); + add_stripe_bio(sh, raid_bio, dd_idx, 0); + handle_stripe(sh, NULL); + release_stripe(sh); + handled++; + } + spin_lock_irq(&conf->device_lock); + remaining = --raid_bio->bi_phys_segments; + spin_unlock_irq(&conf->device_lock); + if (remaining == 0) { + int bytes = raid_bio->bi_size; + + raid_bio->bi_size = 0; + raid_bio->bi_end_io(raid_bio, bytes, + test_bit(BIO_UPTODATE, &raid_bio->bi_flags) + ? 0 : -EIO); + } + if (atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_stripe); + return handled; +} + + + /* * This is our raid5 kernel thread. * @@ -2957,6 +3165,7 @@ static void raid5d (mddev_t *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct list_head *first; + struct bio *bio; if (conf->seq_flush != conf->seq_write) { int seq = conf->seq_flush; @@ -2973,6 +3182,16 @@ static void raid5d (mddev_t *mddev) !list_empty(&conf->delayed_list)) raid5_activate_delayed(conf); + while ((bio = remove_bio_from_retry(conf))) { + int ok; + spin_unlock_irq(&conf->device_lock); + ok = retry_aligned_read(conf, bio); + spin_lock_irq(&conf->device_lock); + if (!ok) + break; + handled++; + } + if (list_empty(&conf->handle_list)) break; @@ -3074,6 +3293,7 @@ static int run(mddev_t *mddev) mdk_rdev_t *rdev; struct disk_info *disk; struct list_head *tmp; + int working_disks = 0; if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", @@ -3159,6 +3379,7 @@ static int run(mddev_t *mddev) INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); PRINTK("raid5: run(%s) called.\n", mdname(mddev)); @@ -3176,14 +3397,14 @@ static int run(mddev_t *mddev) printk(KERN_INFO "raid5: device %s operational as raid" " disk %d\n", bdevname(rdev->bdev,b), raid_disk); - conf->working_disks++; + working_disks++; } } /* * 0 for a fully functional array, 1 or 2 for a degraded array. 
*/ - mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; conf->mddev = mddev; conf->chunk_size = mddev->chunk_size; conf->level = mddev->level; @@ -3218,7 +3439,7 @@ static int run(mddev_t *mddev) if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", - mdname(mddev), conf->failed_disks, conf->raid_disks); + mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } @@ -3299,9 +3520,14 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + return 0; abort: if (conf) { @@ -3375,7 +3601,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) int i; seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); - seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && @@ -3397,8 +3623,8 @@ static void print_raid5_conf (raid5_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, - conf->working_disks, conf->failed_disks); + printk(" --- rd:%d wd:%d\n", conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; @@ -3420,11 +3646,11 @@ static int raid5_spare_active(mddev_t *mddev) tmp = conf->disks + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - conf->failed_disks--; - conf->working_disks++; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } print_raid5_conf(conf); @@ -3560,6 +3786,7 @@ static int raid5_start_reshape(mddev_t *mddev) struct list_head *rtmp; int spares = 0; int added_devices = 0; + unsigned long flags; if (mddev->degraded || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3593,7 +3820,6 @@ static int raid5_start_reshape(mddev_t *mddev) if (raid5_add_disk(mddev, rdev)) { char nm[20]; set_bit(In_sync, &rdev->flags); - conf->working_disks++; added_devices++; rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); @@ -3602,10 +3828,12 @@ static int raid5_start_reshape(mddev_t *mddev) break; } + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -3639,7 +3867,7 @@ static void end_reshape(raid5_conf_t *conf) bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 
10); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } @@ -3674,7 +3902,8 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_lock_irq(&conf->device_lock); conf->quiesce = 1; wait_event_lock_irq(conf->wait_for_stripe, - atomic_read(&conf->active_stripes) == 0, + atomic_read(&conf->active_stripes) == 0 && + atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); spin_unlock_irq(&conf->device_lock); break; |
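Note on the raid5.c additions above: reads that fit entirely inside one chunk can now bypass the stripe cache. chunk_aligned_read() clones the bio, maps it to the underlying data disk and submits it directly, raid5_mergeable_bvec() prevents read requests from straddling a chunk boundary while still letting writes merge freely, failed aligned reads are queued on a retry LIFO serviced by raid5d(), and raid5_quiesce() now also waits for active_aligned_reads to drain. Below is a minimal userspace sketch of the alignment test behind in_chunk_boundary(), assuming a power-of-two chunk size; the sample sizes are invented.

/*
 * Sketch of the check behind the new chunk_aligned_read() fast path:
 * the request may bypass the stripe cache only when the offset inside
 * the chunk plus the request length does not spill past the chunk end.
 * chunk_sectors plays the role of mddev->chunk_size >> 9.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool in_chunk_boundary(uint64_t sector, unsigned int nr_sectors,
			      unsigned int chunk_sectors)
{
	/* chunk_sectors must be a power of two for the mask trick */
	return chunk_sectors >= (sector & (chunk_sectors - 1)) + nr_sectors;
}

int main(void)
{
	unsigned int chunk_sectors = 128;	/* 64KiB chunk */

	printf("%d\n", in_chunk_boundary(0, 128, chunk_sectors));	/* 1: whole chunk */
	printf("%d\n", in_chunk_boundary(100, 8, chunk_sectors));	/* 1: inside one chunk */
	printf("%d\n", in_chunk_boundary(120, 16, chunk_sectors));	/* 0: crosses a boundary */
	return 0;
}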