diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 5 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 95 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 61 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 135 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 8 | ||||
-rw-r--r-- | drivers/md/dm.c | 71 | ||||
-rw-r--r-- | drivers/md/dm.h | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 82 | ||||
-rw-r--r-- | drivers/md/md.h | 11 | ||||
-rw-r--r-- | drivers/md/raid1.c | 224 | ||||
-rw-r--r-- | drivers/md/raid1.h | 30 | ||||
-rw-r--r-- | drivers/md/raid10.c | 133 | ||||
-rw-r--r-- | drivers/md/raid10.h | 25 | ||||
-rw-r--r-- | drivers/md/raid5.c | 320 | ||||
-rw-r--r-- | drivers/md/raid5.h | 3 |
17 files changed, 815 insertions, 402 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 1eee45b69b71..d949b781f6f8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -268,13 +268,14 @@ config DM_MIRROR needed for live data migration tools such as 'pvmove'. config DM_RAID - tristate "RAID 1/4/5/6 target" + tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM select MD_RAID1 + select MD_RAID10 select MD_RAID456 select BLK_DEV_MD ---help--- - A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings + A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 15dbe03117e4..94e7f6ba2e11 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1305,7 +1305,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->counts.lock); - io_schedule(); + schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d8abb90a6c2f..034233eefc82 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1555,6 +1555,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { struct multipath *m = ti->private; + struct pgpath *pgpath; struct block_device *bdev; fmode_t mode; unsigned long flags; @@ -1570,12 +1571,14 @@ again: if (!m->current_pgpath) __choose_pgpath(m, 0); - if (m->current_pgpath) { - bdev = m->current_pgpath->path.dev->bdev; - mode = m->current_pgpath->path.dev->mode; + pgpath = m->current_pgpath; + + if (pgpath) { + bdev = pgpath->path.dev->bdev; + mode = pgpath->path.dev->mode; } - if (m->queue_io) + if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) r = -EAGAIN; else if (!bdev) r = -EIO; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f2f29c526544..982e3e390c45 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -11,6 +11,7 @@ #include "md.h" #include "raid1.h" #include "raid5.h" +#include "raid10.h" #include "bitmap.h" #include <linux/device-mapper.h> @@ -52,7 +53,10 @@ struct raid_dev { #define DMPF_MAX_RECOVERY_RATE 0x20 #define DMPF_MAX_WRITE_BEHIND 0x40 #define DMPF_STRIPE_CACHE 0x80 -#define DMPF_REGION_SIZE 0X100 +#define DMPF_REGION_SIZE 0x100 +#define DMPF_RAID10_COPIES 0x200 +#define DMPF_RAID10_FORMAT 0x400 + struct raid_set { struct dm_target *ti; @@ -76,6 +80,7 @@ static struct raid_type { const unsigned algorithm; /* RAID algorithm. */ } raid_types[] = { {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, @@ -86,6 +91,17 @@ static struct raid_type { {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} }; +static unsigned raid10_md_layout_to_copies(int layout) +{ + return layout & 0xFF; +} + +static int raid10_format_to_md_layout(char *format, unsigned copies) +{ + /* 1 "far" copy, and 'copies' "near" copies */ + return (1 << 8) | (copies & 0xFF); +} + static struct raid_type *get_raid_type(char *name) { int i; @@ -339,10 +355,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs * [region_size <sectors>] Defines granularity of bitmap + * + * RAID10-only options: + * [raid10_copies <# copies>] Number of copies. (Default: 2) + * [raid10_format <near>] Layout algorithm. (Default: near) */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) { + char *raid10_format = "near"; + unsigned raid10_copies = 2; unsigned i, rebuild_cnt = 0; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; @@ -416,11 +438,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } key = argv[i++]; + + /* Parameters that take a string value are checked here. */ + if (!strcasecmp(key, "raid10_format")) { + if (rs->raid_type->level != 10) { + rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; + return -EINVAL; + } + if (strcmp("near", argv[i])) { + rs->ti->error = "Invalid 'raid10_format' value given"; + return -EINVAL; + } + raid10_format = argv[i]; + rs->print_flags |= DMPF_RAID10_FORMAT; + continue; + } + if (strict_strtoul(argv[i], 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; } + /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { rebuild_cnt++; @@ -439,6 +478,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } break; + case 10: default: DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); rs->ti->error = "Rebuild not supported for this RAID type"; @@ -495,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv, */ value /= 2; - if (rs->raid_type->level < 5) { + if ((rs->raid_type->level != 5) && + (rs->raid_type->level != 6)) { rs->ti->error = "Inappropriate argument: stripe_cache"; return -EINVAL; } @@ -520,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } else if (!strcasecmp(key, "region_size")) { rs->print_flags |= DMPF_REGION_SIZE; region_size = value; + } else if (!strcasecmp(key, "raid10_copies") && + (rs->raid_type->level == 10)) { + if ((value < 2) || (value > 0xFF)) { + rs->ti->error = "Bad value for 'raid10_copies'"; + return -EINVAL; + } + rs->print_flags |= DMPF_RAID10_COPIES; + raid10_copies = value; } else { DMERR("Unable to parse RAID parameter: %s", key); rs->ti->error = "Unable to parse RAID parameters"; @@ -538,8 +587,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, if (dm_set_target_max_io_len(rs->ti, max_io_len)) return -EINVAL; - if ((rs->raid_type->level > 1) && - sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { + if (rs->raid_type->level == 10) { + if (raid10_copies > rs->md.raid_disks) { + rs->ti->error = "Not enough devices to satisfy specification"; + return -EINVAL; + } + + /* (Len * #mirrors) / #devices */ + sectors_per_dev = rs->ti->len * raid10_copies; + sector_div(sectors_per_dev, rs->md.raid_disks); + + rs->md.layout = raid10_format_to_md_layout(raid10_format, + raid10_copies); + rs->md.new_layout = rs->md.layout; + } else if ((rs->raid_type->level > 1) && + sector_div(sectors_per_dev, + (rs->md.raid_disks - rs->raid_type->parity_devs))) { rs->ti->error = "Target length not divisible by number of data devices"; return -EINVAL; } @@ -566,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) if (rs->raid_type->level == 1) return md_raid1_congested(&rs->md, bits); + if (rs->raid_type->level == 10) + return md_raid10_congested(&rs->md, bits); + return md_raid5_congested(&rs->md, bits); } @@ -884,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 6: redundancy = rs->raid_type->parity_devs; break; + case 10: + redundancy = raid10_md_layout_to_copies(mddev->layout) - 1; + break; default: ti->error = "Unknown RAID type"; return -EINVAL; @@ -1049,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + if (ti->len != rs->md.array_sectors) { + ti->error = "Array size does not match requested target length"; + ret = -EINVAL; + goto size_mismatch; + } rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); mddev_suspend(&rs->md); return 0; +size_mismatch: + md_stop(&rs->md); bad: context_free(rs); @@ -1203,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" region_size %lu", rs->md.bitmap_info.chunksize >> 9); + if (rs->print_flags & DMPF_RAID10_COPIES) + DMEMIT(" raid10_copies %u", + raid10_md_layout_to_copies(rs->md.layout)); + + if (rs->print_flags & DMPF_RAID10_FORMAT) + DMEMIT(" raid10_format near"); + DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { if (rs->dev[i].meta_dev) @@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, @@ -1304,6 +1387,8 @@ module_init(dm_raid_init); module_exit(dm_raid_exit); MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid1"); +MODULE_ALIAS("dm-raid10"); MODULE_ALIAS("dm-raid4"); MODULE_ALIAS("dm-raid5"); MODULE_ALIAS("dm-raid6"); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f90069029aae..100368eb7991 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1212,6 +1212,41 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +static int count_device(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned *num_devices = data; + + (*num_devices)++; + + return 0; +} + +/* + * Check whether a table has no data devices attached using each + * target's iterate_devices method. + * Returns false if the result is unknown because a target doesn't + * support iterate_devices. + */ +bool dm_table_has_no_data_devices(struct dm_table *table) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0, num_devices = 0; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + return false; + + ti->type->iterate_devices(ti, count_device, &num_devices); + if (num_devices) + return false; + } + + return true; +} + /* * Establish the new table's queue_limits and validate them. */ @@ -1354,17 +1389,25 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, return q && blk_queue_nonrot(q); } -static bool dm_table_is_nonrot(struct dm_table *t) +static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !blk_queue_add_random(q); +} + +static bool dm_table_all_devices_attribute(struct dm_table *t, + iterate_devices_callout_fn func) { struct dm_target *ti; unsigned i = 0; - /* Ensure that all underlying device are non-rotational. */ while (i < dm_table_get_num_targets(t)) { ti = dm_table_get_target(t, i++); if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_nonrot, NULL)) + !ti->type->iterate_devices(ti, func, NULL)) return 0; } @@ -1396,7 +1439,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_discard_zeroes_data(t)) q->limits.discard_zeroes_data = 0; - if (dm_table_is_nonrot(t)) + /* Ensure that all underlying devices are non-rotational. */ + if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); else queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); @@ -1404,6 +1448,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, dm_table_set_integrity(t); /* + * Determine whether or not this queue's I/O timings contribute + * to the entropy pool, Only request-based targets use this. + * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not + * have it set. + */ + if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + + /* * QUEUE_FLAG_STACKABLE must be set after all queue settings are * visible to other CPUs because, once the flag is set, incoming bios * are processed by request-based dm, which refers to the queue diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index af1fc3b2c2ad..c29410af1e22 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -509,9 +509,9 @@ enum pool_mode { struct pool_features { enum pool_mode mode; - unsigned zero_new_blocks:1; - unsigned discard_enabled:1; - unsigned discard_passdown:1; + bool zero_new_blocks:1; + bool discard_enabled:1; + bool discard_passdown:1; }; struct thin_c; @@ -580,7 +580,8 @@ struct pool_c { struct dm_target_callbacks callbacks; dm_block_t low_water_blocks; - struct pool_features pf; + struct pool_features requested_pf; /* Features requested during table load */ + struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ }; /* @@ -1839,6 +1840,47 @@ static void __requeue_bios(struct pool *pool) /*---------------------------------------------------------------- * Binding of control targets to a pool object *--------------------------------------------------------------*/ +static bool data_dev_supports_discard(struct pool_c *pt) +{ + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + + return q && blk_queue_discard(q); +} + +/* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct pool_c *pt) +{ + struct pool *pool = pt->pool; + struct block_device *data_bdev = pt->data_dev->bdev; + struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; + const char *reason = NULL; + char buf[BDEVNAME_SIZE]; + + if (!pt->adjusted_pf.discard_passdown) + return; + + if (!data_dev_supports_discard(pt)) + reason = "discard unsupported"; + + else if (data_limits->max_discard_sectors < pool->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + else if (data_limits->discard_granularity > block_size) + reason = "discard granularity larger than a block"; + + else if (block_size & (data_limits->discard_granularity - 1)) + reason = "discard granularity not a factor of block size"; + + if (reason) { + DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); + pt->adjusted_pf.discard_passdown = false; + } +} + static int bind_control_target(struct pool *pool, struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -1847,31 +1889,16 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) * We want to make sure that degraded pools are never upgraded. */ enum pool_mode old_mode = pool->pf.mode; - enum pool_mode new_mode = pt->pf.mode; + enum pool_mode new_mode = pt->adjusted_pf.mode; if (old_mode > new_mode) new_mode = old_mode; pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; - pool->pf = pt->pf; - set_pool_mode(pool, new_mode); + pool->pf = pt->adjusted_pf; - /* - * If discard_passdown was enabled verify that the data device - * supports discards. Disable discard_passdown if not; otherwise - * -EOPNOTSUPP will be returned. - */ - /* FIXME: pull this out into a sep fn. */ - if (pt->pf.discard_passdown) { - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - if (!q || !blk_queue_discard(q)) { - char buf[BDEVNAME_SIZE]; - DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.", - bdevname(pt->data_dev->bdev, buf)); - pool->pf.discard_passdown = 0; - } - } + set_pool_mode(pool, new_mode); return 0; } @@ -1889,9 +1916,9 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) static void pool_features_init(struct pool_features *pf) { pf->mode = PM_WRITE; - pf->zero_new_blocks = 1; - pf->discard_enabled = 1; - pf->discard_passdown = 1; + pf->zero_new_blocks = true; + pf->discard_enabled = true; + pf->discard_passdown = true; } static void __pool_destroy(struct pool *pool) @@ -2119,13 +2146,13 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, argc--; if (!strcasecmp(arg_name, "skip_block_zeroing")) - pf->zero_new_blocks = 0; + pf->zero_new_blocks = false; else if (!strcasecmp(arg_name, "ignore_discard")) - pf->discard_enabled = 0; + pf->discard_enabled = false; else if (!strcasecmp(arg_name, "no_discard_passdown")) - pf->discard_passdown = 0; + pf->discard_passdown = false; else if (!strcasecmp(arg_name, "read_only")) pf->mode = PM_READ_ONLY; @@ -2259,8 +2286,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->metadata_dev = metadata_dev; pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; - pt->pf = pf; + pt->adjusted_pf = pt->requested_pf = pf; ti->num_flush_requests = 1; + /* * Only need to enable discards if the pool should pass * them down to the data device. The thin device's discard @@ -2268,12 +2296,14 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) */ if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_requests = 1; + /* * Setting 'discards_supported' circumvents the normal * stacking of discard limits (this keeps the pool and * thin devices' discard limits consistent). */ ti->discards_supported = true; + ti->discard_zeroes_data_unsupported = true; } ti->private = pt; @@ -2703,7 +2733,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, format_dev_t(buf2, pt->data_dev->bdev->bd_dev), (unsigned long)pool->sectors_per_block, (unsigned long long)pt->low_water_blocks); - emit_flags(&pt->pf, result, sz, maxlen); + emit_flags(&pt->requested_pf, result, sz, maxlen); break; } @@ -2732,20 +2762,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -static void set_discard_limits(struct pool *pool, struct queue_limits *limits) +static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) { - /* - * FIXME: these limits may be incompatible with the pool's data device - */ + struct pool *pool = pt->pool; + struct queue_limits *data_limits; + limits->max_discard_sectors = pool->sectors_per_block; /* - * This is just a hint, and not enforced. We have to cope with - * bios that cover a block partially. A discard that spans a block - * boundary is not sent to this target. + * discard_granularity is just a hint, and not enforced. */ - limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - limits->discard_zeroes_data = pool->pf.zero_new_blocks; + if (pt->adjusted_pf.discard_passdown) { + data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; + limits->discard_granularity = data_limits->discard_granularity; + } else + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; } static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2755,15 +2786,25 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, 0); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - if (pool->pf.discard_enabled) - set_discard_limits(pool, limits); + + /* + * pt->adjusted_pf is a staging area for the actual features to use. + * They get transferred to the live pool in bind_control_target() + * called from pool_preresume(). + */ + if (!pt->adjusted_pf.discard_enabled) + return; + + disable_passdown_if_not_supported(pt); + + set_discard_limits(pt, limits); } static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3042,19 +3083,19 @@ static int thin_iterate_devices(struct dm_target *ti, return 0; } +/* + * A thin device always inherits its queue limits from its pool. + */ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct thin_c *tc = ti->private; - struct pool *pool = tc->pool; - blk_limits_io_min(limits, 0); - blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); - set_discard_limits(pool, limits); + *limits = bdev_get_queue(tc->pool_dev->bdev)->limits; } static struct target_type thin_target = { .name = "thin", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 254d19268ad2..892ae2766aa6 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -718,8 +718,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) v->hash_dev_block_bits = ffs(num) - 1; if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { + (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) { ti->error = "Invalid data blocks"; r = -EINVAL; goto bad; @@ -733,8 +733,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || - num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != - (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { + (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) + >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) { ti->error = "Invalid hash start"; r = -EINVAL; goto bad; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4e09b6ff5b49..67ffa391edcf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -865,10 +865,14 @@ static void dm_done(struct request *clone, int error, bool mapped) { int r = error; struct dm_rq_target_io *tio = clone->end_io_data; - dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + dm_request_endio_fn rq_end_io = NULL; - if (mapped && rq_end_io) - r = rq_end_io(tio->ti, clone, error, &tio->info); + if (tio->ti) { + rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + } if (r <= 0) /* The target wants to complete the I/O */ @@ -1588,15 +1592,6 @@ static int map_request(struct dm_target *ti, struct request *clone, int r, requeued = 0; struct dm_rq_target_io *tio = clone->end_io_data; - /* - * Hold the md reference here for the in-flight I/O. - * We can't rely on the reference count by device opener, - * because the device may be closed during the request completion - * when all bios are completed. - * See the comment in rq_completed() too. - */ - dm_get(md); - tio->ti = ti; r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { @@ -1628,6 +1623,26 @@ static int map_request(struct dm_target *ti, struct request *clone, return requeued; } +static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +{ + struct request *clone; + + blk_start_request(orig); + clone = orig->special; + atomic_inc(&md->pending[rq_data_dir(clone)]); + + /* + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. + */ + dm_get(md); + + return clone; +} + /* * q->request_fn for request-based dm. * Called with the queue lock held. @@ -1657,14 +1672,21 @@ static void dm_request_fn(struct request_queue *q) pos = blk_rq_pos(rq); ti = dm_table_find_target(map, pos); - BUG_ON(!dm_target_is_valid(ti)); + if (!dm_target_is_valid(ti)) { + /* + * Must perform setup, that dm_done() requires, + * before calling dm_kill_unmapped_request + */ + DMERR_LIMIT("request attempted access beyond the end of device"); + clone = dm_start_request(md, rq); + dm_kill_unmapped_request(clone, -EIO); + continue; + } if (ti->type->busy && ti->type->busy(ti)) goto delay_and_out; - blk_start_request(rq); - clone = rq->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); + clone = dm_start_request(md, rq); spin_unlock(q->queue_lock); if (map_request(ti, clone, md)) @@ -1684,8 +1706,6 @@ delay_and_out: blk_delay_queue(q, HZ / 10); out: dm_table_put(map); - - return; } int dm_underlying_device_busy(struct request_queue *q) @@ -2409,7 +2429,7 @@ static void dm_queue_flush(struct mapped_device *md) */ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { - struct dm_table *map = ERR_PTR(-EINVAL); + struct dm_table *live_map, *map = ERR_PTR(-EINVAL); struct queue_limits limits; int r; @@ -2419,6 +2439,19 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended_md(md)) goto out; + /* + * If the new table has no data devices, retain the existing limits. + * This helps multipath with queue_if_no_path if all paths disappear, + * then new I/O is queued based on these limits, and then some paths + * reappear. + */ + if (dm_table_has_no_data_devices(table)) { + live_map = dm_get_live_table(md); + if (live_map) + limits = md->queue->limits; + dm_table_put(live_map); + } + r = dm_calculate_queue_limits(table, &limits); if (r) { map = ERR_PTR(r); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 52eef493d266..6a99fefaa743 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -54,6 +54,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, diff --git a/drivers/md/md.c b/drivers/md/md.c index d5ab4493c8be..308e87b417e0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -498,61 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue or request structures. - * We allocate an md_plug_cb for each md device and each thread it gets - * plugged on. This links tot the private plug_handle structure in the - * personality data where we keep a count of the number of outstanding - * plugs so other code can see if a plug is active. - */ -struct md_plug_cb { - struct blk_plug_cb cb; - struct mddev *mddev; -}; - -static void plugger_unplug(struct blk_plug_cb *cb) -{ - struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) - md_wakeup_thread(mdcb->mddev->thread); - kfree(mdcb); -} - -/* Check that an unplug wakeup will come shortly. - * If not, wakeup the md thread immediately - */ -int mddev_check_plugged(struct mddev *mddev) +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) { - struct blk_plug *plug = current->plug; - struct md_plug_cb *mdcb; - - if (!plug) - return 0; - - list_for_each_entry(mdcb, &plug->cb_list, cb.list) { - if (mdcb->cb.callback == plugger_unplug && - mdcb->mddev == mddev) { - /* Already on the list, move to top */ - if (mdcb != list_first_entry(&plug->cb_list, - struct md_plug_cb, - cb.list)) - list_move(&mdcb->cb.list, &plug->cb_list); - return 1; - } - } - /* Not currently on the callback list */ - mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); - if (!mdcb) - return 0; - - mdcb->mddev = mddev; - mdcb->cb.callback = plugger_unplug; - atomic_inc(&mddev->plug_cnt); - list_add(&mdcb->cb.list, &plug->cb_list); - return 1; + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); } -EXPORT_SYMBOL_GPL(mddev_check_plugged); +EXPORT_SYMBOL(md_unplug); static inline struct mddev *mddev_get(struct mddev *mddev) { @@ -602,7 +554,6 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - atomic_set(&mddev->plug_cnt, 0); spin_lock_init(&mddev->write_lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -1157,8 +1108,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor ret = 0; } rdev->sectors = rdev->sb_start; - /* Limit to 4TB as metadata cannot record more than that */ - if (rdev->sectors >= (2ULL << 32)) + /* Limit to 4TB as metadata cannot record more than that. + * (not needed for Linear and RAID0 as metadata doesn't + * record this size) + */ + if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) @@ -1449,7 +1403,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (num_sectors >= (2ULL << 32)) + if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); @@ -3942,17 +3896,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; err = do_md_stop(mddev, 0, NULL); break; case inactive: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; + if (mddev->pers) err = do_md_stop(mddev, 2, NULL); - } else + else err = 0; /* already inactive */ break; case suspended: @@ -7669,6 +7619,8 @@ static int remove_and_add_spares(struct mddev *mddev) } } } + if (removed) + set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; } @@ -7682,9 +7634,11 @@ static void reap_sync_thread(struct mddev *mddev) !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* success...*/ /* activate any spares */ - if (mddev->pers->spare_active(mddev)) + if (mddev->pers->spare_active(mddev)) { sysfs_notify(&mddev->kobj, NULL, "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b4a3c318cae..f385b038589d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -266,9 +266,6 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - atomic_t plug_cnt; /* If device is expecting - * more bios soon. - */ struct md_thread *thread; /* management thread */ struct md_thread *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* last block scheduled */ @@ -630,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern int mddev_check_plugged(struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); + +extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); +static inline int mddev_check_plugged(struct mddev *mddev) +{ + return !!blk_check_plugged(md_unplug, mddev, + sizeof(struct blk_plug_cb)); +} #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cacd008d6864..611b5f797618 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,6 +46,20 @@ */ #define NR_RAID1_BIOS 256 +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + /* When there are this many requests queue to be written by * the raid1 thread, we become 'congested' to provide back-pressure * for writeback. @@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect const sector_t this_sector = r1_bio->sector; int sectors; int best_good_sectors; - int start_disk; - int best_disk; - int i; + int best_disk, best_dist_disk, best_pending_disk; + int has_nonrot_disk; + int disk; sector_t best_dist; + unsigned int min_pending; struct md_rdev *rdev; int choose_first; + int choose_next_idle; rcu_read_lock(); /* @@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect retry: sectors = r1_bio->sectors; best_disk = -1; + best_dist_disk = -1; best_dist = MaxSector; + best_pending_disk = -1; + min_pending = UINT_MAX; best_good_sectors = 0; + has_nonrot_disk = 0; + choose_next_idle = 0; if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { + (this_sector + sectors >= conf->next_resync)) choose_first = 1; - start_disk = 0; - } else { + else choose_first = 0; - start_disk = conf->last_used; - } - for (i = 0 ; i < conf->raid_disks * 2 ; i++) { + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { sector_t dist; sector_t first_bad; int bad_sectors; - - int disk = start_disk + i; - if (disk >= conf->raid_disks * 2) - disk -= conf->raid_disks * 2; + unsigned int pending; + bool nonrot; rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED @@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } else best_good_sectors = sectors; + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first - /* Don't change to another disk for sequential reads */ - || conf->next_seq_sect == this_sector - || dist == 0 - /* If device is idle, use it */ - || atomic_read(&rdev->nr_pending) == 0) { + if (choose_first) { + best_disk = disk; + break; + } + /* Don't change to another disk for sequential reads */ + if (conf->mirrors[disk].next_seq_sect == this_sector + || dist == 0) { + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; + struct raid1_info *mirror = &conf->mirrors[disk]; + + best_disk = disk; + /* + * If buffered sequential IO size exceeds optimal + * iosize, check if there is idle disk. If yes, choose + * the idle disk. read_balance could already choose an + * idle disk before noticing it's a sequential IO in + * this disk. This doesn't matter because this disk + * will idle, next time it will be utilized after the + * first disk has IO size exceeds optimal iosize. In + * this way, iosize of the first disk will be optimal + * iosize at least. iosize of the second disk might be + * small, but not a big deal since when the second disk + * starts IO, the first disk is likely still busy. + */ + if (nonrot && opt_iosize > 0 && + mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= + mirror->seq_start) { + choose_next_idle = 1; + continue; + } + break; + } + /* If device is idle, use it */ + if (pending == 0) { best_disk = disk; break; } + + if (choose_next_idle) + continue; + + if (min_pending > pending) { + min_pending = pending; + best_pending_disk = disk; + } + if (dist < best_dist) { best_dist = dist; - best_disk = disk; + best_dist_disk = disk; } } + /* + * If all disks are rotational, choose the closest disk. If any disk is + * non-rotational, choose the disk with less pending request even the + * disk is rotational, which might/might not be optimal for raids with + * mixed ratation/non-rotational disks depending on workload. + */ + if (best_disk == -1) { + if (has_nonrot_disk) + best_disk = best_pending_disk; + else + best_disk = best_dist_disk; + } + if (best_disk >= 0) { rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) @@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect goto retry; } sectors = best_good_sectors; - conf->next_seq_sect = this_sector + sectors; - conf->last_used = best_disk; + + if (conf->mirrors[best_disk].next_seq_sect != this_sector) + conf->mirrors[best_disk].seq_start = this_sector; + + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } rcu_read_unlock(); *max_sectors = sectors; @@ -870,10 +944,48 @@ do_sync_io: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); } +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; - struct mirror_info *mirror; + struct raid1_info *mirror; struct r1bio *r1_bio; struct bio *read_bio; int i, disks; @@ -883,6 +995,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; @@ -1185,11 +1299,22 @@ read_again: mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); + + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid1_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } /* Mustn't call r1_bio_write_done before this next test, @@ -1364,7 +1489,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = -EEXIST; int mirror = 0; - struct mirror_info *p; + struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -1433,7 +1558,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct mirror_info *p = conf->mirrors+ number; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) p = conf->mirrors + conf->raid_disks + number; @@ -2173,8 +2298,7 @@ static void raid1d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -2371,6 +2495,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio->bi_rw = READ; bio->bi_end_io = end_sync_read; read_targets++; + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + /* + * The device is suitable for reading (InSync), + * but has bad block(s) here. Let's try to correct them, + * if we are doing resync or repair. Otherwise, leave + * this device alone for this sync request. + */ + bio->bi_rw = WRITE; + bio->bi_end_io = end_sync_write; + write_targets++; } } if (bio->bi_end_io) { @@ -2428,7 +2564,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ - sector_t rv = max_sector - sector_nr; + sector_t rv; + if (min_bad > 0) + max_sector = sector_nr + min_bad; + rv = max_sector - sector_nr; *skipped = 1; put_buf(r1_bio); return rv; @@ -2521,7 +2660,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) { struct r1conf *conf; int i; - struct mirror_info *disk; + struct raid1_info *disk; struct md_rdev *rdev; int err = -ENOMEM; @@ -2529,7 +2668,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info) + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) @@ -2572,6 +2711,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) mddev->merge_check_needed = 1; disk->head_position = 0; + disk->seq_start = MaxSector; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -2585,7 +2725,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; - conf->last_used = -1; for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; @@ -2611,19 +2750,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev && (disk->rdev->saved_raid_disk < 0)) conf->fullsync = 1; - } else if (conf->last_used < 0) - /* - * The first working device is used as a - * starting point to read balancing. - */ - conf->last_used = i; + } } - if (conf->last_used < 0) { - printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", - mdname(mddev)); - goto abort; - } err = -ENOMEM; conf->thread = md_register_thread(raid1d, mddev, "raid1"); if (!conf->thread) { @@ -2798,7 +2927,7 @@ static int raid1_reshape(struct mddev *mddev) */ mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; - struct mirror_info *newmirrors; + struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; @@ -2841,7 +2970,7 @@ static int raid1_reshape(struct mddev *mddev) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, + newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); @@ -2880,7 +3009,6 @@ static int raid1_reshape(struct mddev *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 80ded139314c..0ff3715fb7eb 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,9 +1,15 @@ #ifndef _RAID1_H #define _RAID1_H -struct mirror_info { +struct raid1_info { struct md_rdev *rdev; sector_t head_position; + + /* When choose the best device for a read (read_balance()) + * we try to keep sequential reads one the same device + */ + sector_t next_seq_sect; + sector_t seq_start; }; /* @@ -24,17 +30,11 @@ struct pool_info { struct r1conf { struct mddev *mddev; - struct mirror_info *mirrors; /* twice 'raid_disks' to + struct raid1_info *mirrors; /* twice 'raid_disks' to * allow for replacements. */ int raid_disks; - /* When choose the best device for a read (read_balance()) - * we try to keep sequential reads one the same device - * using 'last_used' and 'next_seq_sect' - */ - int last_used; - sector_t next_seq_sect; /* During resync, read_balancing is only allowed on the part * of the array that has been resynced. 'next_resync' tells us * where that is. @@ -135,20 +135,6 @@ struct r1bio { /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ }; -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting bios[n] to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - /* bits for r1bio.state */ #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8da6282254c3..0138a727c1f3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -60,7 +60,21 @@ */ #define NR_RAID10_BIOS 256 -/* When there are this many requests queue to be written by +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* When there are this many requests queued to be written by * the raid10 thread, we become 'congested' to provide back-pressure * for writeback. */ @@ -645,7 +659,11 @@ static int raid10_mergeable_bvec(struct request_queue *q, max = biovec->bv_len; if (mddev->merge_check_needed) { - struct r10bio r10_bio; + struct { + struct r10bio r10_bio; + struct r10dev devs[conf->copies]; + } on_stack; + struct r10bio *r10_bio = &on_stack.r10_bio; int s; if (conf->reshape_progress != MaxSector) { /* Cannot give any guidance during reshape */ @@ -653,18 +671,18 @@ static int raid10_mergeable_bvec(struct request_queue *q, return biovec->bv_len; return 0; } - r10_bio.sector = sector; - raid10_find_phys(conf, &r10_bio); + r10_bio->sector = sector; + raid10_find_phys(conf, r10_bio); rcu_read_lock(); for (s = 0; s < conf->copies; s++) { - int disk = r10_bio.devs[s].devnum; + int disk = r10_bio->devs[s].devnum; struct md_rdev *rdev = rcu_dereference( conf->mirrors[disk].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio.devs[s].addr + bvm->bi_sector = r10_bio->devs[s].addr + rdev->data_offset; bvm->bi_bdev = rdev->bdev; max = min(max, q->merge_bvec_fn( @@ -676,7 +694,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, struct request_queue *q = bdev_get_queue(rdev->bdev); if (q->merge_bvec_fn) { - bvm->bi_sector = r10_bio.devs[s].addr + bvm->bi_sector = r10_bio->devs[s].addr + rdev->data_offset; bvm->bi_bdev = rdev->bdev; max = min(max, q->merge_bvec_fn( @@ -717,7 +735,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, int sectors = r10_bio->sectors; int best_good_sectors; sector_t new_distance, best_dist; - struct md_rdev *rdev, *best_rdev; + struct md_rdev *best_rdev, *rdev = NULL; int do_balance; int best_slot; struct geom *geo = &conf->geo; @@ -839,9 +857,8 @@ retry: return rdev; } -static int raid10_congested(void *data, int bits) +int md_raid10_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct r10conf *conf = mddev->private; int i, ret = 0; @@ -849,8 +866,6 @@ static int raid10_congested(void *data, int bits) conf->pending_count >= max_queued_requests) return 1; - if (mddev_congested(mddev, bits)) - return 1; rcu_read_lock(); for (i = 0; (i < conf->geo.raid_disks || i < conf->prev.raid_disks) @@ -866,6 +881,15 @@ static int raid10_congested(void *data, int bits) rcu_read_unlock(); return ret; } +EXPORT_SYMBOL_GPL(md_raid10_congested); + +static int raid10_congested(void *data, int bits) +{ + struct mddev *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid10_congested(mddev, bits); +} static void flush_pending_writes(struct r10conf *conf) { @@ -1488,14 +1512,16 @@ static int _enough(struct r10conf *conf, struct geom *geo, int ignore) do { int n = conf->copies; int cnt = 0; + int this = first; while (n--) { - if (conf->mirrors[first].rdev && - first != ignore) + if (conf->mirrors[this].rdev && + this != ignore) cnt++; - first = (first+1) % geo->raid_disks; + this = (this+1) % geo->raid_disks; } if (cnt == 0) return 0; + first = (first + geo->near_copies) % geo->raid_disks; } while (first != 0); return 1; } @@ -1546,7 +1572,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) static void print_conf(struct r10conf *conf) { int i; - struct mirror_info *tmp; + struct raid10_info *tmp; printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { @@ -1580,7 +1606,7 @@ static int raid10_spare_active(struct mddev *mddev) { int i; struct r10conf *conf = mddev->private; - struct mirror_info *tmp; + struct raid10_info *tmp; int count = 0; unsigned long flags; @@ -1655,7 +1681,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) else mirror = first; for ( ; mirror <= last ; mirror++) { - struct mirror_info *p = &conf->mirrors[mirror]; + struct raid10_info *p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { @@ -1709,7 +1735,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) int err = 0; int number = rdev->raid_disk; struct md_rdev **rdevp; - struct mirror_info *p = conf->mirrors + number; + struct raid10_info *p = conf->mirrors + number; print_conf(conf); if (rdev == p->rdev) @@ -2660,8 +2686,7 @@ static void raid10d(struct mddev *mddev) blk_start_plug(&plug); for (;;) { - if (atomic_read(&mddev->plug_cnt) == 0) - flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -2876,7 +2901,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; - struct mirror_info *mirror = &conf->mirrors[i]; + struct raid10_info *mirror = &conf->mirrors[i]; if ((mirror->rdev == NULL || test_bit(In_sync, &mirror->rdev->flags)) @@ -3388,7 +3413,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) goto out; /* FIXME calc properly */ - conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + + conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + max(0,mddev->delta_disks)), GFP_KERNEL); if (!conf->mirrors) @@ -3452,7 +3477,7 @@ static int run(struct mddev *mddev) { struct r10conf *conf; int i, disk_idx, chunk_size; - struct mirror_info *disk; + struct raid10_info *disk; struct md_rdev *rdev; sector_t size; sector_t min_offset_diff = 0; @@ -3472,12 +3497,14 @@ static int run(struct mddev *mddev) conf->thread = NULL; chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - if (conf->geo.raid_disks % conf->geo.near_copies) - blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); - else - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->geo.raid_disks / conf->geo.near_copies)); + if (mddev->queue) { + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->geo.raid_disks % conf->geo.near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->geo.raid_disks / conf->geo.near_copies)); + } rdev_for_each(rdev, mddev) { long long diff; @@ -3511,8 +3538,9 @@ static int run(struct mddev *mddev) if (first || diff < min_offset_diff) min_offset_diff = diff; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); disk->head_position = 0; } @@ -3575,22 +3603,22 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, size); mddev->resync_max_sectors = size; - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - - /* Calculate max read-ahead size. - * We need to readahead at least twice a whole stripe.... - * maybe... - */ - { + if (mddev->queue) { int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + /* Calculate max read-ahead size. + * We need to readahead at least twice a whole stripe.... + * maybe... + */ stripe /= conf->geo.near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); if (md_integrity_register(mddev)) goto out_free_conf; @@ -3641,7 +3669,10 @@ static int stop(struct mddev *mddev) lower_barrier(conf); md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + if (mddev->queue) + /* the unplug fn references 'conf'*/ + blk_sync_queue(mddev->queue); + if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); @@ -3805,7 +3836,7 @@ static int raid10_check_reshape(struct mddev *mddev) if (mddev->delta_disks > 0) { /* allocate new 'mirrors' list */ conf->mirrors_new = kzalloc( - sizeof(struct mirror_info) + sizeof(struct raid10_info) *(mddev->raid_disks + mddev->delta_disks), GFP_KERNEL); @@ -3930,7 +3961,7 @@ static int raid10_start_reshape(struct mddev *mddev) spin_lock_irq(&conf->device_lock); if (conf->mirrors_new) { memcpy(conf->mirrors_new, conf->mirrors, - sizeof(struct mirror_info)*conf->prev.raid_disks); + sizeof(struct raid10_info)*conf->prev.raid_disks); smp_mb(); kfree(conf->mirrors_old); /* FIXME and elsewhere */ conf->mirrors_old = conf->mirrors; @@ -4389,14 +4420,18 @@ static int handle_reshape_read_error(struct mddev *mddev, { /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; - struct r10bio r10b; struct r10conf *conf = mddev->private; + struct { + struct r10bio r10_bio; + struct r10dev devs[conf->copies]; + } on_stack; + struct r10bio *r10b = &on_stack.r10_bio; int slot = 0; int idx = 0; struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; - r10b.sector = r10_bio->sector; - __raid10_find_phys(&conf->prev, &r10b); + r10b->sector = r10_bio->sector; + __raid10_find_phys(&conf->prev, r10b); while (sectors) { int s = sectors; @@ -4407,7 +4442,7 @@ static int handle_reshape_read_error(struct mddev *mddev, s = PAGE_SIZE >> 9; while (!success) { - int d = r10b.devs[slot].devnum; + int d = r10b->devs[slot].devnum; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t addr; if (rdev == NULL || @@ -4415,7 +4450,7 @@ static int handle_reshape_read_error(struct mddev *mddev, !test_bit(In_sync, &rdev->flags)) goto failed; - addr = r10b.devs[slot].addr + idx * PAGE_SIZE; + addr = r10b->devs[slot].addr + idx * PAGE_SIZE; success = sync_page_io(rdev, addr, s << 9, diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 135b1b0a1554..1054cf602345 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,7 +1,7 @@ #ifndef _RAID10_H #define _RAID10_H -struct mirror_info { +struct raid10_info { struct md_rdev *rdev, *replacement; sector_t head_position; int recovery_disabled; /* matches @@ -13,8 +13,8 @@ struct mirror_info { struct r10conf { struct mddev *mddev; - struct mirror_info *mirrors; - struct mirror_info *mirrors_new, *mirrors_old; + struct raid10_info *mirrors; + struct raid10_info *mirrors_new, *mirrors_old; spinlock_t device_lock; /* geometry */ @@ -110,7 +110,7 @@ struct r10bio { * We choose the number when they are allocated. * We sometimes need an extra bio to write to the replacement. */ - struct { + struct r10dev { struct bio *bio; union { struct bio *repl_bio; /* used for resync and @@ -123,20 +123,6 @@ struct r10bio { } devs[0]; }; -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - /* bits for r10bio.state */ enum r10bio_state { R10BIO_Uptodate, @@ -159,4 +145,7 @@ enum r10bio_state { */ R10BIO_Previous, }; + +extern int md_raid10_congested(struct mddev *mddev, int bits); + #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 04348d76bb30..0689173fd9f5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits */ -static inline int raid5_bi_phys_segments(struct bio *bio) +static inline int raid5_bi_processed_stripes(struct bio *bio) { - return bio->bi_phys_segments & 0xffff; + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return (atomic_read(segments) >> 16) & 0xffff; } -static inline int raid5_bi_hw_segments(struct bio *bio) +static inline int raid5_dec_bi_active_stripes(struct bio *bio) { - return (bio->bi_phys_segments >> 16) & 0xffff; + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return atomic_sub_return(1, segments) & 0xffff; } -static inline int raid5_dec_bi_phys_segments(struct bio *bio) +static inline void raid5_inc_bi_active_stripes(struct bio *bio) { - --bio->bi_phys_segments; - return raid5_bi_phys_segments(bio); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_inc(segments); } -static inline int raid5_dec_bi_hw_segments(struct bio *bio) +static inline void raid5_set_bi_processed_stripes(struct bio *bio, + unsigned int cnt) { - unsigned short val = raid5_bi_hw_segments(bio); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + int old, new; - --val; - bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); - return val; + do { + old = atomic_read(segments); + new = (old & 0xffff) | (cnt << 16); + } while (atomic_cmpxchg(segments, old, new) != old); } -static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) +static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) { - bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_set(segments, cnt); } /* Find first data disk in a raid6 stripe */ @@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) { - if (atomic_dec_and_test(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - BUG_ON(atomic_read(&conf->active_stripes)==0); - if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) - list_add_tail(&sh->lru, &conf->bitmap_list); - else { - clear_bit(STRIPE_DELAYED, &sh->state); - clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); - } - md_wakeup_thread(conf->mddev->thread); - } else { - BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - if (atomic_dec_return(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); + } + md_wakeup_thread(conf->mddev->thread); + } else { + BUG_ON(stripe_operations_active(sh)); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + atomic_dec(&conf->active_stripes); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); } } } +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + if (atomic_dec_and_test(&sh->count)) + do_release_stripe(conf, sh); +} + static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + local_irq_save(flags); + if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { + do_release_stripe(conf, sh); + spin_unlock(&conf->device_lock); + } + local_irq_restore(flags); } static inline void remove_hash(struct stripe_head *sh) @@ -380,6 +393,8 @@ static int calc_degraded(struct r5conf *conf) degraded = 0; for (i = 0; i < conf->previous_raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded++; else if (test_bit(In_sync, &rdev->flags)) @@ -404,6 +419,8 @@ static int calc_degraded(struct r5conf *conf) degraded2 = 0; for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded2++; else if (test_bit(In_sync, &rdev->flags)) @@ -471,7 +488,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) - && !test_bit(STRIPE_EXPANDING, &sh->state)); + && !test_bit(STRIPE_EXPANDING, &sh->state) + && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -640,6 +658,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else bi->bi_sector = (sh->sector + rdev->data_offset); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + bi->bi_rw |= REQ_FLUSH; + bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -749,14 +770,12 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; struct bio *return_bi = NULL; - struct r5conf *conf = sh->raid_conf; int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ - spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -774,7 +793,7 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_phys_segments(rbi)) { + if (!raid5_dec_bi_active_stripes(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -782,7 +801,6 @@ static void ops_complete_biofill(void *stripe_head_ref) } } } - spin_unlock_irq(&conf->device_lock); clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -794,7 +812,6 @@ static void ops_complete_biofill(void *stripe_head_ref) static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; - struct r5conf *conf = sh->raid_conf; struct async_submit_ctl submit; int i; @@ -805,10 +822,10 @@ static void ops_run_biofill(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi; - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); dev->read = rbi = dev->toread; dev->toread = NULL; - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, dev->page, @@ -1144,12 +1161,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock_irq(&sh->raid_conf->device_lock); + spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; - spin_unlock_irq(&sh->raid_conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1454,6 +1471,8 @@ static int grow_one_stripe(struct r5conf *conf) init_waitqueue_head(&sh->ops.wait_for_ops); #endif + spin_lock_init(&sh->stripe_lock); + if (grow_buffers(sh)) { shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); @@ -1572,6 +1591,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&nsh->ops.wait_for_ops); #endif + spin_lock_init(&nsh->stripe_lock); list_add(&nsh->lru, &newstripes); } @@ -1739,7 +1759,9 @@ static void raid5_end_read_request(struct bio * bi, int error) atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - } + } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { @@ -1784,7 +1806,11 @@ static void raid5_end_read_request(struct bio * bi, int error) else retry = 1; if (retry) - set_bit(R5_ReadError, &sh->dev[i].flags); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + set_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + } else + set_bit(R5_ReadNoMerge, &sh->dev[i].flags); else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); @@ -2340,11 +2366,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); - - spin_lock_irq(&conf->device_lock); + /* + * If several bio share a stripe. The bio bi_phys_segments acts as a + * reference count to avoid race. The reference count should already be + * increased before this function is called (for example, in + * make_request()), so other bio sharing this stripe will not free the + * stripe. If a stripe is owned by one stripe, the stripe lock will + * protect it. + */ + spin_lock_irq(&sh->stripe_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL && sh->dev[dd_idx].written == NULL) + if (*bip == NULL) firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; @@ -2360,7 +2393,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments++; + raid5_inc_bi_active_stripes(bi); if (forwrite) { /* check if page is covered */ @@ -2375,7 +2408,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, @@ -2391,7 +2424,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(&sh->stripe_lock); return 0; } @@ -2441,10 +2474,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, rdev_dec_pending(rdev, conf->mddev); } } - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; + spin_unlock_irq(&sh->stripe_lock); if (bi) { s->to_write--; bitmap_end = 1; @@ -2457,13 +2491,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; @@ -2472,7 +2510,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -2496,14 +2534,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (!raid5_dec_bi_phys_segments(bi)) { + if (!raid5_dec_bi_active_stripes(bi)) { bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } } - spin_unlock_irq(&conf->device_lock); if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); @@ -2707,30 +2744,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, test_bit(R5_UPTODATE, &dev->flags)) { /* We can return any write requests */ struct bio *wbi, *wbi2; - int bitmap_end = 0; pr_debug("Return write for disc %d\n", i); - spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_phys_segments(wbi)) { + if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; } wbi = wbi2; } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, - sh->sector, - STRIPE_SECTORS, + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), - 0); + 0); } } @@ -3182,7 +3212,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Now to look around and see what can be done */ rcu_read_lock(); - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { struct md_rdev *rdev; sector_t first_bad; @@ -3328,7 +3357,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) do_recovery = 1; } } - spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. @@ -3791,7 +3819,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * this sets the active strip count to 1 and the processed * strip count to zero (upper 8 bits) */ - bi->bi_phys_segments = 1; /* biased count of active stripes */ + raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ } return bi; @@ -3988,6 +4016,62 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) return sh; } +struct raid5_plug_cb { + struct blk_plug_cb cb; + struct list_head list; +}; + +static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) +{ + struct raid5_plug_cb *cb = container_of( + blk_cb, struct raid5_plug_cb, cb); + struct stripe_head *sh; + struct mddev *mddev = cb->cb.data; + struct r5conf *conf = mddev->private; + + if (cb->list.next && !list_empty(&cb->list)) { + spin_lock_irq(&conf->device_lock); + while (!list_empty(&cb->list)) { + sh = list_first_entry(&cb->list, struct stripe_head, lru); + list_del_init(&sh->lru); + /* + * avoid race release_stripe_plug() sees + * STRIPE_ON_UNPLUG_LIST clear but the stripe + * is still in our list + */ + smp_mb__before_clear_bit(); + clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + __release_stripe(conf, sh); + } + spin_unlock_irq(&conf->device_lock); + } + kfree(cb); +} + +static void release_stripe_plug(struct mddev *mddev, + struct stripe_head *sh) +{ + struct blk_plug_cb *blk_cb = blk_check_plugged( + raid5_unplug, mddev, + sizeof(struct raid5_plug_cb)); + struct raid5_plug_cb *cb; + + if (!blk_cb) { + release_stripe(sh); + return; + } + + cb = container_of(blk_cb, struct raid5_plug_cb, cb); + + if (cb->list.next == NULL) + INIT_LIST_HEAD(&cb->list); + + if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) + list_add_tail(&sh->lru, &cb->list); + else + release_stripe(sh); +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4116,8 +4200,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - mddev_check_plugged(mddev); - release_stripe(sh); + release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -4126,9 +4209,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } } - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(bi); - spin_unlock_irq(&conf->device_lock); + remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { if ( rw == WRITE ) @@ -4484,7 +4565,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid5_bi_hw_segments(raid_bio)) + if (scnt < raid5_bi_processed_stripes(raid_bio)) /* already done this stripe */ continue; @@ -4492,25 +4573,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid5_set_bi_hw_segments(raid_bio, scnt); + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); - raid5_set_bi_hw_segments(raid_bio, scnt); + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } + set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); handle_stripe(sh); release_stripe(sh); handled++; } - spin_lock_irq(&conf->device_lock); - remaining = raid5_dec_bi_phys_segments(raid_bio); - spin_unlock_irq(&conf->device_lock); + remaining = raid5_dec_bi_active_stripes(raid_bio); if (remaining == 0) bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) @@ -4518,6 +4598,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } +#define MAX_STRIPE_BATCH 8 +static int handle_active_stripes(struct r5conf *conf) +{ + struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; + int i, batch_size = 0; + + while (batch_size < MAX_STRIPE_BATCH && + (sh = __get_priority_stripe(conf)) != NULL) + batch[batch_size++] = sh; + + if (batch_size == 0) + return batch_size; + spin_unlock_irq(&conf->device_lock); + + for (i = 0; i < batch_size; i++) + handle_stripe(batch[i]); + + cond_resched(); + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < batch_size; i++) + __release_stripe(conf, batch[i]); + return batch_size; +} /* * This is our raid5 kernel thread. @@ -4528,7 +4632,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) */ static void raid5d(struct mddev *mddev) { - struct stripe_head *sh; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; @@ -4542,8 +4645,9 @@ static void raid5d(struct mddev *mddev) spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; + int batch_size; - if (atomic_read(&mddev->plug_cnt) == 0 && + if ( !list_empty(&conf->bitmap_list)) { /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; @@ -4553,8 +4657,7 @@ static void raid5d(struct mddev *mddev) conf->seq_write = conf->seq_flush; activate_bit_delay(conf); } - if (atomic_read(&mddev->plug_cnt) == 0) - raid5_activate_delayed(conf); + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; @@ -4566,21 +4669,16 @@ static void raid5d(struct mddev *mddev) handled++; } - sh = __get_priority_stripe(conf); - - if (!sh) + batch_size = handle_active_stripes(conf); + if (!batch_size) break; - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh); - release_stripe(sh); - cond_resched(); + handled += batch_size; - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { + spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); - - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&conf->device_lock); + } } pr_debug("%d stripes handled\n", handled); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2164021f3b5f..a9fc24901eda 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -210,6 +210,7 @@ struct stripe_head { int disks; /* disks in stripe */ enum check_states check_state; enum reconstruct_states reconstruct_state; + spinlock_t stripe_lock; /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -273,6 +274,7 @@ enum r5dev_flags { R5_Wantwrite, R5_Overlap, /* There is a pending overlapping request * on this block */ + R5_ReadNoMerge, /* prevent bio from merging in block-layer */ R5_ReadError, /* seen a read error here recently */ R5_ReWrite, /* have tried to over-write the readerror */ @@ -319,6 +321,7 @@ enum { STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, STRIPE_OPS_REQ_PENDING, + STRIPE_ON_UNPLUG_LIST, }; /* |