diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/util.h | 1 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 1 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 18 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 16 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 49 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.c | 18 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.h | 4 | ||||
-rw-r--r-- | drivers/md/dm.c | 61 | ||||
-rw-r--r-- | drivers/md/linear.c | 1 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 27 | ||||
-rw-r--r-- | drivers/md/md.h | 13 | ||||
-rw-r--r-- | drivers/md/multipath.c | 1 | ||||
-rw-r--r-- | drivers/md/raid0.c | 2 | ||||
-rw-r--r-- | drivers/md/raid1.c | 33 | ||||
-rw-r--r-- | drivers/md/raid10.c | 45 | ||||
-rw-r--r-- | drivers/md/raid5.c | 58 |
27 files changed, 241 insertions, 142 deletions
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index a126919ed102..5d13930f0f22 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,7 +4,6 @@ #include <linux/blkdev.h> #include <linux/errno.h> -#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/sched/clock.h> #include <linux/llist.h> diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index e4c2c1a1e993..6735c8d6a445 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -932,7 +932,7 @@ static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, *result = true; r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, - from_cblock(begin), &cmd->dirty_cursor); + from_cblock(cmd->cache_blocks), &cmd->dirty_cursor); if (r) { DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__); return r; @@ -959,14 +959,16 @@ static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, return 0; } + begin = to_cblock(from_cblock(begin) + 1); + if (begin == end) + break; + r = dm_bitset_cursor_next(&cmd->dirty_cursor); if (r) { DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__); dm_bitset_cursor_end(&cmd->dirty_cursor); return r; } - - begin = to_cblock(from_cblock(begin) + 1); } dm_bitset_cursor_end(&cmd->dirty_cursor); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9c689b34e6e7..975922c8f231 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->discard_zeroes_data_unsupported = true; ti->split_discard_bios = false; cache->features = ca->features; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 136fda3ff9e5..fea5bd52ada8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md); void dm_init_normal_md_queue(struct mapped_device *md); int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); +void disable_write_zeroes(struct mapped_device *md); static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 389a3637ffcc..ef1d836bd81b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; - ti->discard_zeroes_data_unsupported = true; return 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 03940bf36f6c..3702e502466d 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region, */ if (op == REQ_OP_DISCARD) special_cmd_max_sectors = q->limits.max_discard_sectors; + else if (op == REQ_OP_WRITE_ZEROES) + special_cmd_max_sectors = q->limits.max_write_zeroes_sectors; else if (op == REQ_OP_WRITE_SAME) special_cmd_max_sectors = q->limits.max_write_same_sectors; - if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) && + if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || + op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { dec_count(io, region, -EOPNOTSUPP); return; @@ -328,11 +331,18 @@ static void do_region(int op, int op_flags, unsigned region, /* * Allocate a suitably sized-bio. */ - if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME)) + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + num_bvecs = 0; + break; + case REQ_OP_WRITE_SAME: num_bvecs = 1; - else + break; + default: num_bvecs = min_t(int, BIO_MAX_PAGES, dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + } bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); @@ -341,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region, bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); - if (op == REQ_OP_DISCARD) { + if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) { num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; remaining -= num_sectors; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 9e9d04cb7d51..f85846741d50 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->pages = &zero_page_list; /* - * Use WRITE SAME to optimize zeroing if all dests support it. + * Use WRITE ZEROES to optimize zeroing if all dests support it. */ - job->rw = REQ_OP_WRITE_SAME; + job->rw = REQ_OP_WRITE_ZEROES; for (i = 0; i < job->num_dests; i++) - if (!bdev_write_same(job->dests[i].bdev)) { + if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) { job->rw = WRITE; break; } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4788b0b989a9..e17fd44ceef5 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + ti->num_write_zeroes_bios = 1; ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7f223dbed49f..2950b145443d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + ti->num_write_zeroes_bios = 1; if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else @@ -1491,7 +1492,7 @@ static int do_end_io(struct multipath *m, struct request *clone, */ int r = DM_ENDIO_REQUEUE; - if (!error && !clone->errors) + if (!error) return 0; /* I/O complete */ if (noretry_error(error)) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f8564d63982f..2dae3e5b851c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs) /* Assume discards not supported until after checks below. */ ti->discards_supported = false; - /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */ + /* + * XXX: RAID level 4,5,6 require zeroing for safety. + */ raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); for (i = 0; i < rs->raid_disks; i++) { @@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs) return; if (raid456) { - if (!q->limits.discard_zeroes_data) - return; if (!devices_handle_discard_safely) { DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty."); DMERR("Set dm-raid.devices_handle_discard_safely=Y to override."); @@ -3726,7 +3726,7 @@ static int raid_preresume(struct dm_target *ti) return r; /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { r = bitmap_resize(mddev->bitmap, mddev->dev_sectors, to_bytes(rs->requested_bitmap_chunk_sectors), 0); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2ddc2d20e62d..a95cbb80fb34 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); - ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 28955b94d2b2..bff7e3bdb4ed 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } - if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) && - !clone->q->limits.max_write_same_sectors)) - disable_write_same(tio->md); + if (unlikely(r == -EREMOTEIO)) { + if (req_op(clone) == REQ_OP_WRITE_SAME && + !clone->q->limits.max_write_same_sectors) + disable_write_same(tio->md); + if (req_op(clone) == REQ_OP_WRITE_ZEROES && + !clone->q->limits.max_write_zeroes_sectors) + disable_write_zeroes(tio->md); + } if (r <= 0) /* The target wants to complete the I/O */ @@ -358,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error) if (!rq->q->mq_ops) blk_complete_request(rq); else - blk_mq_complete_request(rq, error); + blk_mq_complete_request(rq); } /* @@ -755,13 +760,14 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, /* Undo dm_start_request() before requeuing */ rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); + blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); return BLK_MQ_RQ_QUEUE_BUSY; } return BLK_MQ_RQ_QUEUE_OK; } -static struct blk_mq_ops dm_mq_ops = { +static const struct blk_mq_ops dm_mq_ops = { .queue_rq = dm_mq_queue_rq, .complete = dm_softirq_done, .init_request = dm_mq_init_request, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 28193a57bf47..5ef49c121d99 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; ti->num_write_same_bios = stripes; + ti->num_write_zeroes_bios = stripes; sc->chunk_size = chunk_size; if (chunk_size & (chunk_size - 1)) @@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || + unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) || unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ad16d9c9d5a..958275aca008 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } -static bool dm_table_discard_zeroes_data(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* Ensure that all targets supports discard_zeroes_data. */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (ti->discard_zeroes_data_unsupported) - return false; - } - - return true; -} - static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1533,6 +1517,34 @@ static bool dm_table_supports_write_same(struct dm_table *t) return true; } +static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !q->limits.max_write_zeroes_sectors; +} + +static bool dm_table_supports_write_zeroes(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_write_zeroes_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) + return false; + } + + return true; +} + + static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1592,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (!dm_table_discard_zeroes_data(t)) - q->limits.discard_zeroes_data = 0; - /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); @@ -1603,6 +1612,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; + if (!dm_table_supports_write_zeroes(t)) + q->limits.max_write_zeroes_sectors = 0; if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b266a2b5035..a5f1916f621a 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * them down to the data device. The thin device's discard * processing will cause mappings to be removed from the btree. */ - ti->discard_zeroes_data_unsupported = true; if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_bios = 1; @@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ - ti->discard_zeroes_data_unsupported = true; if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0f0eb8a3d922..78f36012eaca 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -146,8 +146,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, block = fec_buffer_rs_block(v, fio, n, i); res = fec_decode_rs8(v, fio, block, &par[offset], neras); if (res < 0) { - dm_bufio_release(buf); - r = res; goto error; } @@ -172,6 +170,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, done: r = corrected; error: + dm_bufio_release(buf); + if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); @@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, &is_zero) == 0) { /* skip known zero blocks entirely */ if (is_zero) - continue; + goto done; /* * skip if we have already found the theoretical @@ -439,6 +439,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; + if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { + DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + return -EIO; + } + + fio->level++; + if (type == DM_VERITY_BLOCK_TYPE_METADATA) block += v->data_blocks; @@ -470,7 +477,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (r < 0) { r = fec_decode_rsb(v, io, fio, rsb, offset, true); if (r < 0) - return r; + goto done; } if (dest) @@ -480,6 +487,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, r = verity_for_bv_block(v, io, iter, fec_bv_copy); } +done: + fio->level--; return r; } @@ -520,6 +529,7 @@ void verity_fec_init_io(struct dm_verity_io *io) memset(fio->bufs, 0, sizeof(fio->bufs)); fio->nbufs = 0; fio->output = NULL; + fio->level = 0; } /* diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 7fa0298b995e..bb31ce87a933 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -27,6 +27,9 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) +/* maximum recursion level for verity_fec_decode */ +#define DM_VERITY_FEC_MAX_RECURSION 4 + #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" @@ -58,6 +61,7 @@ struct dm_verity_fec_io { unsigned nbufs; /* number of buffers allocated */ u8 *output; /* buffer for corrected output */ size_t output_pos; + unsigned level; /* recursion level */ }; #ifdef CONFIG_DM_VERITY_FEC diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f4ffd1eb8f44..8bf397729bbd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - trace_block_bio_complete(md->queue, bio, io_error); bio->bi_error = io_error; bio_endio(bio); } @@ -825,6 +824,14 @@ void disable_write_same(struct mapped_device *md) limits->max_write_same_sectors = 0; } +void disable_write_zeroes(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE ZEROES, disable it */ + limits->max_write_zeroes_sectors = 0; +} + static void clone_endio(struct bio *bio) { int error = bio->bi_error; @@ -851,9 +858,14 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) - disable_write_same(md); + if (unlikely(r == -EREMOTEIO)) { + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + disable_write_same(md); + if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + disable_write_zeroes(md); + } free_tio(tio); dec_pending(io, error); @@ -989,26 +1001,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) struct dm_offload *o = container_of(cb, struct dm_offload, cb); struct bio_list list; struct bio *bio; + int i; INIT_LIST_HEAD(&o->cb.list); if (unlikely(!current->bio_list)) return; - list = *current->bio_list; - bio_list_init(current->bio_list); - - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { - bio_list_add(current->bio_list, bio); - continue; + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(¤t->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(¤t->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); } } @@ -1199,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti) return ti->num_write_same_bios; } +static unsigned get_num_write_zeroes_bios(struct dm_target *ti) +{ + return ti->num_write_zeroes_bios; +} + typedef bool (*is_split_required_fn)(struct dm_target *ti); static bool is_split_required_for_discard(struct dm_target *ti) @@ -1253,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci) return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); } +static int __send_write_zeroes(struct clone_info *ci) +{ + return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL); +} + /* * Select the correct strategy for processing a non-flush bio. */ @@ -1267,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci) return __send_discard(ci); else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) return __send_write_same(ci); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) + return __send_write_zeroes(ci); ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3e38e0207a3e..377a8a3672e3 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -293,6 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) split, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, split); + mddev_check_write_zeroes(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2b13117fb918..321ecac23027 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -777,7 +777,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); s = read_resync_info(mddev, bm_lockres); if (s) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", @@ -974,6 +973,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); + kfree(cinfo); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 548d1b8014f8..f6ae1d67bcd0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -440,14 +440,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); @@ -1887,7 +1879,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2295,7 +2287,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; @@ -6458,11 +6450,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->layout = info->layout; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) { - mddev->flags = 0; - mddev->sb_flags = 0; + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6533,8 +6524,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + } return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index b8859cbf84b6..1e76d64ce180 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -676,16 +676,10 @@ extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); -static inline int mddev_check_plugged(struct mddev *mddev) -{ - return !!blk_check_plugged(md_unplug, mddev, - sizeof(struct blk_plug_cb)); -} static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { @@ -715,4 +709,11 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } + +static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + mddev->queue->limits.max_write_zeroes_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 79a12b59250b..e95d521d93e9 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -139,6 +139,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; mddev_check_writesame(mddev, &mp_bh->bio); + mddev_check_write_zeroes(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 93347ca7c7a6..ce7a6a56cf73 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -383,6 +383,7 @@ static int raid0_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); @@ -504,6 +505,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) split, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, split); + mddev_check_write_zeroes(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fbc2d7851b49..b59cc100320a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf) static void freeze_array(struct r1conf *conf, int extra) { /* Stop sync I/O and normal I/O and wait for everything to - * go quite. + * go quiet. * This is called in two situations: * 1) management command handlers (reshape, remove disk, quiesce). * 2) one normal I/O request failed. @@ -1587,9 +1587,30 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) split = bio; } - if (bio_data_dir(split) == READ) + if (bio_data_dir(split) == READ) { raid1_read_request(mddev, split); - else + + /* + * If a bio is splitted, the first part of bio will + * pass barrier but the bio is queued in + * current->bio_list (see generic_make_request). If + * there is a raise_barrier() called here, the second + * part of bio can't pass barrier. But since the first + * part bio isn't dispatched to underlaying disks yet, + * the barrier is never released, hence raise_barrier + * will alays wait. We have a deadlock. + * Note, this only happens in read path. For write + * path, the first part of bio is dispatched in a + * schedule() call (because of blk plug) or offloaded + * to raid10d. + * Quitting from the function immediately can change + * the bio order queued in bio_list and avoid the deadlock. + */ + if (split != bio) { + generic_make_request(bio); + break; + } + } else raid1_write_request(mddev, split); } while (split != bio); } @@ -3156,8 +3177,10 @@ static int raid1_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (mddev->queue) + if (mddev->queue) { blk_queue_max_write_same_sectors(mddev->queue, 0); + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); + } rdev_for_each(rdev, mddev) { if (!mddev->gendisk) @@ -3246,8 +3269,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 063c43d83b72..28ec3a93acee 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -1477,11 +1478,24 @@ retry_write: mbio->bi_bdev = (void*)rdev; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } @@ -1571,7 +1585,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) split = bio; } + /* + * If a bio is splitted, the first part of bio will pass + * barrier but the bio is queued in current->bio_list (see + * generic_make_request). If there is a raise_barrier() called + * here, the second part of bio can't pass barrier. But since + * the first part bio isn't dispatched to underlaying disks + * yet, the barrier is never released, hence raise_barrier will + * alays wait. We have a deadlock. + * Note, this only happens in read path. For write path, the + * first part of bio is dispatched in a schedule() call + * (because of blk plug) or offloaded to raid10d. + * Quitting from the function immediately can change the bio + * order queued in bio_list and avoid the deadlock. + */ __make_request(mddev, split); + if (split != bio && bio_data_dir(bio) == READ) { + generic_make_request(bio); + break; + } } while (split != bio); /* In case raid10d snuck in to freeze_array */ @@ -3717,6 +3749,7 @@ static int raid10_run(struct mddev *mddev) blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, 0); + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3943,10 +3976,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4fb09b3fcb41..2efdb0d67460 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1401,7 +1401,8 @@ static int set_syndrome_sources(struct page **srcs, (test_bit(R5_Wantdrain, &dev->flags) || test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) { + (dev->written || + test_bit(R5_InJournal, &dev->flags)))) { if (test_bit(R5_InJournal, &dev->flags)) srcs[slot] = sh->dev[i].orig_page; else @@ -5030,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev); if (!error) { - trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), - raid_bi, 0); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -7228,7 +7227,6 @@ static int raid5_run(struct mddev *mddev) if (mddev->queue) { int chunk_size; - bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -7264,48 +7262,32 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_discard_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - /* - * unaligned part of discard request will be ignored, so can't - * guarantee discard_zeroes_data - */ - mddev->queue->limits.discard_zeroes_data = 0; - blk_queue_max_write_same_sectors(mddev->queue, 0); + blk_queue_max_write_zeroes_sectors(mddev->queue, 0); rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); - /* - * discard_zeroes_data is required, otherwise data - * could be lost. Consider a scenario: discard a stripe - * (the stripe could be inconsistent if - * discard_zeroes_data is 0); write one disk of the - * stripe (the stripe could be inconsistent again - * depending on which disks are used to calculate - * parity); the disk is broken; The stripe data of this - * disk is lost. - */ - if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || - !bdev_get_queue(rdev->bdev)-> - limits.discard_zeroes_data) - discard_supported = false; - /* Unfortunately, discard_zeroes_data is not currently - * a guarantee - just a hint. So we only allow DISCARD - * if the sysadmin has confirmed that only safe devices - * are in use by setting a module parameter. - */ - if (!devices_handle_discard_safely) { - if (discard_supported) { - pr_info("md/raid456: discard support disabled due to uncertainty.\n"); - pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); - } - discard_supported = false; - } } - if (discard_supported && + /* + * zeroing is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that + * only safe devices are in use by setting a module parameter. + * A better idea might be to turn DISCARD into WRITE_ZEROES + * requests, as that is required to be safe. + */ + if (devices_handle_discard_safely && mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && mddev->queue->limits.discard_granularity >= stripe) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, @@ -7605,8 +7587,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; |