diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-mpath.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 16 | ||||
-rw-r--r-- | drivers/md/dm.c | 40 | ||||
-rw-r--r-- | drivers/md/md.c | 14 | ||||
-rw-r--r-- | drivers/md/raid5.c | 146 | ||||
-rw-r--r-- | drivers/md/raid5.h | 5 |
6 files changed, 134 insertions, 91 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 63953477a07c..eff7bdd7731d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -429,9 +429,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, /* blk-mq request-based interface */ *__clone = blk_get_request(bdev_get_queue(bdev), rq_data_dir(rq), GFP_ATOMIC); - if (IS_ERR(*__clone)) + if (IS_ERR(*__clone)) { /* ENOMEM, requeue */ + clear_mapinfo(m, map_context); return r; + } (*__clone)->bio = (*__clone)->biotail = NULL; (*__clone)->rq_disk = bdev->bd_disk; (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d9b00b8565c6..16ba55ad7089 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -820,6 +820,12 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args) } EXPORT_SYMBOL(dm_consume_args); +static bool __table_type_request_based(unsigned table_type) +{ + return (table_type == DM_TYPE_REQUEST_BASED || + table_type == DM_TYPE_MQ_REQUEST_BASED); +} + static int dm_table_set_type(struct dm_table *t) { unsigned i; @@ -852,8 +858,7 @@ static int dm_table_set_type(struct dm_table *t) * Determine the type from the live device. * Default to bio-based if device is new. */ - if (live_md_type == DM_TYPE_REQUEST_BASED || - live_md_type == DM_TYPE_MQ_REQUEST_BASED) + if (__table_type_request_based(live_md_type)) request_based = 1; else bio_based = 1; @@ -903,7 +908,7 @@ static int dm_table_set_type(struct dm_table *t) } t->type = DM_TYPE_MQ_REQUEST_BASED; - } else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) { + } else if (list_empty(devices) && __table_type_request_based(live_md_type)) { /* inherit live MD type */ t->type = live_md_type; @@ -925,10 +930,7 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) bool dm_table_request_based(struct dm_table *t) { - unsigned table_type = dm_table_get_type(t); - - return (table_type == DM_TYPE_REQUEST_BASED || - table_type == DM_TYPE_MQ_REQUEST_BASED); + return __table_type_request_based(dm_table_get_type(t)); } bool dm_table_mq_request_based(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a930b72314ac..2caf492890d6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1082,13 +1082,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) dm_put(md); } -static void free_rq_clone(struct request *clone, bool must_be_mapped) +static void free_rq_clone(struct request *clone) { struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; - WARN_ON_ONCE(must_be_mapped && !clone->q); - blk_rq_unprep_clone(clone); if (md->type == DM_TYPE_MQ_REQUEST_BASED) @@ -1132,7 +1130,7 @@ static void dm_end_request(struct request *clone, int error) rq->sense_len = clone->sense_len; } - free_rq_clone(clone, true); + free_rq_clone(clone); if (!rq->q->mq_ops) blk_end_request_all(rq, error); else @@ -1151,7 +1149,7 @@ static void dm_unprep_request(struct request *rq) } if (clone) - free_rq_clone(clone, false); + free_rq_clone(clone); } /* @@ -1164,6 +1162,7 @@ static void old_requeue_request(struct request *rq) spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, rq); + blk_run_queue_async(q); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1724,8 +1723,7 @@ static int dm_merge_bvec(struct request_queue *q, struct mapped_device *md = q->queuedata; struct dm_table *map = dm_get_live_table_fast(md); struct dm_target *ti; - sector_t max_sectors; - int max_size = 0; + sector_t max_sectors, max_size = 0; if (unlikely(!map)) goto out; @@ -1740,8 +1738,16 @@ static int dm_merge_bvec(struct request_queue *q, max_sectors = min(max_io_len(bvm->bi_sector, ti), (sector_t) queue_max_sectors(q)); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ - max_size = 0; + + /* + * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t + * to the targets' merge function since it holds sectors not bytes). + * Just doing this as an interim fix for stable@ because the more + * comprehensive cleanup of switching to sector_t will impact every + * DM target that implements a ->merge hook. + */ + if (max_size > INT_MAX) + max_size = INT_MAX; /* * merge_bvec_fn() returns number of bytes @@ -1749,7 +1755,7 @@ static int dm_merge_bvec(struct request_queue *q, * max is precomputed maximal io size */ if (max_size && ti->type->merge) - max_size = ti->type->merge(ti, bvm, biovec, max_size); + max_size = ti->type->merge(ti, bvm, biovec, (int) max_size); /* * If the target doesn't support merge method and some of the devices * provided their merge_bvec method (we know this by looking for the @@ -1971,8 +1977,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, dm_kill_unmapped_request(rq, r); return r; } - if (IS_ERR(clone)) - return DM_MAPIO_REQUEUE; + if (r != DM_MAPIO_REMAPPED) + return r; if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ ti->type->release_clone_rq(clone); @@ -2753,13 +2759,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { /* clone request is allocated at the end of the pdu */ tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); - if (!clone_rq(rq, md, tio, GFP_ATOMIC)) - return BLK_MQ_RQ_QUEUE_BUSY; + (void) clone_rq(rq, md, tio, GFP_ATOMIC); queue_kthread_work(&md->kworker, &tio->work); } else { /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) - dm_requeue_unmapped_original_request(md, rq); + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; + } } return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/md/md.c b/drivers/md/md.c index 593a02476c78..27506302eb7a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len) test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (cmd_match(page, "recover")) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } else if (cmd_match(page, "reshape")) { int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; err = mddev_lock(mddev); if (!err) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); mddev_unlock(mddev); } @@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (!cmd_match(page, "repair")) return -EINVAL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b9f2b9cc6060..553d54b87052 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static bool stripe_can_batch(struct stripe_head *sh) { return test_bit(STRIPE_BATCH_READY, &sh->state) && + !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && is_full_stripe_write(sh); } @@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); + if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { + int seq = sh->bm_seq; + if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && + sh->batch_head->bm_seq > seq) + seq = sh->batch_head->bm_seq; + set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); + sh->batch_head->bm_seq = seq; + } + atomic_inc(&sh->count); unlock_out: unlock_two_stripes(head, sh); @@ -2987,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_iter.bi_sector, (unsigned long long)sh->sector, dd_idx); - spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { + /* Cannot hold spinlock over bitmap_startwrite, + * but must ensure this isn't added to a batch until + * we have added to the bitmap and set bm_seq. + * So set STRIPE_BITMAP_PENDING to prevent + * batching. + * If multiple add_stripe_bio() calls race here they + * much all set STRIPE_BITMAP_PENDING. So only the first one + * to complete "bitmap_startwrite" gets to set + * STRIPE_BIT_DELAY. This is important as once a stripe + * is added to a batch, STRIPE_BIT_DELAY cannot be changed + * any more. + */ + set_bit(STRIPE_BITMAP_PENDING, &sh->state); + spin_unlock_irq(&sh->stripe_lock); bitmap_startwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); + spin_lock_irq(&sh->stripe_lock); + clear_bit(STRIPE_BITMAP_PENDING, &sh->state); + if (!sh->batch_head) { + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } } + spin_unlock_irq(&sh->stripe_lock); if (stripe_can_batch(sh)) stripe_add_to_batch_list(conf, sh); @@ -3392,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags); /* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but @@ -3405,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf, int discard_pending = 0; struct stripe_head *head_sh = sh; bool do_endio = false; - int wakeup_nr = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -3494,44 +3523,8 @@ unhash: if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); - if (!head_sh->batch_head || !do_endio) - return; - for (i = 0; i < head_sh->disks; i++) { - if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) - wakeup_nr++; - } - while (!list_empty(&head_sh->batch_list)) { - int i; - sh = list_first_entry(&head_sh->batch_list, - struct stripe_head, batch_list); - list_del_init(&sh->batch_list); - - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, - head_sh->state & ~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE) | - STRIPE_EXPAND_SYNC_FLAG)); - sh->check_state = head_sh->check_state; - sh->reconstruct_state = head_sh->reconstruct_state; - for (i = 0; i < sh->disks; i++) { - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wakeup_nr++; - sh->dev[i].flags = head_sh->dev[i].flags; - } - - spin_lock_irq(&sh->stripe_lock); - sh->batch_head = NULL; - spin_unlock_irq(&sh->stripe_lock); - if (sh->state & STRIPE_EXPAND_SYNC_FLAG) - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - } - - spin_lock_irq(&head_sh->stripe_lock); - head_sh->batch_head = NULL; - spin_unlock_irq(&head_sh->stripe_lock); - wake_up_nr(&conf->wait_for_overlap, wakeup_nr); - if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) - set_bit(STRIPE_HANDLE, &head_sh->state); + if (head_sh->batch_head && do_endio) + break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } static void handle_stripe_dirtying(struct r5conf *conf, @@ -4172,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) static int clear_batch_ready(struct stripe_head *sh) { + /* Return '1' if this is a member of batch, or + * '0' if it is a lone stripe or a head which can now be + * handled. + */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) - return 0; + return (sh->batch_head && sh->batch_head != sh); spin_lock(&sh->stripe_lock); if (!sh->batch_head) { spin_unlock(&sh->stripe_lock); @@ -4202,38 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh) return 0; } -static void check_break_stripe_batch_list(struct stripe_head *sh) +static void break_stripe_batch_list(struct stripe_head *head_sh, + unsigned long handle_flags) { - struct stripe_head *head_sh, *next; + struct stripe_head *sh, *next; int i; - - if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) - return; - - head_sh = sh; + int do_wakeup = 0; list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { list_del_init(&sh->batch_list); - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, - head_sh->state & ~((1 << STRIPE_ACTIVE) | - (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED) | - STRIPE_EXPAND_SYNC_FLAG)); + WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | + (1 << STRIPE_SYNCING) | + (1 << STRIPE_REPLACED) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DELAYED) | + (1 << STRIPE_BIT_DELAY) | + (1 << STRIPE_FULL_WRITE) | + (1 << STRIPE_BIOFILL_RUN) | + (1 << STRIPE_COMPUTE_RUN) | + (1 << STRIPE_OPS_REQ_PENDING) | + (1 << STRIPE_DISCARD) | + (1 << STRIPE_BATCH_READY) | + (1 << STRIPE_BATCH_ERR) | + (1 << STRIPE_BITMAP_PENDING))); + WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | + (1 << STRIPE_REPLACED))); + + set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | + (1 << STRIPE_DEGRADED)), + head_sh->state & (1 << STRIPE_INSYNC)); + sh->check_state = head_sh->check_state; sh->reconstruct_state = head_sh->reconstruct_state; - for (i = 0; i < sh->disks; i++) + for (i = 0; i < sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + do_wakeup = 1; sh->dev[i].flags = head_sh->dev[i].flags & (~((1 << R5_WriteError) | (1 << R5_Overlap))); - + } spin_lock_irq(&sh->stripe_lock); sh->batch_head = NULL; spin_unlock_irq(&sh->stripe_lock); - - set_bit(STRIPE_HANDLE, &sh->state); + if (handle_flags == 0 || + sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } + spin_lock_irq(&head_sh->stripe_lock); + head_sh->batch_head = NULL; + spin_unlock_irq(&head_sh->stripe_lock); + for (i = 0; i < head_sh->disks; i++) + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) + do_wakeup = 1; + if (head_sh->state & handle_flags) + set_bit(STRIPE_HANDLE, &head_sh->state); + + if (do_wakeup) + wake_up(&head_sh->raid_conf->wait_for_overlap); } static void handle_stripe(struct stripe_head *sh) @@ -4258,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh) return; } - check_break_stripe_batch_list(sh); + if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) + break_stripe_batch_list(sh, 0); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); @@ -4312,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh) if (s.failed > conf->max_degraded) { sh->check_state = 0; sh->reconstruct_state = 0; + break_stripe_batch_list(sh, 0); if (s.to_read+s.to_write+s.written) handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); if (s.syncing + s.replacing) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd86074b..896d603ad0da 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -337,9 +337,12 @@ enum { STRIPE_ON_RELEASE_LIST, STRIPE_BATCH_READY, STRIPE_BATCH_ERR, + STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add + * to batch yet. + */ }; -#define STRIPE_EXPAND_SYNC_FLAG \ +#define STRIPE_EXPAND_SYNC_FLAGS \ ((1 << STRIPE_EXPAND_SOURCE) |\ (1 << STRIPE_EXPAND_READY) |\ (1 << STRIPE_EXPANDING) |\ |