From a347c7ad8edf4c5685154f3fdc3c12fc1db800ba Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Sun, 10 Jun 2018 22:38:24 +0200 Subject: blk-mq: reinit q->tag_set_list entry only after grace period It is not allowed to reinit q->tag_set_list list entry while RCU grace period has not completed yet, otherwise the following soft lockup in blk_mq_sched_restart() happens: [ 1064.252652] watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [fio:9270] [ 1064.254445] task: ffff99b912e8b900 task.stack: ffffa6d54c758000 [ 1064.254613] RIP: 0010:blk_mq_sched_restart+0x96/0x150 [ 1064.256510] Call Trace: [ 1064.256664] [ 1064.256824] blk_mq_free_request+0xea/0x100 [ 1064.256987] msg_io_conf+0x59/0xd0 [ibnbd_client] [ 1064.257175] complete_rdma_req+0xf2/0x230 [ibtrs_client] [ 1064.257340] ? ibtrs_post_recv_empty+0x4d/0x70 [ibtrs_core] [ 1064.257502] ibtrs_clt_rdma_done+0xd1/0x1e0 [ibtrs_client] [ 1064.257669] ib_create_qp+0x321/0x380 [ib_core] [ 1064.257841] ib_process_cq_direct+0xbd/0x120 [ib_core] [ 1064.258007] irq_poll_softirq+0xb7/0xe0 [ 1064.258165] __do_softirq+0x106/0x2a2 [ 1064.258328] irq_exit+0x92/0xa0 [ 1064.258509] do_IRQ+0x4a/0xd0 [ 1064.258660] common_interrupt+0x7a/0x7a [ 1064.258818] Meanwhile another context frees other queue but with the same set of shared tags: [ 1288.201183] INFO: task bash:5910 blocked for more than 180 seconds. [ 1288.201833] bash D 0 5910 5820 0x00000000 [ 1288.202016] Call Trace: [ 1288.202315] schedule+0x32/0x80 [ 1288.202462] schedule_timeout+0x1e5/0x380 [ 1288.203838] wait_for_completion+0xb0/0x120 [ 1288.204137] __wait_rcu_gp+0x125/0x160 [ 1288.204287] synchronize_sched+0x6e/0x80 [ 1288.204770] blk_mq_free_queue+0x74/0xe0 [ 1288.204922] blk_cleanup_queue+0xc7/0x110 [ 1288.205073] ibnbd_clt_unmap_device+0x1bc/0x280 [ibnbd_client] [ 1288.205389] ibnbd_clt_unmap_dev_store+0x169/0x1f0 [ibnbd_client] [ 1288.205548] kernfs_fop_write+0x109/0x180 [ 1288.206328] vfs_write+0xb3/0x1a0 [ 1288.206476] SyS_write+0x52/0xc0 [ 1288.206624] do_syscall_64+0x68/0x1d0 [ 1288.206774] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 What happened is the following: 1. There are several MQ queues with shared tags. 2. One queue is about to be freed and now task is in blk_mq_del_queue_tag_set(). 3. Other CPU is in blk_mq_sched_restart() and loops over all queues in tag list in order to find hctx to restart. Because linked list entry was modified in blk_mq_del_queue_tag_set() without proper waiting for a grace period, blk_mq_sched_restart() never ends, spining in list_for_each_entry_rcu_rr(), thus soft lockup. Fix is simple: reinit list entry after an RCU grace period elapsed. Fixes: Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") Cc: stable@vger.kernel.org Cc: Sagi Grimberg Cc: linux-block@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: Roman Pen Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d2de0a719ab8..2be78cc30ec5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2349,7 +2349,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_rcu(&q->tag_set_list); - INIT_LIST_HEAD(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2357,8 +2356,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); + INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, -- cgit v1.2.3 From da661267398869a553b7f67d739d360aaa1361b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 13:58:45 +0200 Subject: blk-mq: don't time out requests again that are in the timeout handler We can currently call the timeout handler again on a request that has already been handed over to the timeout handler. Prevent that with a new flag. Fixes: 12f5b931 ("blk-mq: Remove generation seqeunce") Reported-by: Andrew Randrianasulu Tested-by: Andrew Randrianasulu Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++++ include/linux/blkdev.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2be78cc30ec5..8e57b84e50e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -671,6 +671,7 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); + rq->rq_flags &= ~RQF_TIMED_OUT; if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -770,6 +771,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); static void blk_mq_rq_timed_out(struct request *req, bool reserved) { + req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; @@ -779,6 +781,7 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } + req->rq_flags &= ~RQF_TIMED_OUT; blk_add_timer(req); } @@ -788,6 +791,8 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; + if (rq->rq_flags & RQF_TIMED_OUT) + return false; deadline = blk_rq_deadline(rq); if (time_after_eq(jiffies, deadline)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bca3a92eb55f..fa6f11751430 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* already slept for hybrid poll */ #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) +/* ->timeout has been called, don't expire again */ +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From e6c3456aa897c9799de5423b28550efad14a51b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 14:26:47 +0200 Subject: blk-mq: remove blk_mq_tagset_iter Unused now that nvme stopped using it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe --- block/blk-mq-tag.c | 29 ----------------------------- include/linux/blk-mq.h | 2 -- 2 files changed, 31 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 70356a2a11ab..09b2ee6694fb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -311,35 +311,6 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (fn)(void *, struct request *)) -{ - int i, j, ret = 0; - - if (WARN_ON_ONCE(!fn)) - goto out; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - if (!tags) - continue; - - for (j = 0; j < tags->nr_tags; j++) { - if (!tags->static_rqs[j]) - continue; - - ret = fn(data, tags->static_rqs[j]); - if (ret) - goto out; - } - } - -out: - return ret; -} -EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fb355173f3c7..e3147eb74222 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -281,8 +281,6 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From be7f99c536c5aeebad29082b7d8dce32077fea14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Jun 2018 13:55:07 +0200 Subject: block: remov blk_queue_invalidate_tags This function is entirely unused, so remove it and the tag_queue_busy member of struct request_queue. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 15 +-------------- block/blk-tag.c | 22 ---------------------- include/linux/blkdev.h | 2 -- 3 files changed, 1 insertion(+), 38 deletions(-) (limited to 'block') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 86927029a52d..207eca58efaa 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -752,18 +752,6 @@ completion of the request to the block layer. This means ending tag operations before calling end_that_request_last()! For an example of a user of these helpers, see the IDE tagged command queueing support. -Certain hardware conditions may dictate a need to invalidate the block tag -queue. For instance, on IDE any tagged request error needs to clear both -the hardware and software block queue and enable the driver to sanely restart -all the outstanding requests. There's a third helper to do that: - - blk_queue_invalidate_tags(struct request_queue *q) - - Clear the internal block tag queue and re-add all the pending requests - to the request queue. The driver will receive them again on the - next request_fn run, just like it did the first time it encountered - them. - 3.2.5.2 Tag info Some block functions exist to query current tag status or to go from a @@ -805,8 +793,7 @@ Internally, block manages tags in the blk_queue_tag structure: Most of the above is simple and straight forward, however busy_list may need a bit of explaining. Normally we don't care too much about request ordering, but in the event of any barrier requests in the tag queue we need to ensure -that requests are restarted in the order they were queue. This may happen -if the driver needs to use blk_queue_invalidate_tags(). +that requests are restarted in the order they were queue. 3.3 I/O Submission diff --git a/block/blk-tag.c b/block/blk-tag.c index 09f19c6c52ce..ada0d7cff62b 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -188,7 +188,6 @@ int blk_queue_init_tags(struct request_queue *q, int depth, */ q->queue_tags = tags; queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); - INIT_LIST_HEAD(&q->tag_busy_list); return 0; } EXPORT_SYMBOL(blk_queue_init_tags); @@ -374,27 +373,6 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); - list_add(&rq->queuelist, &q->tag_busy_list); return 0; } EXPORT_SYMBOL(blk_queue_start_tag); - -/** - * blk_queue_invalidate_tags - invalidate all pending tags - * @q: the request queue for the device - * - * Description: - * Hardware conditions may dictate a need to stop all pending requests. - * In this case, we will safely clear the block side of the tag queue and - * readd all requests to the request queue in the right order. - **/ -void blk_queue_invalidate_tags(struct request_queue *q) -{ - struct list_head *tmp, *n; - - lockdep_assert_held(q->queue_lock); - - list_for_each_safe(tmp, n, &q->tag_busy_list) - blk_requeue_request(q, list_entry_rq(tmp)); -} -EXPORT_SYMBOL(blk_queue_invalidate_tags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fa6f11751430..9154570edf29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -562,7 +562,6 @@ struct request_queue { unsigned int dma_alignment; struct blk_queue_tag *queue_tags; - struct list_head tag_busy_list; unsigned int nr_sorted; unsigned int in_flight[2]; @@ -1375,7 +1374,6 @@ extern void blk_queue_end_tag(struct request_queue *, struct request *); extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); extern void blk_queue_free_tags(struct request_queue *); extern int blk_queue_resize_tags(struct request_queue *, int); -extern void blk_queue_invalidate_tags(struct request_queue *); extern struct blk_queue_tag *blk_init_tags(int, int); extern void blk_free_tags(struct blk_queue_tag *); -- cgit v1.2.3 From d6c73964f1e2a07f75057fb32ae46f6599036f93 Mon Sep 17 00:00:00 2001 From: Anatoliy Glagolev Date: Wed, 13 Jun 2018 15:38:51 -0600 Subject: bsg: fix race of bsg_open and bsg_unregister The existing implementation allows races between bsg_unregister and bsg_open paths. bsg_unregister and request_queue cleanup and deletion may start and complete right after bsg_get_device (in bsg_open path) retrieves bsg_class_device and releases the mutex. Then bsg_open path touches freed memory of bsg_class_device and request_queue. One possible fix is to hold the mutex all the way through bsg_get_device instead of releasing it after bsg_class_device retrieval. Reviewed-by: Christoph Hellwig Signed-Off-By: Anatoliy Glagolev Signed-off-by: Jens Axboe --- block/bsg.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 132e657e2d91..66602c489956 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -693,6 +693,8 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct bsg_device *bd; unsigned char buf[32]; + lockdep_assert_held(&bsg_mutex); + if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); @@ -707,14 +709,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode, bsg_set_block(bd, file); atomic_set(&bd->ref_count, 1); - mutex_lock(&bsg_mutex); hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); bsg_dbg(bd, "bound to <%s>, max queue %d\n", format_dev_t(buf, inode->i_rdev), bd->max_queue); - mutex_unlock(&bsg_mutex); return bd; } @@ -722,7 +722,7 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) { struct bsg_device *bd; - mutex_lock(&bsg_mutex); + lockdep_assert_held(&bsg_mutex); hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { if (bd->queue == q) { @@ -732,7 +732,6 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) } bd = NULL; found: - mutex_unlock(&bsg_mutex); return bd; } @@ -746,17 +745,18 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) */ mutex_lock(&bsg_mutex); bcd = idr_find(&bsg_minor_idr, iminor(inode)); - mutex_unlock(&bsg_mutex); - if (!bcd) - return ERR_PTR(-ENODEV); + if (!bcd) { + bd = ERR_PTR(-ENODEV); + goto out_unlock; + } bd = __bsg_get_device(iminor(inode), bcd->queue); - if (bd) - return bd; - - bd = bsg_add_device(inode, bcd->queue, file); + if (!bd) + bd = bsg_add_device(inode, bcd->queue, file); +out_unlock: + mutex_unlock(&bsg_mutex); return bd; } -- cgit v1.2.3