diff options
-rw-r--r-- | block/blk-core.c | 1 | ||||
-rw-r--r-- | block/blk-ioc.c | 44 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 16 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 2 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 6 | ||||
-rw-r--r-- | block/blk-mq.c | 120 | ||||
-rw-r--r-- | block/blk-mq.h | 10 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/elevator.c | 2 | ||||
-rw-r--r-- | block/genhd.c | 5 | ||||
-rw-r--r-- | drivers/block/loop.c | 15 | ||||
-rw-r--r-- | drivers/block/nbd.c | 4 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 47 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 4 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 45 | ||||
-rw-r--r-- | drivers/pci/msi.c | 16 | ||||
-rw-r--r-- | fs/block_dev.c | 6 | ||||
-rw-r--r-- | include/linux/blk-mq.h | 3 | ||||
-rw-r--r-- | include/linux/pci.h | 6 |
19 files changed, 265 insertions, 89 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index b9e857f4afe8..1086dac8724c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -578,7 +578,6 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(q->backing_dev_info); put_disk_devt(q->disk_devt); /* @q is and will stay empty, shutdown and put */ diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 6bfa39675337..63898d229cb9 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -37,8 +37,8 @@ static void icq_free_icq_rcu(struct rcu_head *head) } /* - * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for - * mq. + * Exit an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { @@ -55,7 +55,10 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } -/* Release an icq. Called with both ioc and q locked. */ +/* + * Release an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. + */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; @@ -63,7 +66,6 @@ static void ioc_destroy_icq(struct io_cq *icq) struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); - lockdep_assert_held(q->queue_lock); radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); @@ -223,24 +225,40 @@ void exit_io_context(struct task_struct *task) put_io_context_active(ioc); } +static void __ioc_clear_queue(struct list_head *icq_list) +{ + unsigned long flags; + + while (!list_empty(icq_list)) { + struct io_cq *icq = list_entry(icq_list->next, + struct io_cq, q_node); + struct io_context *ioc = icq->ioc; + + spin_lock_irqsave(&ioc->lock, flags); + ioc_destroy_icq(icq); + spin_unlock_irqrestore(&ioc->lock, flags); + } +} + /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * - * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. + * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { - lockdep_assert_held(q->queue_lock); + LIST_HEAD(icq_list); - while (!list_empty(&q->icq_list)) { - struct io_cq *icq = list_entry(q->icq_list.next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; + spin_lock_irq(q->queue_lock); + list_splice_init(&q->icq_list, &icq_list); - spin_lock(&ioc->lock); - ioc_destroy_icq(icq); - spin_unlock(&ioc->lock); + if (q->mq_ops) { + spin_unlock_irq(q->queue_lock); + __ioc_clear_queue(&icq_list); + } else { + __ioc_clear_queue(&icq_list); + spin_unlock_irq(q->queue_lock); } } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 98c7b061781e..09af8ff18719 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -110,15 +110,14 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, struct blk_mq_alloc_data *data) { struct elevator_queue *e = q->elevator; - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; struct request *rq; blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - - blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); + data->q = q; + if (likely(!data->ctx)) + data->ctx = blk_mq_get_ctx(q); + if (likely(!data->hctx)) + data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (e) { data->flags |= BLK_MQ_REQ_INTERNAL; @@ -135,8 +134,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, rq = __blk_mq_alloc_request(data, op); } else { rq = __blk_mq_alloc_request(data, op); - if (rq) - data->hctx->tags->rqs[rq->tag] = rq; } if (rq) { @@ -454,7 +451,8 @@ int blk_mq_sched_setup(struct request_queue *q) */ ret = 0; queue_for_each_hw_ctx(q, hctx, i) { - hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0); + hctx->sched_tags = blk_mq_alloc_rq_map(set, i, + q->nr_requests, set->reserved_tags); if (!hctx->sched_tags) { ret = -ENOMEM; break; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 54c84363c1b2..e48bc2c72615 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -181,7 +181,7 @@ found_tag: void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { - if (tag >= tags->nr_reserved_tags) { + if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 63497423c5cd..5cb51e53cc03 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -85,4 +85,10 @@ static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, hctx->tags->rqs[tag] = rq; } +static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, + unsigned int tag) +{ + return tag < tags->nr_reserved_tags; +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 6f35b6fd4799..b2fd175e84d7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -77,10 +77,20 @@ void blk_mq_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); -static void blk_mq_freeze_queue_wait(struct request_queue *q) +void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); + +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, + percpu_ref_is_zero(&q->q_usage_counter), + timeout); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); /* * Guarantee no request is in use, so we can change any data structure of @@ -236,6 +246,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, } rq->tag = tag; rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; } blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); @@ -275,10 +286,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, unsigned int flags, unsigned int hctx_idx) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; + unsigned int cpu; int ret; /* @@ -301,25 +311,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ - hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(hctx)) { - ret = -EXDEV; - goto out_queue_exit; + alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { + blk_queue_exit(q); + return ERR_PTR(-EXDEV); } - ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); + cpu = cpumask_first(alloc_data.hctx->cpumask); + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq) { - ret = -EWOULDBLOCK; - goto out_queue_exit; - } - - return rq; + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); -out_queue_exit: + blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); - return ERR_PTR(ret); + + if (!rq) + return ERR_PTR(-EWOULDBLOCK); + + return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); @@ -854,6 +862,9 @@ done: return true; } + if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) + data.flags |= BLK_MQ_REQ_RESERVED; + rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { if (blk_mq_tag_busy(data.hctx)) { @@ -867,12 +878,9 @@ done: return false; } -static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) +static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) { - if (rq->tag == -1 || rq->internal_tag == -1) - return; - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); rq->tag = -1; @@ -882,6 +890,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, } } +static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a @@ -991,7 +1019,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) bd.rq = rq; bd.list = dptr; - bd.last = list_empty(list); + + /* + * Flag last if we have no more requests, or if we have more + * but can't assign a driver tag to it. + */ + if (list_empty(list)) + bd.last = true; + else { + struct request *nxt; + + nxt = list_first_entry(list, struct request, queuelist); + bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + } ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -999,7 +1039,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - blk_mq_put_driver_tag(hctx, rq); + blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1029,6 +1069,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * that is where we will continue on next queue run. */ if (!list_empty(list)) { + /* + * If we got a driver tag for the next request already, + * free it again. + */ + rq = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(rq); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1715,16 +1762,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int reserved_tags) { struct blk_mq_tags *tags; + int node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, - set->numa_node, + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; + + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1732,7 +1783,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->static_rqs) { kfree(tags->rqs); blk_mq_free_tags(tags); @@ -1752,6 +1803,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); @@ -1773,7 +1829,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, this_order--; do { - page = alloc_pages_node(set->numa_node, + page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) @@ -1806,7 +1862,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, rq, hctx_idx, i, - set->numa_node)) { + node)) { tags->static_rqs[i] = NULL; goto fail; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 24b2256186f3..088ced003c13 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -146,16 +146,6 @@ struct blk_mq_alloc_data { struct blk_mq_hw_ctx *hctx; }; -static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, unsigned int flags, - struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) -{ - data->q = q; - data->flags = flags; - data->ctx = ctx; - data->hctx = hctx; -} - static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->flags & BLK_MQ_REQ_INTERNAL) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 002af836aa87..c44b321335f3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -815,9 +815,7 @@ static void blk_release_queue(struct kobject *kobj) blkcg_exit_queue(q); if (q->elevator) { - spin_lock_irq(q->queue_lock); ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); elevator_exit(q->elevator); } diff --git a/block/elevator.c b/block/elevator.c index ac1c9f481a98..01139f549b5b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -983,9 +983,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (old_registered) elv_unregister_queue(q); - spin_lock_irq(q->queue_lock); ioc_clear_queue(q); - spin_unlock_irq(q->queue_lock); } /* allocate, init and register new elevator */ diff --git a/block/genhd.c b/block/genhd.c index 2f444b87a5f2..b26a5ea115d0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,6 +681,11 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + /* + * Unregister bdi before releasing device numbers (as they can get + * reused and we'd get clashes in sysfs). + */ + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index eeb1db73f44e..0712365f2729 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1142,13 +1142,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; - if ((info->lo_flags & LO_FLAGS_PARTSCAN) && - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_flags |= LO_FLAGS_PARTSCAN; - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - loop_reread_partitions(lo, lo->lo_device); - } - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; @@ -1163,6 +1156,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) exit: blk_mq_unfreeze_queue(lo->lo_queue); + + if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + loop_reread_partitions(lo, lo->lo_device); + } + return err; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1541cb880744..7e4287bc19e5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -675,8 +675,10 @@ static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev) nbd->num_connections) { int i; - for (i = 0; i < nbd->num_connections; i++) + for (i = 0; i < nbd->num_connections; i++) { + sockfd_put(nbd->socks[i]->sock); kfree(nbd->socks[i]); + } kfree(nbd->socks); nbd->socks = NULL; nbd->num_connections = 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 25ec4e585220..9b3b57fef446 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2344,6 +2344,53 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_kill_queues); +void nvme_unfreeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unfreeze_queue(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_unfreeze); + +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); + if (timeout <= 0) + break; + } + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); + +void nvme_wait_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_wait(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze); + +void nvme_start_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_start(ns->queue); + mutex_unlock(&ctrl->namespaces_mutex); +} +EXPORT_SYMBOL_GPL(nvme_start_freeze); + void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a3da1e90b99d..2aa20e3e5675 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -294,6 +294,10 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_unfreeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 57a1af52b06e..26a5fd05fe88 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1038,9 +1038,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth) + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, + node); if (!nvmeq) return NULL; @@ -1217,7 +1218,8 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); if (!nvmeq) return -ENOMEM; } @@ -1309,7 +1311,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev) int ret = 0; for (i = dev->queue_count; i <= dev->max_qid; i++) { - if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + /* vector == qid - 1, match nvme_create_queue */ + if (!nvme_alloc_queue(dev, i, dev->q_depth, + pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; } @@ -1671,21 +1675,34 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i, queues; - u32 csts = -1; + bool dead = true; + struct pci_dev *pdev = to_pci_dev(dev->dev); del_timer_sync(&dev->watchdog_timer); mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(to_pci_dev(dev->dev))) { - nvme_stop_queues(&dev->ctrl); - csts = readl(dev->bar + NVME_REG_CSTS); + if (pci_is_enabled(pdev)) { + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + if (dev->ctrl.state == NVME_CTRL_LIVE) + nvme_start_freeze(&dev->ctrl); + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || + pdev->error_state != pci_channel_io_normal); } + /* + * Give the controller a chance to complete all entered requests if + * doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + nvme_stop_queues(&dev->ctrl); + queues = dev->online_queues - 1; for (i = dev->queue_count - 1; i > 0; i--) nvme_suspend_queue(dev->queues[i]); - if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { + if (dead) { /* A device might become IO incapable very soon during * probe, before the admin queue is configured. Thus, * queue_count can be 0 here. @@ -1700,6 +1717,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + + /* + * The driver will not be starting up queues again if shutting down so + * must flush all entered requests to their failed completion to avoid + * deadlocking blk-mq hot-cpu notifier. + */ + if (shutdown) + nvme_start_queues(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); } @@ -1822,7 +1847,9 @@ static void nvme_reset_work(struct work_struct *work) nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); nvme_dev_add(dev); + nvme_unfreeze(&dev->ctrl); } if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 980eaf588281..d571bc330686 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1298,6 +1298,22 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) } EXPORT_SYMBOL(pci_irq_get_affinity); +/** + * pci_irq_get_node - return the numa node of a particular msi vector + * @pdev: PCI device to operate on + * @vec: device-relative interrupt vector index (0-based). + */ +int pci_irq_get_node(struct pci_dev *pdev, int vec) +{ + const struct cpumask *mask; + + mask = pci_irq_get_affinity(pdev, vec); + if (mask) + return local_memory_node(cpu_to_node(cpumask_first(mask))); + return dev_to_node(&pdev->dev); +} +EXPORT_SYMBOL(pci_irq_get_node); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/fs/block_dev.c b/fs/block_dev.c index 77c30f15a02c..2eca00ec4370 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -870,6 +870,7 @@ static void init_once(void *foo) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -884,8 +885,10 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); - if (bdev->bd_bdi != &noop_backing_dev_info) + if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); + bdev->bd_bdi = &noop_backing_dev_info; + } } static const struct super_operations bdev_sops = { @@ -988,7 +991,6 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_bdi = &noop_backing_dev_info; bdev->bd_block_size = i_blocksize(inode); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 001d30d727c5..b296a9006117 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -245,6 +245,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_mq_freeze_queue_wait(struct request_queue *q); +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); int blk_mq_map_queues(struct blk_mq_tag_set *set); diff --git a/include/linux/pci.h b/include/linux/pci.h index 282ed32244ce..eb3da1a04e6c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1323,6 +1323,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, void pci_free_irq_vectors(struct pci_dev *dev); int pci_irq_vector(struct pci_dev *dev, unsigned int nr); const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec); +int pci_irq_get_node(struct pci_dev *pdev, int vec); #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } @@ -1370,6 +1371,11 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, { return cpu_possible_mask; } + +static inline int pci_irq_get_node(struct pci_dev *pdev, int vec) +{ + return first_online_node; +} #endif static inline int |