diff options
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r-- | drivers/nvme/host/rdma.c | 497 |
1 files changed, 265 insertions, 232 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 87bac27ec64b..2a0bba7f50cf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <rdma/mr_pool.h> #include <linux/err.h> #include <linux/string.h> #include <linux/atomic.h> @@ -41,17 +42,9 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_RDMA_NR_AEN_COMMANDS 1 -#define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) - struct nvme_rdma_device { - struct ib_device *dev; - struct ib_pd *pd; + struct ib_device *dev; + struct ib_pd *pd; struct kref ref; struct list_head entry; }; @@ -67,6 +60,9 @@ struct nvme_rdma_request { struct nvme_request req; struct ib_mr *mr; struct nvme_rdma_qe sqe; + union nvme_result result; + __le16 status; + refcount_t ref; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; int nents; @@ -79,13 +75,13 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_LIVE = 0, - NVME_RDMA_Q_DELETING = 1, + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, + NVME_RDMA_Q_TR_READY = 2, }; struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - atomic_t sig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -105,7 +101,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -267,29 +262,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) return ret; } -static int nvme_rdma_reinit_request(void *data, struct request *rq) -{ - struct nvme_rdma_ctrl *ctrl = data; - struct nvme_rdma_device *dev = ctrl->device; - struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - int ret = 0; - - ib_dereg_mr(req->mr); - - req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, - ctrl->max_fr_pages); - if (IS_ERR(req->mr)) { - ret = PTR_ERR(req->mr); - req->mr = NULL; - goto out; - } - - req->mr->need_inval = false; - -out: - return ret; -} - static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { @@ -299,9 +271,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; - if (req->mr) - ib_dereg_mr(req->mr); - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); } @@ -323,21 +292,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, if (ret) return ret; - req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, - ctrl->max_fr_pages); - if (IS_ERR(req->mr)) { - ret = PTR_ERR(req->mr); - goto out_free_qe; - } - req->queue = queue; return 0; - -out_free_qe: - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - return -ENOMEM; } static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -437,9 +394,20 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) struct nvme_rdma_device *dev; struct ib_device *ibdev; + if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) + return; + dev = queue->device; ibdev = dev->dev; - rdma_destroy_qp(queue->cm_id); + + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); + + /* + * The cm_id object might have been destroyed during RDMA connection + * establishment error flow to avoid getting other cma events, thus + * the destruction of the QP shouldn't use rdma_cm API. + */ + ib_destroy_qp(queue->qp); ib_free_cq(queue->ib_cq); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, @@ -448,6 +416,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) +{ + return min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ibdev->attrs.max_fast_reg_page_list_len); +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; @@ -490,10 +464,26 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) goto out_destroy_qp; } + ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, + queue->queue_size, + IB_MR_TYPE_MEM_REG, + nvme_rdma_get_max_fr_pages(ibdev)); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize MR pool sized %d for QID %d\n", + queue->queue_size, idx); + goto out_destroy_ring; + } + + set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); + return 0; +out_destroy_ring: + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); out_destroy_qp: - ib_destroy_qp(queue->qp); + rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); out_put_dev: @@ -518,7 +508,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; - atomic_set(&queue->sig_count, 0); queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -544,16 +533,17 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, ret = nvme_rdma_wait_for_cm(queue); if (ret) { dev_info(ctrl->ctrl.device, - "rdma_resolve_addr wait failed (%d).\n", ret); + "rdma connection establishment failed (%d)\n", ret); goto out_destroy_cm_id; } - clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); return 0; out_destroy_cm_id: rdma_destroy_id(queue->cm_id); + nvme_rdma_destroy_queue_ib(queue); return ret; } @@ -568,7 +558,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { - if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; if (nvme_rdma_queue_idx(queue) == 0) { @@ -676,11 +666,10 @@ out_free_queues: return ret; } -static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, + struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = admin ? - &ctrl->admin_tag_set : &ctrl->tag_set; blk_mq_free_tag_set(set); nvme_rdma_dev_put(ctrl->device); @@ -697,7 +686,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->admin_tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct nvme_rdma_request) + @@ -705,6 +694,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = 1; set->timeout = ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); @@ -748,7 +738,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -764,8 +754,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->device = ctrl->queues[0].device; - ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ctrl->device->dev->attrs.max_fast_reg_page_list_len); + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); if (new) { ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); @@ -779,11 +768,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, error = PTR_ERR(ctrl->ctrl.admin_q); goto out_free_tagset; } - } else { - error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); - if (error) - goto out_free_queue; } error = nvme_rdma_start_queue(ctrl, 0); @@ -825,7 +809,7 @@ out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -837,7 +821,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); } nvme_rdma_free_io_queues(ctrl); } @@ -863,11 +847,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } } else { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); - if (ret) - goto out_free_io_queues; - blk_mq_update_nr_hw_queues(&ctrl->tag_set, ctrl->ctrl.queue_count - 1); } @@ -883,7 +862,7 @@ out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; @@ -922,7 +901,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_wq, &ctrl->delete_work); + nvme_delete_ctrl(&ctrl->ctrl); } } @@ -935,10 +914,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) - nvme_rdma_destroy_io_queues(ctrl, false); - - nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; @@ -946,7 +921,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto requeue; + goto destroy_admin; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -956,14 +931,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; } - ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); - dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; return; +destroy_admin: + nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -979,17 +957,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - - /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, false); + } + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, false); /* * queues are not a live anymore, so restart the queues to fail fast @@ -998,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + nvme_rdma_reconnect_or_remove(ctrl); } static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) { - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; queue_work(nvme_wq, &ctrl->err_work); @@ -1031,8 +1013,18 @@ static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) { - if (unlikely(wc->status != IB_WC_SUCCESS)) + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + struct request *rq = blk_mq_rq_from_pdu(req); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); + return; + } + + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); + } static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, @@ -1043,7 +1035,7 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, .opcode = IB_WR_LOCAL_INV, .next = NULL, .num_sge = 0, - .send_flags = 0, + .send_flags = IB_SEND_SIGNALED, .ex.invalidate_rkey = req->mr->rkey, }; @@ -1057,22 +1049,15 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_rdma_ctrl *ctrl = queue->ctrl; struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - int res; if (!blk_rq_bytes(rq)) return; - if (req->mr->need_inval) { - res = nvme_rdma_inv_rkey(queue, req); - if (unlikely(res < 0)) { - dev_err(ctrl->ctrl.device, - "Queueing INV WR for rkey %#x failed (%d)\n", - req->mr->rkey, res); - nvme_rdma_error_recovery(queue->ctrl); - } + if (req->mr) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; } ib_dma_unmap_sg(ibdev, req->sg_table.sgl, @@ -1131,12 +1116,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; int nr; + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + /* * Align the MR to a 4K page size to match the ctrl page size and * the block virtual boundary. */ nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); if (unlikely(nr < count)) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; if (nr < 0) return nr; return -EINVAL; @@ -1155,8 +1146,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; - req->mr->need_inval = true; - sg->addr = cpu_to_le64(req->mr->iova); put_unaligned_le24(req->mr->length, sg->length); put_unaligned_le32(req->mr->rkey, sg->key); @@ -1176,7 +1165,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->num_sge = 1; req->inline_data = false; - req->mr->need_inval = false; + refcount_set(&req->ref, 2); /* send and recv completions */ c->common.flags |= NVME_CMD_SGL_METABUF; @@ -1213,25 +1202,24 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { - if (unlikely(wc->status != IB_WC_SUCCESS)) - nvme_rdma_wr_error(cq, wc, "SEND"); -} + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_request *req = + container_of(qe, struct nvme_rdma_request, sqe); + struct request *rq = blk_mq_rq_from_pdu(req); -/* - * We want to signal completion at least every queue depth/2. This returns the - * largest power of two that is not above half of (queue size + 1) to optimize - * (avoid divisions). - */ -static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) -{ - int limit = 1 << ilog2((queue->queue_size + 1) / 2); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "SEND"); + return; + } - return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; + if (refcount_dec_and_test(&req->ref)) + nvme_end_request(rq, req->status, req->result); } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, - struct ib_send_wr *first, bool flush) + struct ib_send_wr *first) { struct ib_send_wr wr, *bad_wr; int ret; @@ -1240,31 +1228,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, sge->length = sizeof(struct nvme_command), sge->lkey = queue->device->pd->local_dma_lkey; - qe->cqe.done = nvme_rdma_send_done; - wr.next = NULL; wr.wr_cqe = &qe->cqe; wr.sg_list = sge; wr.num_sge = num_sge; wr.opcode = IB_WR_SEND; - wr.send_flags = 0; - - /* - * Unsignalled send completions are another giant desaster in the - * IB Verbs spec: If we don't regularly post signalled sends - * the send queue will fill up and only a QP reset will rescue us. - * Would have been way to obvious to handle this in hardware or - * at least the RDMA stack.. - * - * Always signal the flushes. The magic request used for the flush - * sequencer is not allocated in our driver's tagset and it's - * triggered to be freed by blk_cleanup_queue(). So we need to - * always mark it as signaled to ensure that the "wr_cqe", which is - * embedded in request's payload, is not freed when __ib_process_cq() - * calls wr_cqe->done(). - */ - if (nvme_rdma_queue_sig_limit(queue) || flush) - wr.send_flags |= IB_SEND_SIGNALED; + wr.send_flags = IB_SEND_SIGNALED; if (first) first->next = ≀ @@ -1314,7 +1283,13 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) return queue->ctrl->tag_set.tags[queue_idx - 1]; } -static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "ASYNC"); +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); struct nvme_rdma_queue *queue = &ctrl->queues[0]; @@ -1324,21 +1299,20 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct ib_sge sge; int ret; - if (WARN_ON_ONCE(aer_idx != 0)) - return; - ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; - cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); + sqe->cqe.done = nvme_rdma_async_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); - ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); WARN_ON_ONCE(ret); } @@ -1359,14 +1333,34 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, } req = blk_mq_rq_to_pdu(rq); - if (rq->tag == tag) - ret = 1; + req->status = cqe->status; + req->result = cqe->result; - if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && - wc->ex.invalidate_rkey == req->mr->rkey) - req->mr->need_inval = false; + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) { + dev_err(queue->ctrl->ctrl.device, + "Bogus remote invalidation for rkey %#x\n", + req->mr->rkey); + nvme_rdma_error_recovery(queue->ctrl); + } + } else if (req->mr) { + ret = nvme_rdma_inv_rkey(queue, req); + if (unlikely(ret < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, ret); + nvme_rdma_error_recovery(queue->ctrl); + } + /* the local invalidation completion will end the request */ + return 0; + } + + if (refcount_dec_and_test(&req->ref)) { + if (rq->tag == tag) + ret = 1; + nvme_end_request(rq, req->status, req->result); + } - nvme_end_request(rq, cqe->status, cqe->result); return ret; } @@ -1393,7 +1387,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) * for them but rather special case them here. */ if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else @@ -1590,6 +1584,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + dev_warn(req->queue->ctrl->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + rq->tag, nvme_rdma_queue_idx(req->queue)); + /* queue error recovery */ nvme_rdma_error_recovery(req->queue->ctrl); @@ -1603,28 +1601,11 @@ nvme_rdma_timeout(struct request *rq, bool reserved) * We cannot accept any other command until the Connect command has completed. */ static inline blk_status_t -nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) -{ - if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { - struct nvme_command *cmd = nvme_req(rq)->cmd; - - if (!blk_rq_is_passthrough(rq) || - cmd->common.opcode != nvme_fabrics_command || - cmd->fabrics.fctype != nvme_fabrics_type_connect) { - /* - * reconnecting state means transport disruption, which - * can take a long time and even might fail permanently, - * so we can't let incoming I/O be requeued forever. - * fail it fast to allow upper layers a chance to - * failover. - */ - if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING) - return BLK_STS_IOERR; - return BLK_STS_RESOURCE; /* try again later */ - } - } - - return 0; +nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq) +{ + if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) + return nvmf_check_init_req(&queue->ctrl->ctrl, rq); + return BLK_STS_OK; } static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -1636,14 +1617,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = sqe->data; - bool flush = false; struct ib_device *dev; blk_status_t ret; int err; WARN_ON_ONCE(rq->tag < 0); - ret = nvme_rdma_queue_is_ready(queue, rq); + ret = nvme_rdma_is_ready(queue, rq); if (unlikely(ret)) return ret; @@ -1665,13 +1645,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, goto err; } + sqe->cqe.done = nvme_rdma_send_done; + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - if (req_op(rq) == REQ_OP_FLUSH) - flush = true; err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, - req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); + req->mr ? &req->reg_wr.wr : NULL); if (unlikely(err)) { nvme_rdma_unmap_data(queue, rq); goto err; @@ -1764,50 +1744,9 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - -static void nvme_rdma_del_ctrl_work(struct work_struct *work) -{ - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_remove_ctrl(ctrl); -} - -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - int ret = 0; - - /* - * Keep a reference until all work is flushed since - * __nvme_rdma_del_ctrl can free the ctrl mem - */ - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; - ret = __nvme_rdma_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - return ret; + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1820,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto out_fail; @@ -1831,7 +1776,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } nvme_start_ctrl(&ctrl->ctrl); @@ -1839,7 +1788,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_rdma_remove_ctrl(ctrl); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, true); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { @@ -1851,10 +1803,87 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, - .delete_ctrl = nvme_rdma_del_ctrl, + .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, }; +static inline bool +__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + char *stdport = __stringify(NVME_RDMA_IP_PORT); + + + if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || + strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) + return false; + + if (opts->mask & NVMF_OPT_TRSVCID && + ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) + return false; + } else if (opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, stdport)) + return false; + } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) + return false; + } + /* else, it's a match as both have stdport. Fall to next checks */ + + /* + * checking the local address is rough. In most cases, one + * is not specified and the host port is selected by the stack. + * + * Assume no match if: + * local address is specified and address is not the same + * local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). + */ + if (opts->mask & NVMF_OPT_HOST_TRADDR && + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) + return false; + } else if (opts->mask & NVMF_OPT_HOST_TRADDR || + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + return false; + /* + * if neither controller had an host port specified, assume it's + * a match as everything else matched. + */ + + return true; +} + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN> + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = __nvme_rdma_options_match(ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1891,6 +1920,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } } + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) @@ -1899,7 +1933,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); - INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ @@ -1958,7 +1991,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); @@ -2003,7 +2036,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) dev_info(ctrl->ctrl.device, "Removing ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - __nvme_rdma_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_rdma_ctrl_mutex); |