diff options
Diffstat (limited to 'drivers/nvme/target/rdma.c')
-rw-r--r-- | drivers/nvme/target/rdma.c | 197 |
1 files changed, 152 insertions, 45 deletions
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 52e0c5d579a7..e7f43d1e1779 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -33,16 +33,17 @@ #include "nvmet.h" /* - * We allow up to a page of inline data to go with the SQE + * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ -#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_MAX_INLINE_SGE 4 +#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) struct nvmet_rdma_cmd { - struct ib_sge sge[2]; + struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; struct ib_recv_wr wr; - struct scatterlist inline_sg; - struct page *inline_page; + struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; }; @@ -116,6 +117,8 @@ struct nvmet_rdma_device { size_t srq_size; struct kref ref; struct list_head entry; + int inline_data_size; + int inline_page_count; }; static bool nvmet_rdma_use_srq; @@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); static const struct nvmet_fabrics_ops nvmet_rdma_ops; +static int num_pages(int len) +{ + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); +} + /* XXX: really should move to a generic header sooner or later.. */ static inline u32 get_unaligned_le24(const u8 *p) { @@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); } +static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + int i; + + if (!ndev->inline_data_size) + return; + + sg = c->inline_sg; + sge = &c->sge[1]; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } +} + +static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + struct page *pg; + int len; + int i; + + if (!ndev->inline_data_size) + return 0; + + sg = c->inline_sg; + sg_init_table(sg, ndev->inline_page_count); + sge = &c->sge[1]; + len = ndev->inline_data_size; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) + goto out_err; + sg_assign_page(sg, pg); + sge->addr = ib_dma_map_page(ndev->device, + pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, sge->addr)) + goto out_err; + sge->length = min_t(int, len, PAGE_SIZE); + sge->lkey = ndev->pd->local_dma_lkey; + len -= sge->length; + } + + return 0; +out_err: + for (; i >= 0; i--, sg--, sge--) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } + return -ENOMEM; +} + static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { @@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, c->sge[0].length = sizeof(*c->nvme_cmd); c->sge[0].lkey = ndev->pd->local_dma_lkey; - if (!admin) { - c->inline_page = alloc_pages(GFP_KERNEL, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - if (!c->inline_page) - goto out_unmap_cmd; - c->sge[1].addr = ib_dma_map_page(ndev->device, - c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) - goto out_free_inline_page; - c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; - c->sge[1].lkey = ndev->pd->local_dma_lkey; - } + if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) + goto out_unmap_cmd; c->cqe.done = nvmet_rdma_recv_done; c->wr.wr_cqe = &c->cqe; c->wr.sg_list = c->sge; - c->wr.num_sge = admin ? 1 : 2; + c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1; return 0; -out_free_inline_page: - if (!admin) { - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } out_unmap_cmd: ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); @@ -240,12 +297,8 @@ out: static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { - if (!admin) { - ib_dma_unmap_page(ndev->device, c->sge[1].addr, - NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } + if (!admin) + nvmet_rdma_free_inline_pages(ndev, c); ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); kfree(c->nvme_cmd); @@ -383,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmd) { struct ib_recv_wr *bad_wr; + int ret; ib_dma_sync_single_for_device(ndev->device, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); if (ndev->srq) - return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); - return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + else + ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + + if (unlikely(ret)) + pr_err("post_recv cmd failed\n"); + + return ret; } static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) @@ -429,7 +489,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } - if (rsp->req.sg != &rsp->cmd->inline_sg) + if (rsp->req.sg != rsp->cmd->inline_sg) sgl_free(rsp->req.sg); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) @@ -493,7 +553,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) rsp->send_sge.addr, rsp->send_sge.length, DMA_TO_DEVICE); - if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } @@ -529,10 +589,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, u64 off) { - sg_init_table(&rsp->cmd->inline_sg, 1); - sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); - rsp->req.sg = &rsp->cmd->inline_sg; - rsp->req.sg_cnt = 1; + int sg_count = num_pages(len); + struct scatterlist *sg; + int i; + + sg = rsp->cmd->inline_sg; + for (i = 0; i < sg_count; i++, sg++) { + if (i < sg_count - 1) + sg_unmark_end(sg); + else + sg_mark_end(sg); + sg->offset = off; + sg->length = min_t(int, len, PAGE_SIZE - off); + len -= sg->length; + if (!i) + off = 0; + } + + rsp->req.sg = rsp->cmd->inline_sg; + rsp->req.sg_cnt = sg_count; } static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) @@ -544,7 +619,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) if (!nvme_is_write(rsp->req.cmd)) return NVME_SC_INVALID_FIELD | NVME_SC_DNR; - if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; } @@ -743,7 +818,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) srq_size = 4095; /* XXX: tune */ srq_attr.attr.max_wr = srq_size; - srq_attr.attr.max_sge = 2; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; srq_attr.attr.srq_limit = 0; srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); @@ -765,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) ndev->srq = srq; ndev->srq_size = srq_size; - for (i = 0; i < srq_size; i++) - nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + for (i = 0; i < srq_size; i++) { + ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + if (ret) + goto out_free_cmds; + } return 0; +out_free_cmds: + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); out_destroy_srq: ib_destroy_srq(srq); return ret; @@ -793,7 +873,10 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { + struct nvmet_port *port = cm_id->context; struct nvmet_rdma_device *ndev; + int inline_page_count; + int inline_sge_count; int ret; mutex_lock(&device_list_mutex); @@ -807,6 +890,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; + inline_page_count = num_pages(port->inline_data_size); + inline_sge_count = max(cm_id->device->attrs.max_sge_rd, + cm_id->device->attrs.max_sge) - 1; + if (inline_page_count > inline_sge_count) { + pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", + port->inline_data_size, cm_id->device->name, + inline_sge_count * PAGE_SIZE); + port->inline_data_size = inline_sge_count * PAGE_SIZE; + inline_page_count = inline_sge_count; + } + ndev->inline_data_size = port->inline_data_size; + ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -881,7 +976,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; - qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); @@ -899,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) if (!ndev->srq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; - nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + if (ret) + goto err_destroy_qp; } } out: return ret; +err_destroy_qp: + rdma_destroy_qp(queue->cm_id); err_destroy_cq: ib_free_cq(queue->cq); goto out; @@ -1379,6 +1478,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) return -EINVAL; } + if (port->inline_data_size < 0) { + port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + port->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, port->disc_addr.trsvcid, &addr); if (ret) { @@ -1456,7 +1564,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, - .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, .msdbd = 1, .has_keyed_sgls = 1, .add_port = nvmet_rdma_add_port, |