Diffstat (limited to 'drivers/infiniband')
41 files changed, 1704 insertions, 1566 deletions
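The bulk of this commit adds two new ib_core files: drivers/infiniband/core/mr_pool.c (a simple per-QP pool of memory registrations) and drivers/infiniband/core/rw.c (a generic RDMA READ/WRITE API that hides the iWarp/IB memory-registration differences from ULPs). As a reading aid, here is a minimal consumer sketch of the new rw API, under stated assumptions: the QP was created with qp_init_attr.cap.max_rdma_ctxs set (and a valid port_num, which rdma_create_qp() now fills in from the cm_id), and done_cqe points at a caller-provided completion handler. All identifiers below (example_rdma_write, done_cqe, sgl) are hypothetical placeholders, not part of the patch.

#include <linux/dma-mapping.h>
#include <rdma/rw.h>

/*
 * Sketch only: issue one RDMA WRITE described by a scatterlist.  The
 * scatterlist must NOT be DMA mapped by the caller; rdma_rw_ctx_init()
 * maps it internally and rdma_rw_ctx_destroy() unmaps it again.
 */
static int example_rdma_write(struct ib_qp *qp, u8 port_num,
		struct scatterlist *sgl, u32 sg_cnt,
		u64 remote_addr, u32 rkey, struct ib_cqe *done_cqe)
{
	struct rdma_rw_ctx ctx;
	int ret;

	/* Builds (and, on iWarp, MR-registers) the WR chain. */
	ret = rdma_rw_ctx_init(&ctx, qp, port_num, sgl, sg_cnt, 0,
			remote_addr, rkey, DMA_TO_DEVICE);
	if (ret < 0)
		return ret;

	/* done_cqe completes on the last WR; no chain_wr appended here. */
	ret = rdma_rw_ctx_post(&ctx, qp, port_num, done_cqe, NULL);
	if (ret)
		rdma_rw_ctx_destroy(&ctx, qp, port_num, sgl, sg_cnt,
				DMA_TO_DEVICE);

	/*
	 * On success, the completion handler behind done_cqe is expected
	 * to call rdma_rw_ctx_destroy() with the same arguments.
	 */
	return ret;
}

The full diff follows.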
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f818538a7f4e..26987d9d7e1c 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,9 +8,9 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ - roce_gid_mgmt.o + roce_gid_mgmt.o mr_pool.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93ab0ae97208..f0c91ba3178a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -800,6 +800,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, if (id->device != pd->device) return -EINVAL; + qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); @@ -4294,7 +4295,8 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) + if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), + cma_cb_table)) pr_warn("RDMA CMA: failed to add netlink callback\n"); cma_configfs_init(); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e28a160cdab0..f0572049d291 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -459,7 +459,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, if (pm_addr->ss_family == AF_INET) { struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; - if (pm4_addr->sin_addr.s_addr == INADDR_ANY) { + if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { struct sockaddr_in *cm4_addr = (struct sockaddr_in *)cm_addr; struct sockaddr_in *cm4_outaddr = @@ -1175,7 +1175,7 @@ static int __init iw_cm_init(void) if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS, + ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), iwcm_nl_cb_table); if (ret) pr_err("iw_cm: couldn't register netlink callbacks\n"); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 9b2bf2fb2b00..b65e06c560d7 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c new file mode 100644 index 000000000000..49d478b2ea94 --- /dev/null +++ b/drivers/infiniband/core/mr_pool.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include <rdma/ib_verbs.h> +#include <rdma/mr_pool.h> + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + mr = list_first_entry_or_null(list, struct ib_mr, qp_entry); + if (mr) { + list_del(&mr->qp_entry); + qp->mrs_used++; + } + spin_unlock_irqrestore(&qp->mr_lock, flags); + + return mr; +} +EXPORT_SYMBOL(ib_mr_pool_get); + +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add(&mr->qp_entry, list); + qp->mrs_used--; + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_put); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg) +{ + struct ib_mr *mr; + unsigned long flags; + int ret, i; + + for (i = 0; i < nr; i++) { + mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto out; + } + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add_tail(&mr->qp_entry, list); + spin_unlock_irqrestore(&qp->mr_lock, flags); + } + + return 0; +out: + ib_mr_pool_destroy(qp, list); + return ret; +} +EXPORT_SYMBOL(ib_mr_pool_init); + +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + while (!list_empty(list)) { + mr = list_first_entry(list, struct ib_mr, qp_entry); + list_del(&mr->qp_entry); + + spin_unlock_irqrestore(&qp->mr_lock, flags); + ib_dereg_mr(mr); + spin_lock_irqsave(&qp->mr_lock, flags); + } + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_destroy); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index d47df9356779..9b8c20c8209b 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct ibnl_client *client; int type = nlh->nlmsg_type; int index = RDMA_NL_GET_CLIENT(type); - int op = RDMA_NL_GET_OP(type); + unsigned int op = RDMA_NL_GET_OP(type); list_for_each_entry(client, &client_list, list) { if (client->index == index) { - if (op < 0 || op >= client->nops || - !client->cb_table[op].dump) + if (op >= client->nops || !client->cb_table[op].dump) return -EINVAL; /* diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c new file mode 100644 index 000000000000..1eb9b1294a63 --- /dev/null +++ b/drivers/infiniband/core/rw.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <rdma/mr_pool.h> +#include <rdma/rw.h> + +enum { + RDMA_RW_SINGLE_WR, + RDMA_RW_MULTI_WR, + RDMA_RW_MR, + RDMA_RW_SIG_MR, +}; + +static bool rdma_rw_force_mr; +module_param_named(force_mr, rdma_rw_force_mr, bool, 0); +MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); + +/* + * Check if the device might use memory registration. This is currently only + * true for iWarp devices. In the future we can hopefully fine tune this based + * on HCA driver input. + */ +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +{ + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +/* + * Check if the device will use memory registration for this RW operation. + * We currently always use memory registrations for iWarp RDMA READs, and + * have a debug option to force usage of MRs. + * + * XXX: In the future we can hopefully fine tune this based on HCA driver + * input. + */ +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, + enum dma_data_direction dir, int dma_nents) +{ + if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +static inline u32 rdma_rw_max_sge(struct ib_device *dev, + enum dma_data_direction dir) +{ + return dir == DMA_TO_DEVICE ? + dev->attrs.max_sge : dev->attrs.max_sge_rd; +} + +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +{ + /* arbitrary limit to avoid allocating gigantic resources */ + return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); +} + +static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, + struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, + u32 sg_cnt, u32 offset) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 nents = min(sg_cnt, pages_per_mr); + int count = 0, ret; + + reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); + if (!reg->mr) + return -EAGAIN; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); + if (ret < nents) { + ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); + return -EINVAL; + } + + reg->reg_wr.wr.opcode = IB_WR_REG_MR; + reg->reg_wr.mr = reg->mr; + reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + count++; + + reg->sge.addr = reg->mr->iova; + reg->sge.length = reg->mr->length; + return count; +} + +static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + int i, j, ret = 0, count = 0; + + ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *prev = i ? 
&ctx->reg[i - 1] : NULL; + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 nents = min(sg_cnt, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, + offset); + if (ret < 0) + goto out_free; + count += ret; + + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + count++; + + remote_addr += reg->sge.length; + sg_cnt -= nents; + for (j = 0; j < nents; j++) + sg = sg_next(sg); + offset = 0; + } + + ctx->type = RDMA_RW_MR; + return count; + +out_free: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); +out: + return ret; +} + +static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = rdma_rw_max_sge(dev, dir); + struct ib_sge *sge; + u32 total_len = 0, i, j; + + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge); + + ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL); + if (!ctx->map.sges) + goto out; + + ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL); + if (!ctx->map.wrs) + goto out_free_sges; + + for (i = 0; i < ctx->nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(sg_cnt, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { + rdma_wr->wr.num_sge++; + + sge->addr = ib_sg_dma_address(dev, sg) + offset; + sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += sge->length; + sge++; + sg_cnt--; + offset = 0; + } + + if (i + 1 < ctx->nr_ops) + rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + } + + ctx->type = RDMA_RW_MULTI_WR; + return ctx->nr_ops; + +out_free_sges: + kfree(ctx->map.sges); +out: + return -ENOMEM; +} + +static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + + ctx->nr_ops = 1; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; + ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +/** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE 
from/to + * @sg_cnt: number of entries in @sg + * @sg_offset: current byte offset into @sg + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. + */ +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + int ret; + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + /* + * Skip to the S/G entry that sg_offset falls into: + */ + for (;;) { + u32 len = ib_sg_dma_len(dev, sg); + + if (sg_offset < len) + break; + + sg = sg_next(sg); + sg_offset -= len; + sg_cnt--; + } + + ret = -EIO; + if (WARN_ON_ONCE(sg_cnt == 0)) + goto out_unmap_sg; + + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { + ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, + sg_offset, remote_addr, rkey, dir); + } else if (sg_cnt > 1) { + ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, + remote_addr, rkey, dir); + } else { + ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, + remote_addr, rkey, dir); + } + + if (ret < 0) + goto out_unmap_sg; + return ret; + +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_init); + +/** + * rdma_rw_ctx_signature init - initialize a RW context with signature offload + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist to READ/WRITE protection information from/to + * @prot_sg_cnt: number of entries in @prot_sg + * @sig_attrs: signature offloading algorithms + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. 
+ */ +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + struct ib_rdma_wr *rdma_wr; + struct ib_send_wr *prev_wr = NULL; + int count = 0, ret; + + if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { + pr_err("SG count too large\n"); + return -EINVAL; + } + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; + + ctx->type = RDMA_RW_SIG_MR; + ctx->nr_ops = 1; + ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); + if (!ctx->sig) { + ret = -ENOMEM; + goto out_unmap_prot_sg; + } + + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); + if (ret < 0) + goto out_free_ctx; + count += ret; + prev_wr = &ctx->sig->data.reg_wr.wr; + + if (prot_sg_cnt) { + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, + prot_sg, prot_sg_cnt, 0); + if (ret < 0) + goto out_destroy_data_mr; + count += ret; + + if (ctx->sig->prot.inv_wr.next) + prev_wr->next = &ctx->sig->prot.inv_wr; + else + prev_wr->next = &ctx->sig->prot.reg_wr.wr; + prev_wr = &ctx->sig->prot.reg_wr.wr; + } else { + ctx->sig->prot.mr = NULL; + } + + ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->sig->sig_mr) { + ret = -EAGAIN; + goto out_destroy_prot_mr; + } + + if (ctx->sig->sig_mr->need_inval) { + memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + + ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; + ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + + prev_wr->next = &ctx->sig->sig_inv_wr; + prev_wr = &ctx->sig->sig_inv_wr; + } + + ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; + ctx->sig->sig_wr.wr.wr_cqe = NULL; + ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; + ctx->sig->sig_wr.wr.num_sge = 1; + ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; + ctx->sig->sig_wr.sig_attrs = sig_attrs; + ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; + if (prot_sg_cnt) + ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; + prev_wr->next = &ctx->sig->sig_wr.wr; + prev_wr = &ctx->sig->sig_wr.wr; + count++; + + ctx->sig->sig_sge.addr = 0; + ctx->sig->sig_sge.length = ctx->sig->data.sge.length; + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) + ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + + rdma_wr = &ctx->sig->data.wr; + rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + prev_wr->next = &rdma_wr->wr; + prev_wr = &rdma_wr->wr; + count++; + + return count; + +out_destroy_prot_mr: + if (prot_sg_cnt) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); +out_destroy_data_mr: + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_free_ctx: + kfree(ctx->sig); +out_unmap_prot_sg: + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_signature_init); + +/* + * Now that we are going to post the WRs we can update the lkey and need_inval + * state on the MRs. 
If we were doing this at init time, we would get double + * or missing invalidations if a context was initialized but not actually + * posted. + */ +static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) +{ + reg->mr->need_inval = need_inval; + ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); + reg->reg_wr.key = reg->mr->lkey; + reg->sge.lkey = reg->mr->lkey; +} + +/** + * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Return the WR chain for the set of RDMA READ/WRITE operations described by + * @ctx, as well as any memory registration operations needed. If @chain_wr + * is non-NULL the WR it points to will be appended to the chain of WRs posted. + * If @chain_wr is not set @cqe must be set so that the caller gets a + * completion notification. + */ +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *last_wr; + int i; + + switch (ctx->type) { + case RDMA_RW_SIG_MR: + rdma_rw_update_lkey(&ctx->sig->data, true); + if (ctx->sig->prot.mr) + rdma_rw_update_lkey(&ctx->sig->prot, true); + + ctx->sig->sig_mr->need_inval = true; + ib_update_fast_reg_key(ctx->sig->sig_mr, + ib_inc_rkey(ctx->sig->sig_mr->lkey)); + ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; + + if (ctx->sig->data.inv_wr.next) + first_wr = &ctx->sig->data.inv_wr; + else + first_wr = &ctx->sig->data.reg_wr.wr; + last_wr = &ctx->sig->data.wr.wr; + break; + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) { + rdma_rw_update_lkey(&ctx->reg[i], + ctx->reg[i].wr.wr.opcode != + IB_WR_RDMA_READ_WITH_INV); + } + + if (ctx->reg[0].inv_wr.next) + first_wr = &ctx->reg[0].inv_wr; + else + first_wr = &ctx->reg[0].reg_wr.wr; + last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; + break; + case RDMA_RW_MULTI_WR: + first_wr = &ctx->map.wrs[0].wr; + last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; + break; + case RDMA_RW_SINGLE_WR: + first_wr = &ctx->single.wr.wr; + last_wr = &ctx->single.wr.wr; + break; + default: + BUG(); + } + + if (chain_wr) { + last_wr->next = chain_wr; + } else { + last_wr->wr_cqe = cqe; + last_wr->send_flags |= IB_SEND_SIGNALED; + } + + return first_wr; +} +EXPORT_SYMBOL(rdma_rw_ctx_wrs); + +/** + * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Post the set of RDMA READ/WRITE operations described by @ctx, as well as + * any memory registration operations needed. If @chain_wr is non-NULL the + * WR it points to will be appended to the chain of WRs posted. If @chain_wr + * is not set @cqe must be set so that the caller gets a completion + * notification. 
+ */ +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *bad_wr; + + first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); + return ib_post_send(qp, first_wr, &bad_wr); +} +EXPORT_SYMBOL(rdma_rw_ctx_post); + +/** + * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +{ + int i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); + break; + case RDMA_RW_MULTI_WR: + kfree(ctx->map.wrs); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + break; + default: + BUG(); + break; + } + + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +/** + * rdma_rw_ctx_destroy_signature - release all resources allocated by + * rdma_rw_ctx_init_signature + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist that was used for the READ/WRITE of the PI + * @prot_sg_cnt: number of entries in @prot_sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir) +{ + if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) + return; + + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + + if (ctx->sig->prot.mr) { + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + } + + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); + kfree(ctx->sig); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) +{ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. + * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. + */ + factor = 1; + + /* + * If the devices needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + factor += 6; /* (inv + reg) * (data + prot + sig) */ + else if (rdma_rw_can_use_mr(dev, attr->port_num)) + factor += 2; /* inv + reg */ + + attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; + + /* + * But maybe we were just too high in the sky and the device doesn't + * even support all we need, and we'll have to live with what we get.. 
+ */ + attr->cap.max_send_wr = + min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); +} + +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) +{ + struct ib_device *dev = qp->pd->device; + u32 nr_mrs = 0, nr_sig_mrs = 0; + int ret = 0; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + nr_sig_mrs = attr->cap.max_rdma_ctxs; + nr_mrs = attr->cap.max_rdma_ctxs * 2; + } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { + nr_mrs = attr->cap.max_rdma_ctxs; + } + + if (nr_mrs) { + ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, + IB_MR_TYPE_MEM_REG, + rdma_rw_fr_page_list_len(dev)); + if (ret) { + pr_err("%s: failed to allocated %d MRs\n", + __func__, nr_mrs); + return ret; + } + } + + if (nr_sig_mrs) { + ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, + IB_MR_TYPE_SIGNATURE, 2); + if (ret) { + pr_err("%s: failed to allocated %d SIG MRs\n", + __func__, nr_mrs); + goto out_free_rdma_mrs; + } + } + + return 0; + +out_free_rdma_mrs: + ib_mr_pool_destroy(qp, &qp->rdma_mrs); + return ret; +} + +void rdma_rw_cleanup_mrs(struct ib_qp *qp) +{ + ib_mr_pool_destroy(qp, &qp->sig_mrs); + ib_mr_pool_destroy(qp, &qp->rdma_mrs); +} diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8a09c0fb268d..3ebd108bcc5f 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -536,7 +536,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); if (!data) { - kfree_skb(skb); + nlmsg_free(skb); return -EMSGSIZE; } @@ -1820,7 +1820,7 @@ static int __init ib_sa_init(void) goto err3; } - if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, + if (ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ib_sa_cb_table), ib_sa_cb_table)) { pr_err("Failed to add netlink callback\n"); ret = -EINVAL; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6fdc7ecdaca0..03e39c2aaf66 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1833,7 +1833,8 @@ static int create_qp(struct ib_uverbs_file *file, if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV)) { + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS)) { ret = -EINVAL; goto err_put; } @@ -3655,6 +3656,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.hca_core_clock = attr.hca_core_clock; resp.response_length += sizeof(resp.hca_core_clock); + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; + + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b65b3541e732..1d7d4cf442e3 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -48,6 +48,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> +#include <rdma/rw.h> #include "core_priv.h" @@ -723,59 +724,89 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + 
qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { - struct ib_qp *qp, *real_qp; - struct ib_device *device; + struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + int ret; + + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. + */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); - device = pd ? pd->device : qp_init_attr->xrcd->device; qp = device->create_qp(pd, qp_init_attr, NULL); + if (IS_ERR(qp)) + return qp; + + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + + atomic_set(&qp->usecnt, 0); + qp->mrs_used = 0; + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } - if (!IS_ERR(qp)) { - qp->device = device; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = qp_init_attr->qp_type; - - atomic_set(&qp->usecnt, 0); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - qp->event_handler = __ib_shared_qp_event_handler; - qp->qp_context = qp; - qp->pd = NULL; - qp->send_cq = qp->recv_cq = NULL; - qp->srq = NULL; - qp->xrcd = qp_init_attr->xrcd; - atomic_inc(&qp_init_attr->xrcd->usecnt); - INIT_LIST_HEAD(&qp->open_list); - - real_qp = qp; - qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, - qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->destroy_qp(real_qp); - } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->cap.max_rdma_ctxs) { + ret = rdma_rw_init_mrs(qp, qp_init_attr); + if (ret) { + pr_err("failed to init MR pool ret= %d\n", ret); + ib_destroy_qp(qp); + qp = ERR_PTR(ret); } } @@ -1250,6 +1281,8 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_srq *srq; int ret; + WARN_ON_ONCE(qp->mrs_used > 0); + if (atomic_read(&qp->usecnt)) return -EBUSY; @@ -1261,6 +1294,9 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = 
qp->recv_cq; srq = qp->srq; + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1343,6 +1379,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1389,6 +1426,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1597,6 +1635,7 @@ EXPORT_SYMBOL(ib_set_vf_guid); * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: @@ -1615,17 +1654,15 @@ EXPORT_SYMBOL(ib_set_vf_guid); * After this completes successfully, the memory region * is ready for registration. */ -int ib_map_mr_sg(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size) +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->map_mr_sg)) return -ENOSYS; mr->page_size = page_size; - return mr->device->map_mr_sg(mr, sg, sg_nents); + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); @@ -1635,6 +1672,10 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -1645,23 +1686,26 @@ EXPORT_SYMBOL(ib_map_mr_sg); * Returns the number of sg elements that were assigned to * a page vector. */ -int ib_sg_to_pages(struct ib_mr *mr, - struct scatterlist *sgl, - int sg_nents, - int (*set_page)(struct ib_mr *, u64)) +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; - mr->iova = sg_dma_address(&sgl[0]); + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { - u64 dma_addr = sg_dma_address(sg); - unsigned int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; @@ -1685,8 +1729,14 @@ int ib_sg_to_pages(struct ib_mr *mr, do { ret = set_page(mr, page_addr); - if (unlikely(ret < 0)) - return i ? : ret; + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? 
i : ret; + } + prev_addr = page_addr; next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); @@ -1694,8 +1744,12 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = 0; return i; } EXPORT_SYMBOL(ib_sg_to_pages); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 3234a8be16f6..47cb927a0dd6 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -783,15 +783,14 @@ static int iwch_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -static int iwch_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct iwch_mr *mhp = to_iwch_mr(ibmr); mhp->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, iwch_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page); } static int iwch_destroy_qp(struct ib_qp *ib_qp) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index d4e2b519a192..f6f34a75af27 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -924,9 +924,8 @@ void c4iw_qp_rem_ref(struct ib_qp *qp); struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int c4iw_dealloc_mw(struct ib_mw *mw); struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index d495675ea68d..55d0651ee4de 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -691,15 +691,14 @@ static int c4iw_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct c4iw_mr *mhp = to_c4iw_mr(ibmr); mhp->mpl_len = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, c4iw_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page); } int c4iw_dereg_mr(struct ib_mr *ib_mr) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 72a10a19880a..c963cad92f5a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -270,7 +270,6 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev, i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0); else i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0); - synchronize_irq(msix_vec->irq); free_irq(msix_vec->irq, dev_id); } @@ -1892,7 +1891,7 @@ static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev, } /* client interface functions */ -static struct i40e_client_ops i40e_ops = { +static const struct i40e_client_ops i40e_ops = { .open = i40iw_open, .close = i40iw_close, .l2_param_change = i40iw_l2param_change, diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index 937b7ee7d69b..16cc61720b53 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ 
b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -479,7 +479,7 @@ struct i40iw_sc_dev { struct i40iw_virt_mem ieq_mem; struct i40iw_puda_rsrc *ieq; - struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; + const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; struct i40iw_hmc_fpm_misc hmc_fpm_misc; u16 qs_handle; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index eaa79c9fc821..4a740f7a0519 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1573,12 +1573,13 @@ static int i40iw_set_page(struct ib_mr *ibmr, u64 addr) * @sg: scatter gather list for fmr * @sg_nents: number of sg pages */ -static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents) +static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct i40iw_mr *iwmr = to_iwmr(ibmr); iwmr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, i40iw_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page); } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c index cb0f18340e14..e33d4810965c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.c +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c @@ -80,6 +80,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, return 0; } -struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { +const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { i40iw_manage_vf_pble_bp }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h index f649f3a62e13..4359559ece9c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.h +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h @@ -57,6 +57,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, u64 scratch, bool post_sq); -extern struct i40iw_vf_cqp_ops iw_vf_cqp_ops; +extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c index 4e1d7c665dc5..3041003c94d2 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c @@ -254,7 +254,7 @@ static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev, static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, u32 vf_id, struct i40iw_virtchnl_op_buf *vchnl_msg, - struct i40iw_dev_hw_stats hw_stats) + struct i40iw_dev_hw_stats *hw_stats) { enum i40iw_status_code ret_code; u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1]; @@ -264,7 +264,7 @@ static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; - *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = hw_stats; + *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats; ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); if (ret_code) i40iw_debug(dev, I40IW_DEBUG_VIRT, @@ -539,7 +539,7 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats); spin_unlock_irqrestore(&dev->dev_pestat.stats_lock, flags); vf_dev->msg_count--; - vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, devstat->hw_stats); + vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &devstat->hw_stats); 
break; default: i40iw_debug(dev, I40IW_DEBUG_VIRT, diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f014eaf5969b..b01ef6eee6e8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1601,7 +1601,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) - pr_err("Invalid argumant. Fail to register network rule.\n"); + pr_err("Invalid argument. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 99451d887266..ebdca2b280ab 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -747,14 +747,11 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx __be64 tid, union ib_gid *new_mgid) { - struct mcast_group *group = NULL, *cur_group; + struct mcast_group *group = NULL, *cur_group, *n; struct mcast_req *req; - struct list_head *pos; - struct list_head *n; mutex_lock(&ctx->mcg_table_lock); - list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { - group = list_entry(pos, struct mcast_group, mgid0_list); + list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) { mutex_lock(&group->lock); if (group->last_req_tid == tid) { if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 1eca01cebe51..6c5ac5d8f32f 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -717,9 +717,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index ce0b5aa8eb9b..631272172a0b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -528,9 +528,8 @@ static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx4_ib_mr *mr = to_mmr(ibmr); int rc; @@ -541,7 +540,7 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, sizeof(u64) * mr->max_pages, DMA_TO_DEVICE); - rc = ib_sg_to_pages(ibmr, sg, sg_nents, mlx4_set_page); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->page_map, sizeof(u64) * mr->max_pages, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6ad0489cb3c5..1b04df71f09d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -38,6 +38,9 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/io-mapping.h> +#if defined(CONFIG_X86) +#include <asm/pat.h> +#endif #include <linux/sched.h> #include <rdma/ib_user_verbs.h> #include 
<rdma/ib_addr.h> @@ -517,6 +520,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs)) + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -1068,38 +1075,89 @@ static int get_index(unsigned long offset) return get_arg(offset); } +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + default: + return NULL; + } +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, struct mlx5_uuar_info *uuari) +{ + int err; + unsigned long idx; + phys_addr_t pfn, pa; + pgprot_t prot; + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + vma->vm_page_prot = prot; + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", + err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + return -EAGAIN; + } + + pa = pfn << PAGE_SHIFT; + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), + vma->vm_start, &pa); + + return 0; +} + static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; - unsigned long idx; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) - return -EINVAL; - - pfn = uar_index2pfn(dev, uuari->uars[idx].index); - mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, - (unsigned long long)pfn); - - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); - break; + return uar_mmap(dev, command, vma, uuari); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; @@ -1108,7 +1166,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; - if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + if 
(vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b46c25542a7c..c4a9825828bc 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -70,6 +70,8 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, /* 5 is chosen in order to be compatible with old versions of libmlx5 */ MLX5_IB_MMAP_CORE_CLOCK = 5, }; @@ -356,6 +358,7 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, /* QP uses 1 as its source QP number */ MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, }; struct mlx5_umr_wr { @@ -712,9 +715,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4d5bff151cdf..8cf2ce50511f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1751,26 +1751,33 @@ done: static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, - unsigned short sg_nents) + unsigned short sg_nents, + unsigned int *sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; int i; - mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; mr->ndescs = sg_nents; for_each_sg(sgl, sg, sg_nents, i) { if (unlikely(i > mr->max_descs)) break; - klms[i].va = cpu_to_be64(sg_dma_address(sg)); - klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); klms[i].key = cpu_to_be32(lkey); mr->ibmr.length += sg_dma_len(sg); + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i; } @@ -1788,9 +1795,8 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); int n; @@ -1802,9 +1808,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, DMA_TO_DEVICE); if (mr->access_mode == MLX5_ACCESS_MODE_KLM) - n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); else - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8dee8bc1e0fe..504117657d41 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1028,6 +1028,7 @@ static int get_rq_pas_size(void *qpc) static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, void *qpin) { + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; __be64 *pas; __be64 *qp_pas; void *in; @@ -1051,6 +1052,9 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, end_padding_mode, @@ -1136,11 +1140,12 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + err = create_raw_packet_qp_rq(dev, rq, in); if (err) goto err_destroy_sq; - rq->base.container_mibqp = qp; err = create_raw_packet_qp_tir(dev, rq, tdn); if (err) @@ -1252,6 +1257,19 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EOPNOTSUPP; } + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 6d3a169c049b..37331e2fdc5f 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -44,6 +44,7 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/init.h> +#include 
<linux/kernel.h> #include <asm/io.h> #include <asm/irq.h> @@ -903,70 +904,15 @@ void nes_clc(unsigned long parm) */ void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) { - char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - 'a', 'b', 'c', 'd', 'e', 'f'}; - char *ptr; - char hex_buf[80]; - char ascii_buf[20]; - int num_char; - int num_ascii; - int num_hex; - if (!(nes_debug_level & dump_debug_level)) { return; } - ptr = addr; if (length > 0x100) { nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); length = 0x100; } - nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); - - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - - num_ascii = 0; - num_hex = 0; - for (num_char = 0; num_char < length; num_char++) { - if (num_ascii == 8) { - ascii_buf[num_ascii++] = ' '; - hex_buf[num_hex++] = '-'; - hex_buf[num_hex++] = ' '; - } - - if (*ptr < 0x20 || *ptr > 0x7e) - ascii_buf[num_ascii++] = '.'; - else - ascii_buf[num_ascii++] = *ptr; - hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; - hex_buf[num_hex++] = xlate[*ptr & 0x0f]; - hex_buf[num_hex++] = ' '; - ptr++; - - if (num_ascii >= 17) { - /* output line and reset */ - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - num_ascii = 0; - num_hex = 0; - } - } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length); - /* output the rest */ - if (num_ascii) { - while (num_ascii < 17) { - if (num_ascii == 8) { - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - } - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - num_ascii++; - } - - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - } + print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true); } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fba69a39a7eb..464d6da5fe91 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -402,15 +402,14 @@ static int nes_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -static int nes_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct nes_mr *nesmr = to_nesmr(ibmr); nesmr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, nes_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page); } /** @@ -981,7 +980,7 @@ static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, /** * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. 
*/ -static inline void nes_free_qp_mem(struct nes_device *nesdev, +static void nes_free_qp_mem(struct nes_device *nesdev, struct nes_qp *nesqp, int virt_wqs) { unsigned long flags; @@ -1315,6 +1314,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); return ERR_PTR(-EINVAL); } + init_completion(&nesqp->sq_drained); + init_completion(&nesqp->rq_drained); nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); init_timer(&nesqp->terminate_timer); @@ -3452,6 +3453,29 @@ out: return err; } +/** + * nes_drain_sq - drain sq + * @ibqp: pointer to ibqp + */ +static void nes_drain_sq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head) + wait_for_completion(&nesqp->sq_drained); +} + +/** + * nes_drain_rq - drain rq + * @ibqp: pointer to ibqp + */ +static void nes_drain_rq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head) + wait_for_completion(&nesqp->rq_drained); +} /** * nes_poll_cq @@ -3582,6 +3606,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) } } + if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) { + if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head) + complete(&nesqp->sq_drained); + if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head) + complete(&nesqp->rq_drained); + } + entry->wr_id = wrid; entry++; cqe_count++; @@ -3754,6 +3785,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; nesibdev->ibdev.post_send = nes_post_send; nesibdev->ibdev.post_recv = nes_post_recv; + nesibdev->ibdev.drain_sq = nes_drain_sq; + nesibdev->ibdev.drain_rq = nes_drain_rq; nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); if (nesibdev->ibdev.iwcm == NULL) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 70290883d067..e02a5662dc20 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -189,6 +189,8 @@ struct nes_qp { u8 pau_pending; u8 pau_state; __u64 nesuqp_addr; + struct completion sq_drained; + struct completion rq_drained; }; struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index a8496a18e20d..b1a3d91fe8b9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -3081,13 +3081,12 @@ static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); mr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, ocrdma_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 8b517fd36779..704ef1e9271b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -122,8 +122,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int 
sg_nents); +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); #endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index a53fa5fc0dec..1502199c8e56 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -36,6 +36,27 @@ #include "ipoib.h" +struct ipoib_stats { + char stat_string[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define IPOIB_NETDEV_STAT(m) { \ + .stat_string = #m, \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } + +static const struct ipoib_stats ipoib_gstrings_stats[] = { + IPOIB_NETDEV_STAT(rx_packets), + IPOIB_NETDEV_STAT(tx_packets), + IPOIB_NETDEV_STAT(rx_bytes), + IPOIB_NETDEV_STAT(tx_bytes), + IPOIB_NETDEV_STAT(tx_errors), + IPOIB_NETDEV_STAT(rx_dropped), + IPOIB_NETDEV_STAT(tx_dropped) +}; + +#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@ -92,11 +113,57 @@ static int ipoib_set_coalesce(struct net_device *dev, return 0; } +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + int i; + struct net_device_stats *net_stats = &dev->stats; + u8 *p = (u8 *)net_stats; + + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); + +} +static void ipoib_get_strings(struct net_device __always_unused *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { + memcpy(p, ipoib_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + default: + break; + } +} +static int ipoib_get_sset_count(struct net_device __always_unused *dev, + int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return IPOIB_GLOBAL_STATS_LEN; + case ETH_SS_TEST: + default: + break; + } + return -EOPNOTSUPP; +} static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, }; void ipoib_set_ethtool_ops(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f0e55e47eb54..da5f28c892ca 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -51,8 +51,6 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -static DEFINE_MUTEX(pkey_mutex); - struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9a391cc5b9b3..90be56893414 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -236,7 +236,7 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, page_vec->npages = 0; page_vec->fake_mr.page_size = SIZE_4K; plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, - mem->size, iser_set_page); + mem->size, NULL, iser_set_page); if (unlikely(plen < mem->size)) { iser_err("page vec too short to hold this SG\n"); iser_data_buf_dump(mem, device->ib_device); @@ -446,7 +446,7 @@ static 
int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); - n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K); + n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); if (unlikely(n != mem->size)) { iser_err("failed to map sg (%d/%d)\n", n, mem->size); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 411e4464ca23..897b5a4993e8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -33,7 +33,8 @@ #define ISERT_MAX_CONN 8 #define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) -#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN \ + ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN) #define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \ ISERT_MAX_CONN) @@ -46,14 +47,6 @@ static LIST_HEAD(device_list); static struct workqueue_struct *isert_comp_wq; static struct workqueue_struct *isert_release_wq; -static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); static int isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); static int @@ -142,6 +135,7 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; + attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; attr.cap.max_send_sge = device->ib_device->attrs.max_sge; isert_conn->max_sge = min(device->ib_device->attrs.max_sge, device->ib_device->attrs.max_sge_rd); @@ -270,9 +264,9 @@ isert_alloc_comps(struct isert_device *device) device->ib_device->num_comp_vectors)); isert_info("Using %d CQs, %s supports %d vectors support " - "Fast registration %d pi_capable %d\n", + "pi_capable %d\n", device->comps_used, device->ib_device->name, - device->ib_device->num_comp_vectors, device->use_fastreg, + device->ib_device->num_comp_vectors, device->pi_capable); device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp), @@ -313,18 +307,6 @@ isert_create_device_ib_res(struct isert_device *device) isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); - /* asign function handlers */ - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { - device->use_fastreg = 1; - device->reg_rdma_mem = isert_reg_rdma; - device->unreg_rdma_mem = isert_unreg_rdma; - } else { - device->use_fastreg = 0; - device->reg_rdma_mem = isert_map_rdma; - device->unreg_rdma_mem = isert_unmap_cmd; - } - ret = isert_alloc_comps(device); if (ret) goto out; @@ -417,146 +399,6 @@ isert_device_get(struct rdma_cm_id *cma_id) } static void -isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc, *tmp; - int i = 0; - - if (list_empty(&isert_conn->fr_pool)) - return; - - isert_info("Freeing conn %p fastreg pool", isert_conn); - - list_for_each_entry_safe(fr_desc, tmp, - &isert_conn->fr_pool, list) { - list_del(&fr_desc->list); - ib_dereg_mr(fr_desc->data_mr); - if (fr_desc->pi_ctx) { - ib_dereg_mr(fr_desc->pi_ctx->prot_mr); - 
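The fast-registration descriptor pool being removed here, together with the per-command MR bookkeeping deleted below, is superseded by the core rdma_rw layer. A rough sketch of the replacement lifecycle, using the rdma_rw calls this patch adopts later in the file (function and parameter names simplified, not copied from the driver):

#include <rdma/rw.h>

/* RDMA-READ a remote buffer into @sg, tearing the context down on error. */
static int example_rdma_read(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
			     u8 port_num, struct scatterlist *sg, u32 sg_cnt,
			     u64 remote_addr, u32 rkey, struct ib_cqe *done)
{
	int ret;

	/* Map the local SG list and build the RDMA READ work requests. */
	ret = rdma_rw_ctx_init(ctx, qp, port_num, sg, sg_cnt, 0,
			       remote_addr, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* Post the WR chain; @done fires once the last WR completes. */
	ret = rdma_rw_ctx_post(ctx, qp, port_num, done, NULL);
	if (ret < 0)
		rdma_rw_ctx_destroy(ctx, qp, port_num, sg, sg_cnt,
				    DMA_FROM_DEVICE);
	return ret;
}

On the success path the completion handler performs the matching rdma_rw_ctx_destroy(), as isert_rdma_rw_ctx_destroy() does further down in this patch.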
ib_dereg_mr(fr_desc->pi_ctx->sig_mr); - kfree(fr_desc->pi_ctx); - } - kfree(fr_desc); - ++i; - } - - if (i < isert_conn->fr_pool_size) - isert_warn("Pool still has %d regions registered\n", - isert_conn->fr_pool_size - i); -} - -static int -isert_create_pi_ctx(struct fast_reg_descriptor *desc, - struct ib_device *device, - struct ib_pd *pd) -{ - struct pi_context *pi_ctx; - int ret; - - pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!pi_ctx) { - isert_err("Failed to allocate pi context\n"); - return -ENOMEM; - } - - pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_mr)) { - isert_err("Failed to allocate prot frmr err=%ld\n", - PTR_ERR(pi_ctx->prot_mr)); - ret = PTR_ERR(pi_ctx->prot_mr); - goto err_pi_ctx; - } - desc->ind |= ISERT_PROT_KEY_VALID; - - pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); - if (IS_ERR(pi_ctx->sig_mr)) { - isert_err("Failed to allocate signature enabled mr err=%ld\n", - PTR_ERR(pi_ctx->sig_mr)); - ret = PTR_ERR(pi_ctx->sig_mr); - goto err_prot_mr; - } - - desc->pi_ctx = pi_ctx; - desc->ind |= ISERT_SIG_KEY_VALID; - desc->ind &= ~ISERT_PROTECTED; - - return 0; - -err_prot_mr: - ib_dereg_mr(pi_ctx->prot_mr); -err_pi_ctx: - kfree(pi_ctx); - - return ret; -} - -static int -isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, - struct fast_reg_descriptor *fr_desc) -{ - fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(fr_desc->data_mr)) { - isert_err("Failed to allocate data frmr err=%ld\n", - PTR_ERR(fr_desc->data_mr)); - return PTR_ERR(fr_desc->data_mr); - } - fr_desc->ind |= ISERT_DATA_KEY_VALID; - - isert_dbg("Created fr_desc %p\n", fr_desc); - - return 0; -} - -static int -isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc; - struct isert_device *device = isert_conn->device; - struct se_session *se_sess = isert_conn->conn->sess->se_sess; - struct se_node_acl *se_nacl = se_sess->se_node_acl; - int i, ret, tag_num; - /* - * Setup the number of FRMRs based upon the number of tags - * available to session in iscsi_target_locate_portal(). 
- */ - tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); - tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; - - isert_conn->fr_pool_size = 0; - for (i = 0; i < tag_num; i++) { - fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); - if (!fr_desc) { - isert_err("Failed to allocate fast_reg descriptor\n"); - ret = -ENOMEM; - goto err; - } - - ret = isert_create_fr_desc(device->ib_device, - device->pd, fr_desc); - if (ret) { - isert_err("Failed to create fastreg descriptor err=%d\n", - ret); - kfree(fr_desc); - goto err; - } - - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - isert_conn->fr_pool_size++; - } - - isert_dbg("Creating conn %p fastreg pool size=%d", - isert_conn, isert_conn->fr_pool_size); - - return 0; - -err: - isert_conn_free_fastreg_pool(isert_conn); - return ret; -} - -static void isert_init_conn(struct isert_conn *isert_conn) { isert_conn->state = ISER_CONN_INIT; @@ -565,8 +407,6 @@ isert_init_conn(struct isert_conn *isert_conn) init_completion(&isert_conn->login_req_comp); kref_init(&isert_conn->kref); mutex_init(&isert_conn->mutex); - spin_lock_init(&isert_conn->pool_lock); - INIT_LIST_HEAD(&isert_conn->fr_pool); INIT_WORK(&isert_conn->release_work, isert_release_work); } @@ -739,9 +579,6 @@ isert_connect_release(struct isert_conn *isert_conn) BUG_ON(!device); - if (device->use_fastreg) - isert_conn_free_fastreg_pool(isert_conn); - isert_free_rx_descriptors(isert_conn); if (isert_conn->cm_id) rdma_destroy_id(isert_conn->cm_id); @@ -1080,7 +917,6 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, { struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; - isert_cmd->iser_ib_op = ISER_IB_SEND; tx_desc->tx_cqe.done = isert_send_done; send_wr->wr_cqe = &tx_desc->tx_cqe; @@ -1160,16 +996,6 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, } if (!login->login_failed) { if (login->login_complete) { - if (!conn->sess->sess_ops->SessionType && - isert_conn->device->use_fastreg) { - ret = isert_conn_create_fastreg_pool(isert_conn); - if (ret) { - isert_err("Conn: %p failed to create" - " fastreg pool\n", isert_conn); - return ret; - } - } - ret = isert_alloc_rx_descriptors(isert_conn); if (ret) return ret; @@ -1633,97 +1459,26 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); } -static int -isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct scatterlist *sg, u32 nents, u32 length, u32 offset, - enum iser_ib_op_code op, struct isert_data_buf *data) -{ - struct ib_device *ib_dev = isert_conn->cm_id->device; - - data->dma_dir = op == ISER_IB_RDMA_WRITE ? 
- DMA_TO_DEVICE : DMA_FROM_DEVICE; - - data->len = length - offset; - data->offset = offset; - data->sg_off = data->offset / PAGE_SIZE; - - data->sg = &sg[data->sg_off]; - data->nents = min_t(unsigned int, nents - data->sg_off, - ISCSI_ISER_SG_TABLESIZE); - data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * - PAGE_SIZE); - - data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, - data->dma_dir); - if (unlikely(!data->dma_nents)) { - isert_err("Cmd: unable to dma map SGs %p\n", sg); - return -EINVAL; - } - - isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", - isert_cmd, data->dma_nents, data->sg, data->nents, data->len); - - return 0; -} - static void -isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) +isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn) { - struct ib_device *ib_dev = isert_conn->cm_id->device; - - ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); - memset(data, 0, sizeof(*data)); -} - - - -static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) -{ - isert_dbg("Cmd %p\n", isert_cmd); + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); - } - - if (isert_cmd->rdma_wr) { - isert_dbg("Cmd %p free send_wr\n", isert_cmd); - kfree(isert_cmd->rdma_wr); - isert_cmd->rdma_wr = NULL; - } - - if (isert_cmd->ib_sge) { - isert_dbg("Cmd %p free ib_sge\n", isert_cmd); - kfree(isert_cmd->ib_sge); - isert_cmd->ib_sge = NULL; - } -} - -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) -{ - isert_dbg("Cmd %p\n", isert_cmd); - - if (isert_cmd->fr_desc) { - isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, isert_cmd->fr_desc); - if (isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - spin_lock_bh(&isert_conn->pool_lock); - list_add_tail(&isert_cmd->fr_desc->list, &isert_conn->fr_pool); - spin_unlock_bh(&isert_conn->pool_lock); - isert_cmd->fr_desc = NULL; - } + if (!cmd->rw.nr_ops) + return; - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + if (isert_prot_cmd(conn, se_cmd)) { + rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp, + conn->cm_id->port_num, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->t_prot_sg, + se_cmd->t_prot_nents, dir); + } else { + rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, dir); } - isert_cmd->ib_sge = NULL; - isert_cmd->rdma_wr = NULL; + cmd->rw.nr_ops = 0; } static void @@ -1732,7 +1487,6 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct isert_conn *isert_conn = isert_cmd->conn; struct iscsi_conn *conn = isert_conn->conn; - struct isert_device *device = isert_conn->device; struct iscsi_text_rsp *hdr; isert_dbg("Cmd %p\n", isert_cmd); @@ -1760,7 +1514,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) } } - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); transport_generic_free_cmd(&cmd->se_cmd, 0); break; case ISCSI_OP_SCSI_TMFUNC: @@ -1894,14 +1648,9 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd 
%p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } + ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); - device->unreg_rdma_mem(isert_cmd, isert_conn); - isert_cmd->rdma_wr_num = 0; if (ret) transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0); else @@ -1929,16 +1678,12 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(se_cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - iscsit_stop_dataout_timer(cmd); - device->unreg_rdma_mem(isert_cmd, isert_conn); - cmd->write_data_done = isert_cmd->data.len; - isert_cmd->rdma_wr_num = 0; + + if (isert_prot_cmd(isert_conn, se_cmd)) + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + cmd->write_data_done = 0; isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); spin_lock_bh(&cmd->istate_lock); @@ -2111,7 +1856,6 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) @@ -2120,8 +1864,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) if (cmd->data_direction == DMA_TO_DEVICE) iscsit_stop_dataout_timer(cmd); - - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); } static enum target_prot_op @@ -2274,234 +2017,6 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) return isert_post_response(isert_conn, isert_cmd); } -static int -isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct ib_sge *ib_sge, struct ib_rdma_wr *rdma_wr, - u32 data_left, u32 offset) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct scatterlist *sg_start, *tmp_sg; - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - u32 sg_off, page_off; - int i = 0, sg_nents; - - sg_off = offset / PAGE_SIZE; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); - page_off = offset % PAGE_SIZE; - - rdma_wr->wr.sg_list = ib_sge; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - - /* - * Perform mapping of TCM scatterlist memory ib_sge dma_addr. 
- */ - for_each_sg(sg_start, tmp_sg, sg_nents, i) { - isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, " - "page_off: %u\n", - (unsigned long long)tmp_sg->dma_address, - tmp_sg->length, page_off); - - ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; - ib_sge->length = min_t(u32, data_left, - ib_sg_dma_len(ib_dev, tmp_sg) - page_off); - ib_sge->lkey = device->pd->local_dma_lkey; - - isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", - ib_sge->addr, ib_sge->length, ib_sge->lkey); - page_off = 0; - data_left -= ib_sge->length; - if (!data_left) - break; - ib_sge++; - isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge); - } - - rdma_wr->wr.num_sge = ++i; - isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", - rdma_wr->wr.sg_list, rdma_wr->wr.num_sge); - - return rdma_wr->wr.num_sge; -} - -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct isert_data_buf *data = &isert_cmd->data; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sge; - u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; - int ret = 0, i, ib_sge_cnt; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - data_left = data->len; - offset = data->offset; - - ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); - if (!ib_sge) { - isert_warn("Unable to allocate ib_sge\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - isert_cmd->ib_sge = ib_sge; - - isert_cmd->rdma_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); - isert_cmd->rdma_wr = kzalloc(sizeof(struct ib_rdma_wr) * - isert_cmd->rdma_wr_num, GFP_KERNEL); - if (!isert_cmd->rdma_wr) { - isert_dbg("Unable to allocate isert_cmd->rdma_wr\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - - rdma_write_max = isert_conn->max_sge * PAGE_SIZE; - - for (i = 0; i < isert_cmd->rdma_wr_num; i++) { - rdma_wr = &isert_cmd->rdma_wr[i]; - data_len = min(data_left, rdma_write_max); - - rdma_wr->wr.send_flags = 0; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va + offset; - rdma_wr->rkey = isert_cmd->read_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.next = &isert_cmd->tx_desc.send_wr; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va + va_offset; - rdma_wr->rkey = isert_cmd->write_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } - - ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, - rdma_wr, data_len, offset); - ib_sge += ib_sge_cnt; - - offset += data_len; - va_offset += data_len; - data_left -= data_len; - } - - return 0; -unmap_cmd: - isert_unmap_data_buf(isert_conn, data); - - return ret; -} - -static inline void -isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) -{ - u32 rkey; - - memset(inv_wr, 0, sizeof(*inv_wr)); - inv_wr->wr_cqe = NULL; - inv_wr->opcode = IB_WR_LOCAL_INV; - 
inv_wr->ex.invalidate_rkey = mr->rkey; - - /* Bump the key */ - rkey = ib_inc_rkey(mr->rkey); - ib_update_fast_reg_key(mr, rkey); -} - -static int -isert_fast_reg_mr(struct isert_conn *isert_conn, - struct fast_reg_descriptor *fr_desc, - struct isert_data_buf *mem, - enum isert_indicator ind, - struct ib_sge *sge) -{ - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - struct ib_mr *mr; - struct ib_reg_wr reg_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - int ret, n; - - if (mem->dma_nents == 1) { - sge->lkey = device->pd->local_dma_lkey; - sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); - sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - return 0; - } - - if (ind == ISERT_DATA_KEY_VALID) - /* Registering data buffer */ - mr = fr_desc->data_mr; - else - /* Registering protection buffer */ - mr = fr_desc->pi_ctx->prot_mr; - - if (!(fr_desc->ind & ind)) { - isert_inv_rkey(&inv_wr, mr); - wr = &inv_wr; - } - - n = ib_map_mr_sg(mr, mem->sg, mem->nents, PAGE_SIZE); - if (unlikely(n != mem->nents)) { - isert_err("failed to map mr sg (%d/%d)\n", - n, mem->nents); - return n < 0 ? n : -EINVAL; - } - - isert_dbg("Use fr_desc %p sg_nents %d offset %u\n", - fr_desc, mem->nents, mem->offset); - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_cqe = NULL; - reg_wr.wr.send_flags = 0; - reg_wr.wr.num_sge = 0; - reg_wr.mr = mr; - reg_wr.key = mr->lkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE; - - if (!wr) - wr = &reg_wr.wr; - else - wr->next = &reg_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - return ret; - } - fr_desc->ind &= ~ind; - - sge->lkey = mr->lkey; - sge->addr = mr->iova; - sge->length = mr->length; - - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - - return ret; -} - static inline void isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, struct ib_sig_domain *domain) @@ -2526,6 +2041,8 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, static int isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) { + memset(sig_attrs, 0, sizeof(*sig_attrs)); + switch (se_cmd->prot_op) { case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: @@ -2547,228 +2064,59 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) return -EINVAL; } + sig_attrs->check_mask = + (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); return 0; } -static inline u8 -isert_set_prot_checks(u8 prot_checks) -{ - return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 
0x0f : 0); -} - -static int -isert_reg_sig_mr(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd, - struct fast_reg_descriptor *fr_desc) -{ - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; - struct ib_sig_handover_wr sig_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - struct pi_context *pi_ctx = fr_desc->pi_ctx; - struct ib_sig_attrs sig_attrs; - int ret; - - memset(&sig_attrs, 0, sizeof(sig_attrs)); - ret = isert_set_sig_attrs(se_cmd, &sig_attrs); - if (ret) - goto err; - - sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); - - if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { - isert_inv_rkey(&inv_wr, pi_ctx->sig_mr); - wr = &inv_wr; - } - - memset(&sig_wr, 0, sizeof(sig_wr)); - sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - sig_wr.wr.wr_cqe = NULL; - sig_wr.wr.sg_list = &isert_cmd->ib_sg[DATA]; - sig_wr.wr.num_sge = 1; - sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - sig_wr.sig_attrs = &sig_attrs; - sig_wr.sig_mr = pi_ctx->sig_mr; - if (se_cmd->t_prot_sg) - sig_wr.prot = &isert_cmd->ib_sg[PROT]; - - if (!wr) - wr = &sig_wr.wr; - else - wr->next = &sig_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - goto err; - } - fr_desc->ind &= ~ISERT_SIG_KEY_VALID; - - isert_cmd->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey; - isert_cmd->ib_sg[SIG].addr = 0; - isert_cmd->ib_sg[SIG].length = se_cmd->data_length; - if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && - se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) - /* - * We have protection guards on the wire - * so we need to set a larget transfer - */ - isert_cmd->ib_sg[SIG].length += se_cmd->prot_length; - - isert_dbg("sig_sge: addr: 0x%llx length: %u lkey: %x\n", - isert_cmd->ib_sg[SIG].addr, isert_cmd->ib_sg[SIG].length, - isert_cmd->ib_sg[SIG].lkey); -err: - return ret; -} - static int -isert_handle_prot_cmd(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd) -{ - struct isert_device *device = isert_conn->device; - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; +isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + u8 port_num = conn->cm_id->port_num; + u64 addr; + u32 rkey, offset; int ret; - if (!isert_cmd->fr_desc->pi_ctx) { - ret = isert_create_pi_ctx(isert_cmd->fr_desc, - device->ib_device, - device->pd); - if (ret) { - isert_err("conn %p failed to allocate pi_ctx\n", - isert_conn); - return ret; - } - } - - if (se_cmd->t_prot_sg) { - ret = isert_map_data_buf(isert_conn, isert_cmd, - se_cmd->t_prot_sg, - se_cmd->t_prot_nents, - se_cmd->prot_length, - 0, - isert_cmd->iser_ib_op, - &isert_cmd->prot); - if (ret) { - isert_err("conn %p failed to map protection buffer\n", - isert_conn); - return ret; - } - - memset(&isert_cmd->ib_sg[PROT], 0, sizeof(isert_cmd->ib_sg[PROT])); - ret = isert_fast_reg_mr(isert_conn, isert_cmd->fr_desc, - &isert_cmd->prot, - ISERT_PROT_KEY_VALID, - &isert_cmd->ib_sg[PROT]); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - } - - ret = isert_reg_sig_mr(isert_conn, isert_cmd, isert_cmd->fr_desc); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - isert_cmd->fr_desc->ind |= ISERT_PROTECTED; - - return 0; - -unmap_prot_cmd: - if (se_cmd->t_prot_sg) - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - - return 
ret; -} - -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct fast_reg_descriptor *fr_desc = NULL; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sg; - u32 offset; - int ret = 0; - unsigned long flags; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - if (isert_cmd->data.dma_nents != 1 || - isert_prot_cmd(isert_conn, se_cmd)) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - fr_desc = list_first_entry(&isert_conn->fr_pool, - struct fast_reg_descriptor, list); - list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); - isert_cmd->fr_desc = fr_desc; - } - - ret = isert_fast_reg_mr(isert_conn, fr_desc, &isert_cmd->data, - ISERT_DATA_KEY_VALID, &isert_cmd->ib_sg[DATA]); - if (ret) - goto unmap_cmd; - - if (isert_prot_cmd(isert_conn, se_cmd)) { - ret = isert_handle_prot_cmd(isert_conn, isert_cmd); - if (ret) - goto unmap_cmd; - - ib_sg = &isert_cmd->ib_sg[SIG]; + if (dir == DMA_FROM_DEVICE) { + addr = cmd->write_va; + rkey = cmd->write_stag; + offset = cmd->iscsi_cmd->write_data_done; } else { - ib_sg = &isert_cmd->ib_sg[DATA]; + addr = cmd->read_va; + rkey = cmd->read_stag; + offset = 0; } - memcpy(&isert_cmd->s_ib_sge, ib_sg, sizeof(*ib_sg)); - isert_cmd->ib_sge = &isert_cmd->s_ib_sge; - isert_cmd->rdma_wr_num = 1; - memset(&isert_cmd->s_rdma_wr, 0, sizeof(isert_cmd->s_rdma_wr)); - isert_cmd->rdma_wr = &isert_cmd->s_rdma_wr; + if (isert_prot_cmd(conn, se_cmd)) { + struct ib_sig_attrs sig_attrs; - rdma_wr = &isert_cmd->s_rdma_wr; - rdma_wr->wr.sg_list = &isert_cmd->s_ib_sge; - rdma_wr->wr.num_sge = 1; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + return ret; - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va; - rdma_wr->rkey = isert_cmd->read_stag; - rdma_wr->wr.send_flags = !isert_prot_cmd(isert_conn, se_cmd) ? 
- 0 : IB_SEND_SIGNALED; + WARN_ON_ONCE(offset); + ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + se_cmd->t_prot_sg, se_cmd->t_prot_nents, + &sig_attrs, addr, rkey, dir); } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va; - rdma_wr->rkey = isert_cmd->write_stag; - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; + ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + offset, addr, rkey, dir); } - - return 0; - -unmap_cmd: - if (fr_desc) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); + if (ret < 0) { + isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); + return ret; } - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); + if (ret < 0) + isert_err("Cmd: %p failed to post RDMA res\n", cmd); return ret; } @@ -2778,21 +2126,17 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; + struct ib_cqe *cqe = NULL; + struct ib_send_wr *chain_wr = NULL; int rc; isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n", isert_cmd, se_cmd->data_length); - isert_cmd->iser_ib_op = ISER_IB_RDMA_WRITE; - rc = device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } - - if (!isert_prot_cmd(isert_conn, se_cmd)) { + if (isert_prot_cmd(isert_conn, se_cmd)) { + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + cqe = &isert_cmd->tx_desc.tx_cqe; + } else { /* * Build isert_conn->tx_desc for iSCSI response PDU and attach */ @@ -2803,56 +2147,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, &isert_cmd->tx_desc.send_wr); - isert_cmd->s_rdma_wr.wr.next = &isert_cmd->tx_desc.send_wr; - isert_cmd->rdma_wr_num += 1; rc = isert_post_recv(isert_conn, isert_cmd->rx_desc); if (rc) { isert_err("ib_post_recv failed with %d\n", rc); return rc; } - } - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); - - if (!isert_prot_cmd(isert_conn, se_cmd)) - isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data " - "READ\n", isert_cmd); - else - isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", - isert_cmd); + chain_wr = &isert_cmd->tx_desc.send_wr; + } + isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd); return 1; } static int isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) { - struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; - int rc; isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", - isert_cmd, se_cmd->data_length, cmd->write_data_done); - isert_cmd->iser_ib_op = ISER_IB_RDMA_READ; - rc = 
device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } + isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; + isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", isert_cmd); - return 0; } diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 147900cbb578..e512ba941f2f 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -3,6 +3,7 @@ #include <linux/in6.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <rdma/rw.h> #include <scsi/iser.h> @@ -53,10 +54,7 @@ #define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) -#define ISERT_INFLIGHT_DATAOUTS 8 - -#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ - (1 + ISERT_INFLIGHT_DATAOUTS) + \ +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ ISERT_MAX_TX_MISC_PDUS + \ ISERT_MAX_RX_MISC_PDUS) @@ -71,13 +69,6 @@ enum isert_desc_type { ISCSI_TX_DATAIN }; -enum iser_ib_op_code { - ISER_IB_RECV, - ISER_IB_SEND, - ISER_IB_RDMA_WRITE, - ISER_IB_RDMA_READ, -}; - enum iser_conn_state { ISER_CONN_INIT, ISER_CONN_UP, @@ -118,42 +109,6 @@ static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe) return container_of(cqe, struct iser_tx_desc, tx_cqe); } - -enum isert_indicator { - ISERT_PROTECTED = 1 << 0, - ISERT_DATA_KEY_VALID = 1 << 1, - ISERT_PROT_KEY_VALID = 1 << 2, - ISERT_SIG_KEY_VALID = 1 << 3, -}; - -struct pi_context { - struct ib_mr *prot_mr; - struct ib_mr *sig_mr; -}; - -struct fast_reg_descriptor { - struct list_head list; - struct ib_mr *data_mr; - u8 ind; - struct pi_context *pi_ctx; -}; - -struct isert_data_buf { - struct scatterlist *sg; - int nents; - u32 sg_off; - u32 len; /* cur_rdma_length */ - u32 offset; - unsigned int dma_nents; - enum dma_data_direction dma_dir; -}; - -enum { - DATA = 0, - PROT = 1, - SIG = 2, -}; - struct isert_cmd { uint32_t read_stag; uint32_t write_stag; @@ -166,16 +121,7 @@ struct isert_cmd { struct iscsi_cmd *iscsi_cmd; struct iser_tx_desc tx_desc; struct iser_rx_desc *rx_desc; - enum iser_ib_op_code iser_ib_op; - struct ib_sge *ib_sge; - struct ib_sge s_ib_sge; - int rdma_wr_num; - struct ib_rdma_wr *rdma_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_sge ib_sg[3]; - struct isert_data_buf data; - struct isert_data_buf prot; - struct fast_reg_descriptor *fr_desc; + struct rdma_rw_ctx rw; struct work_struct comp_work; struct scatterlist sg; }; @@ -210,10 +156,6 @@ struct isert_conn { struct isert_device *device; struct mutex mutex; struct kref kref; - struct list_head fr_pool; - int fr_pool_size; - /* lock to protect fastreg pool */ - spinlock_t pool_lock; struct work_struct release_work; bool logout_posted; bool snd_w_inv; @@ -236,7 +178,6 @@ struct isert_comp { }; struct isert_device { - int use_fastreg; bool pi_capable; int refcount; struct ib_device *ib_device; @@ -244,10 +185,6 @@ struct isert_device { struct isert_comp *comps; int comps_used; struct list_head dev_node; - int (*reg_rdma_mem)(struct isert_cmd *isert_cmd, - struct iscsi_conn *conn); - void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, - struct isert_conn *isert_conn); }; struct isert_np { diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 427dee3a3ab0..7ecb1970cb87 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries; static bool allow_ext_sg; static bool prefer_fr = true; static bool register_always = true; +static bool never_register; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -99,6 +100,9 @@ module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); +module_param(never_register, bool, 0444); +MODULE_PARM_DESC(never_register, "Never register memory"); + static const struct kernel_param_ops srp_tmo_ops; static int srp_reconnect_delay = 10; @@ -447,16 +451,16 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) /** * srp_destroy_qp() - destroy an RDMA queue pair - * @ch: SRP RDMA channel. + * @qp: RDMA queue pair. * * Drain the qp before destroying it. This avoids that the receive * completion handler can access the queue pair while it is * being destroyed. */ -static void srp_destroy_qp(struct srp_rdma_ch *ch) +static void srp_destroy_qp(struct ib_qp *qp) { - ib_drain_rq(ch->qp); - ib_destroy_qp(ch->qp); + ib_drain_rq(qp); + ib_destroy_qp(qp); } static int srp_create_ch_ib(struct srp_rdma_ch *ch) @@ -468,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_qp *qp; struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; - const int m = dev->use_fast_reg ? 3 : 1; + const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); @@ -529,7 +533,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) } if (ch->qp) - srp_destroy_qp(ch); + srp_destroy_qp(ch->qp); if (ch->recv_cq) ib_free_cq(ch->recv_cq); if (ch->send_cq) @@ -553,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - srp_destroy_qp(ch); + srp_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); @@ -596,7 +600,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, ib_destroy_fmr_pool(ch->fmr_pool); } - srp_destroy_qp(ch); + srp_destroy_qp(ch->qp); ib_free_cq(ch->send_cq); ib_free_cq(ch->recv_cq); @@ -849,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) for (i = 0; i < target->req_ring_size; ++i) { req = &ch->req_ring[i]; - mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + mr_list = kmalloc(target->mr_per_cmd * sizeof(void *), GFP_KERNEL); if (!mr_list) goto out; @@ -1298,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) srp_handle_qp_err(cq, wc, "FAST REG"); } +/* + * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset + * where to start in the first element. If sg_offset_p != NULL then + * *sg_offset_p is updated to the offset in state->sg[retval] of the first + * byte that has not yet been mapped. 
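The contract just described is the new offset-aware ib_map_mr_sg() interface this series introduces. A hedged sketch of a caller-side loop built on that contract, mirroring srp_map_sg_fr() below (helper and variable names are illustrative):

#include <rdma/ib_verbs.h>
#include <linux/scatterlist.h>

/* Register @sg_nents entries through a pool of MRs, one chunk per MR. */
static int example_map_all(struct ib_mr **mrs, int nr_mrs,
			   struct scatterlist *sg, int sg_nents,
			   unsigned int page_size)
{
	unsigned int offset = 0;
	int i, n, mr = 0;

	while (sg_nents && mr < nr_mrs) {
		/* Map as much as fits into one MR, resuming at @offset. */
		n = ib_map_mr_sg(mrs[mr++], sg, sg_nents, &offset, page_size);
		if (n < 0)
			return n;

		/*
		 * @n entries were consumed; @offset now points at the first
		 * unmapped byte of the entry the next MR resumes from.
		 */
		for (i = 0; i < n; i++)
			sg = sg_next(sg);
		sg_nents -= n;
	}
	/* Out of MRs before the SG list was fully registered. */
	return sg_nents ? -ENOMEM : 0;
}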
+ */ static int srp_map_finish_fr(struct srp_map_state *state, struct srp_request *req, - struct srp_rdma_ch *ch, int sg_nents) + struct srp_rdma_ch *ch, int sg_nents, + unsigned int *sg_offset_p) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; @@ -1316,9 +1327,13 @@ static int srp_map_finish_fr(struct srp_map_state *state, WARN_ON_ONCE(!dev->use_fast_reg); if (sg_nents == 1 && target->global_mr) { - srp_map_desc(state, sg_dma_address(state->sg), - sg_dma_len(state->sg), + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + + srp_map_desc(state, sg_dma_address(state->sg) + sg_offset, + sg_dma_len(state->sg) - sg_offset, target->global_mr->rkey); + if (sg_offset_p) + *sg_offset_p = 0; return 1; } @@ -1329,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state, rkey = ib_inc_rkey(desc->mr->rkey); ib_update_fast_reg_key(desc->mr, rkey); - n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size); - if (unlikely(n < 0)) + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p, + dev->mr_page_size); + if (unlikely(n < 0)) { + srp_fr_pool_put(ch->fr_pool, &desc, 1); + pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n", + dev_name(&req->scmnd->device->sdev_gendev), sg_nents, + sg_offset_p ? *sg_offset_p : -1, n); return n; + } + + WARN_ON_ONCE(desc->mr->length == 0); req->reg_cqe.done = srp_reg_mr_err_done; @@ -1353,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state, desc->mr->length, desc->mr->rkey); err = ib_post_send(ch->qp, &wr.wr, &bad_wr); - if (unlikely(err)) + if (unlikely(err)) { + WARN_ON_ONCE(err == -ENOMEM); return err; + } return n; } @@ -1411,7 +1436,7 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, state->pages = req->map_page; state->fmr.next = req->fmr_list; - state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt; + state->fmr.end = req->fmr_list + ch->target->mr_per_cmd; for_each_sg(scat, sg, count, i) { ret = srp_map_sg_entry(state, ch, sg, i); @@ -1430,9 +1455,11 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct srp_request *req, struct scatterlist *scat, int count) { + unsigned int sg_offset = 0; + state->desc = req->indirect_desc; state->fr.next = req->fr_list; - state->fr.end = req->fr_list + ch->target->cmd_sg_cnt; + state->fr.end = req->fr_list + ch->target->mr_per_cmd; state->sg = scat; if (count == 0) @@ -1441,7 +1468,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, while (count) { int i, n; - n = srp_map_finish_fr(state, req, ch, count); + n = srp_map_finish_fr(state, req, ch, count, &sg_offset); if (unlikely(n < 0)) return n; @@ -1501,14 +1528,15 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, if (dev->use_fast_reg) { state.sg = idb_sg; - sg_set_buf(idb_sg, req->indirect_desc, idb_len); + sg_init_one(idb_sg, req->indirect_desc, idb_len); idb_sg->dma_address = req->indirect_dma_addr; /* hack! 
*/ #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, req, ch, 1); + ret = srp_map_finish_fr(&state, req, ch, 1, NULL); if (ret < 0) return ret; + WARN_ON_ONCE(ret < 1); } else if (dev->use_fmr) { state.pages = idb_pages; state.pages[0] = (req->indirect_dma_addr & @@ -1526,6 +1554,32 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, return 0; } +#if defined(DYNAMIC_DEBUG) +static void srp_check_mapping(struct srp_map_state *state, + struct srp_rdma_ch *ch, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = ch->target->srp_host->srp_dev; + struct srp_fr_desc **pfr; + u64 desc_len = 0, mr_len = 0; + int i; + + for (i = 0; i < state->ndesc; i++) + desc_len += be32_to_cpu(req->indirect_desc[i].len); + if (dev->use_fast_reg) + for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) + mr_len += (*pfr)->mr->length; + else if (dev->use_fmr) + for (i = 0; i < state->nmdesc; i++) + mr_len += be32_to_cpu(req->indirect_desc[i].len); + if (desc_len != scsi_bufflen(req->scmnd) || + mr_len > scsi_bufflen(req->scmnd)) + pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", + scsi_bufflen(req->scmnd), desc_len, mr_len, + state->ndesc, state->nmdesc); +} +#endif + /** * srp_map_data() - map SCSI data buffer onto an SRP request * @scmnd: SCSI command to map @@ -1611,6 +1665,15 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, if (ret < 0) goto unmap; +#if defined(DYNAMIC_DEBUG) + { + DEFINE_DYNAMIC_DEBUG_METADATA(ddm, + "Memory mapping consistency check"); + if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT)) + srp_check_mapping(&state, ch, req, scat, count); + } +#endif + /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this * target is not using an external indirect table, we are @@ -2575,6 +2638,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; } +static int srp_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + + if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, + ~srp_dev->mr_page_mask); + + return 0; +} + static int srp_slave_configure(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; @@ -2766,6 +2843,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .slave_alloc = srp_slave_alloc, .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, @@ -2840,7 +2918,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) goto out; } - pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n", + pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", dev_name(&target->scsi_host->shost_gendev), srp_sdev_count(target->scsi_host)); @@ -3172,6 +3250,7 @@ static ssize_t srp_create_target(struct device *dev, struct srp_device *srp_dev = host->srp_dev; struct ib_device *ibdev = srp_dev->dev; int ret, node_idx, node, cpu, i; + unsigned int max_sectors_per_mr, mr_per_cmd = 0; bool multich = false; target_host = scsi_host_alloc(&srp_template, @@ -3228,8 +3307,33 @@ static ssize_t srp_create_target(struct device *dev, target->sg_tablesize = target->cmd_sg_cnt; } + if (srp_dev->use_fast_reg || srp_dev->use_fmr) { + /* + * FR and FMR can only map one HCA page per entry. If the + * start address is not aligned on a HCA page boundary two + * entries will be used for the head and the tail although + * these two entries combined contain at most one HCA page of + * data. Hence the "+ 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous so the + * memory for that buffer will only be registered if + * register_always is true. Hence add one to mr_per_cmd if + * register_always has been set. 
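A worked instance of the sizing rule spelled out above, under assumed values rather than anything taken from this patch: with mr_page_size = 4096 and max_pages_per_mr = 256, the code below yields max_sectors_per_mr = 256 << (ilog2(4096) - 9) = 256 << 3 = 2048 sectors, i.e. 1 MiB per MR. For scsi_host->max_sectors = 1024 with register_always set, mr_per_cmd = 1 + (1024 + 1 + 2048 - 1) / 2048 = 1 + 1 = 2, and the MR pool is later sized as can_queue * mr_per_cmd.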
+ */ + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, + srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); + } + target_host->sg_tablesize = target->sg_tablesize; - target->mr_pool_size = target->scsi_host->can_queue; + target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; + target->mr_per_cmd = mr_per_cmd; target->indirect_size = target->sg_tablesize * sizeof (struct srp_direct_buf); target->max_iu_len = sizeof (struct srp_cmd) + @@ -3426,17 +3530,6 @@ static void srp_add_one(struct ib_device *device) if (!srp_dev) return; - srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - srp_dev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - if (!srp_dev->has_fmr && !srp_dev->has_fr) - dev_warn(&device->dev, "neither FMR nor FR is supported\n"); - - srp_dev->use_fast_reg = (srp_dev->has_fr && - (!srp_dev->has_fmr || prefer_fr)); - srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; - /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists @@ -3447,8 +3540,25 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); max_pages_per_mr = device->attrs.max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); + pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, + device->attrs.max_mr_size, srp_dev->mr_page_size, + max_pages_per_mr, SRP_MAX_PAGES_PER_MR); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); + + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) { + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + } else if (!never_register && + device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) { + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; + } + if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, @@ -3468,7 +3578,8 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->pd)) goto free_dev; - if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) { + if (never_register || !register_always || + (!srp_dev->has_fmr && !srp_dev->has_fr)) { srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd, IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index a00914cdd44e..26bb9b0a7a63 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -203,6 +203,7 @@ struct srp_target_port { unsigned int scsi_id; unsigned int sg_tablesize; int mr_pool_size; + int mr_per_cmd; int queue_size; int req_ring_size; int comp_vector; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 8b42401d4795..2843f1ae75bd 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ 
b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -765,52 +765,6 @@ static int srpt_post_recv(struct srpt_device *sdev, } /** - * srpt_post_send() - Post an IB send request. - * - * Returns zero upon success and a non-zero value upon failure. - */ -static int srpt_post_send(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, int len) -{ - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - struct srpt_device *sdev = ch->sport->sdev; - int ret; - - atomic_inc(&ch->req_lim); - - ret = -ENOMEM; - if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { - pr_warn("IB send queue full (needed 1)\n"); - goto out; - } - - ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, - DMA_TO_DEVICE); - - list.addr = ioctx->ioctx.dma; - list.length = len; - list.lkey = sdev->pd->local_dma_lkey; - - ioctx->ioctx.cqe.done = srpt_send_done; - wr.next = NULL; - wr.wr_cqe = &ioctx->ioctx.cqe; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - ret = ib_post_send(ch->qp, &wr, &bad_wr); - -out: - if (ret < 0) { - atomic_inc(&ch->sq_wr_avail); - atomic_dec(&ch->req_lim); - } - return ret; -} - -/** * srpt_zerolength_write() - Perform a zero-length RDMA write. * * A quote from the InfiniBand specification: C9-88: For an HCA responder @@ -843,6 +797,110 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) } } +static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx, + struct srp_direct_buf *db, int nbufs, struct scatterlist **sg, + unsigned *sg_cnt) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct scatterlist *prev = NULL; + unsigned prev_nents; + int ret, i; + + if (nbufs == 1) { + ioctx->rw_ctxs = &ioctx->s_rw_ctx; + } else { + ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs), + GFP_KERNEL); + if (!ioctx->rw_ctxs) + return -ENOMEM; + } + + for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + u64 remote_addr = be64_to_cpu(db->va); + u32 size = be32_to_cpu(db->len); + u32 rkey = be32_to_cpu(db->key); + + ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false, + i < nbufs - 1); + if (ret) + goto unwind; + + ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, 0, remote_addr, rkey, dir); + if (ret < 0) { + target_free_sgl(ctx->sg, ctx->nents); + goto unwind; + } + + ioctx->n_rdma += ret; + ioctx->n_rw_ctx++; + + if (prev) { + sg_unmark_end(&prev[prev_nents - 1]); + sg_chain(prev, prev_nents + 1, ctx->sg); + } else { + *sg = ctx->sg; + } + + prev = ctx->sg; + prev_nents = ctx->nents; + + *sg_cnt += ctx->nents; + } + + return 0; + +unwind: + while (--i >= 0) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); + return ret; +} + +static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + int i; + + for (i = 0; i < ioctx->n_rw_ctx; i++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); +} + +static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) +{ + /* + 
+static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd)
+{
+        /*
+         * The pointer computations below will only be compiled correctly
+         * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+         * whether srp_cmd::add_data has been declared as a byte pointer.
+         */
+        BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) &&
+                     !__same_type(srp_cmd->add_data[0], (u8)0));
+
+        /*
+         * According to the SRP spec, the lower two bits of the 'ADDITIONAL
+         * CDB LENGTH' field are reserved and the size in bytes of this field
+         * is four times the value specified in bits 3..7. Hence the "& ~3".
+         */
+        return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3);
+}
+
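/*
 * Illustrative aside (not part of this patch): srpt_get_desc_tbl() below
 * decodes the two descriptor-format nibbles that SRP packs into the single
 * buf_fmt byte. A hypothetical standalone decoder:
 */
static inline void example_decode_buf_fmt(u8 buf_fmt, u8 *data_in_fmt,
                u8 *data_out_fmt)
{
        *data_in_fmt = buf_fmt & 0xf;   /* bits 0..3: DATA-IN descriptor */
        *data_out_fmt = buf_fmt >> 4;   /* bits 4..7: DATA-OUT descriptor */
        /* Each nibble is SRP_DATA_DESC_DIRECT, _INDIRECT or 0 (no data). */
}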
 /**
  * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request.
  * @ioctx: Pointer to the I/O context associated with the request.
@@ -858,94 +916,59 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
  * -ENOMEM when memory allocation fails and zero upon success.
  */
 static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
-                             struct srp_cmd *srp_cmd,
-                             enum dma_data_direction *dir, u64 *data_len)
+                struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
+                struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len)
 {
-        struct srp_indirect_buf *idb;
-        struct srp_direct_buf *db;
-        unsigned add_cdb_offset;
-        int ret;
-
-        /*
-         * The pointer computations below will only be compiled correctly
-         * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
-         * whether srp_cmd::add_data has been declared as a byte pointer.
-         */
-        BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
-                     && !__same_type(srp_cmd->add_data[0], (u8)0));
-
         BUG_ON(!dir);
         BUG_ON(!data_len);
 
-        ret = 0;
-        *data_len = 0;
-
         /*
          * The lower four bits of the buffer format field contain the DATA-IN
          * buffer descriptor format, and the highest four bits contain the
          * DATA-OUT buffer descriptor format.
          */
-        *dir = DMA_NONE;
         if (srp_cmd->buf_fmt & 0xf)
                 /* DATA-IN: transfer data from target to initiator (read). */
                 *dir = DMA_FROM_DEVICE;
         else if (srp_cmd->buf_fmt >> 4)
                 /* DATA-OUT: transfer data from initiator to target (write). */
                 *dir = DMA_TO_DEVICE;
+        else
+                *dir = DMA_NONE;
+
+        /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */
+        ioctx->cmd.data_direction = *dir;
 
-        /*
-         * According to the SRP spec, the lower two bits of the 'ADDITIONAL
-         * CDB LENGTH' field are reserved and the size in bytes of this field
-         * is four times the value specified in bits 3..7. Hence the "& ~3".
-         */
-        add_cdb_offset = srp_cmd->add_cdb_len & ~3;
         if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
             ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
-                ioctx->n_rbuf = 1;
-                ioctx->rbufs = &ioctx->single_rbuf;
+                struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
 
-                db = (struct srp_direct_buf *)(srp_cmd->add_data
-                                               + add_cdb_offset);
-                memcpy(ioctx->rbufs, db, sizeof(*db));
                 *data_len = be32_to_cpu(db->len);
+                return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt);
         } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
                    ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
-                idb = (struct srp_indirect_buf *)(srp_cmd->add_data
-                                                  + add_cdb_offset);
+                struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd);
+                int nbufs = be32_to_cpu(idb->table_desc.len) /
+                                sizeof(struct srp_direct_buf);
 
-                ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db);
-
-                if (ioctx->n_rbuf >
+                if (nbufs >
                     (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
                         pr_err("received unsupported SRP_CMD request"
                                " type (%u out + %u in != %u / %zu)\n",
                                srp_cmd->data_out_desc_cnt,
                                srp_cmd->data_in_desc_cnt,
                                be32_to_cpu(idb->table_desc.len),
-                               sizeof(*db));
-                        ioctx->n_rbuf = 0;
-                        ret = -EINVAL;
-                        goto out;
-                }
-
-                if (ioctx->n_rbuf == 1)
-                        ioctx->rbufs = &ioctx->single_rbuf;
-                else {
-                        ioctx->rbufs =
-                                kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC);
-                        if (!ioctx->rbufs) {
-                                ioctx->n_rbuf = 0;
-                                ret = -ENOMEM;
-                                goto out;
-                        }
+                               sizeof(struct srp_direct_buf));
+                        return -EINVAL;
                 }
 
-                db = idb->desc_list;
-                memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db));
                 *data_len = be32_to_cpu(idb->len);
+                return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs,
+                                sg, sg_cnt);
+        } else {
+                *data_len = 0;
+                return 0;
         }
-out:
-        return ret;
 }
 
 /**
@@ -1049,217 +1072,6 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
 }
 
 /**
- * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list.
- */
-static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
-                                    struct srpt_send_ioctx *ioctx)
-{
-        struct scatterlist *sg;
-        enum dma_data_direction dir;
-
-        BUG_ON(!ch);
-        BUG_ON(!ioctx);
-        BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs);
-
-        while (ioctx->n_rdma)
-                kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list);
-
-        kfree(ioctx->rdma_wrs);
-        ioctx->rdma_wrs = NULL;
-
-        if (ioctx->mapped_sg_count) {
-                sg = ioctx->sg;
-                WARN_ON(!sg);
-                dir = ioctx->cmd.data_direction;
-                BUG_ON(dir == DMA_NONE);
-                ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt,
-                                target_reverse_dma_direction(&ioctx->cmd));
-                ioctx->mapped_sg_count = 0;
-        }
-}
-
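/*
 * Illustrative sketch (not part of this patch): the indirect case handled
 * by srpt_get_desc_tbl() above is simply a table of direct descriptors.
 * A hypothetical walker, with 'handle_one_buf' supplied by the caller:
 */
static int example_walk_indirect(struct srp_indirect_buf *idb,
                int (*handle_one_buf)(u64 va, u32 len, u32 key))
{
        int nbufs = be32_to_cpu(idb->table_desc.len) /
                        sizeof(struct srp_direct_buf);
        int i, ret;

        for (i = 0; i < nbufs; i++) {
                struct srp_direct_buf *db = &idb->desc_list[i];

                /* Descriptor fields arrive in big-endian wire order. */
                ret = handle_one_buf(be64_to_cpu(db->va),
                                be32_to_cpu(db->len), be32_to_cpu(db->key));
                if (ret)
                        return ret;
        }
        return 0;
}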
-/**
- * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list.
- */
-static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
-                                 struct srpt_send_ioctx *ioctx)
-{
-        struct ib_device *dev = ch->sport->sdev->device;
-        struct se_cmd *cmd;
-        struct scatterlist *sg, *sg_orig;
-        int sg_cnt;
-        enum dma_data_direction dir;
-        struct ib_rdma_wr *riu;
-        struct srp_direct_buf *db;
-        dma_addr_t dma_addr;
-        struct ib_sge *sge;
-        u64 raddr;
-        u32 rsize;
-        u32 tsize;
-        u32 dma_len;
-        int count, nrdma;
-        int i, j, k;
-
-        BUG_ON(!ch);
-        BUG_ON(!ioctx);
-        cmd = &ioctx->cmd;
-        dir = cmd->data_direction;
-        BUG_ON(dir == DMA_NONE);
-
-        ioctx->sg = sg = sg_orig = cmd->t_data_sg;
-        ioctx->sg_cnt = sg_cnt = cmd->t_data_nents;
-
-        count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt,
-                              target_reverse_dma_direction(cmd));
-        if (unlikely(!count))
-                return -EAGAIN;
-
-        ioctx->mapped_sg_count = count;
-
-        if (ioctx->rdma_wrs && ioctx->n_rdma_wrs)
-                nrdma = ioctx->n_rdma_wrs;
-        else {
-                nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
-                        + ioctx->n_rbuf;
-
-                ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs),
-                                GFP_KERNEL);
-                if (!ioctx->rdma_wrs)
-                        goto free_mem;
-
-                ioctx->n_rdma_wrs = nrdma;
-        }
-
-        db = ioctx->rbufs;
-        tsize = cmd->data_length;
-        dma_len = ib_sg_dma_len(dev, &sg[0]);
-        riu = ioctx->rdma_wrs;
-
-        /*
-         * For each remote desc - calculate the #ib_sge.
-         * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then
-         *      each remote desc rdma_iu is required a rdma wr;
-         * else
-         *      we need to allocate extra rdma_iu to carry extra #ib_sge in
-         *      another rdma wr
-         */
-        for (i = 0, j = 0;
-             j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
-                rsize = be32_to_cpu(db->len);
-                raddr = be64_to_cpu(db->va);
-                riu->remote_addr = raddr;
-                riu->rkey = be32_to_cpu(db->key);
-                riu->wr.num_sge = 0;
-
-                /* calculate how many sge required for this remote_buf */
-                while (rsize > 0 && tsize > 0) {
-
-                        if (rsize >= dma_len) {
-                                tsize -= dma_len;
-                                rsize -= dma_len;
-                                raddr += dma_len;
-
-                                if (tsize > 0) {
-                                        ++j;
-                                        if (j < count) {
-                                                sg = sg_next(sg);
-                                                dma_len = ib_sg_dma_len(
-                                                                dev, sg);
-                                        }
-                                }
-                        } else {
-                                tsize -= rsize;
-                                dma_len -= rsize;
-                                rsize = 0;
-                        }
-
-                        ++riu->wr.num_sge;
-
-                        if (rsize > 0 &&
-                            riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
-                                ++ioctx->n_rdma;
-                                riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
-                                                sizeof(*riu->wr.sg_list),
-                                                GFP_KERNEL);
-                                if (!riu->wr.sg_list)
-                                        goto free_mem;
-
-                                ++riu;
-                                riu->wr.num_sge = 0;
-                                riu->remote_addr = raddr;
-                                riu->rkey = be32_to_cpu(db->key);
-                        }
-                }
-
-                ++ioctx->n_rdma;
-                riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
-                                sizeof(*riu->wr.sg_list),
-                                GFP_KERNEL);
-                if (!riu->wr.sg_list)
-                        goto free_mem;
-        }
-
-        db = ioctx->rbufs;
-        tsize = cmd->data_length;
-        riu = ioctx->rdma_wrs;
-        sg = sg_orig;
-        dma_len = ib_sg_dma_len(dev, &sg[0]);
-        dma_addr = ib_sg_dma_address(dev, &sg[0]);
-
-        /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */
-        for (i = 0, j = 0;
-             j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
-                rsize = be32_to_cpu(db->len);
-                sge = riu->wr.sg_list;
-                k = 0;
-
-                while (rsize > 0 && tsize > 0) {
-                        sge->addr = dma_addr;
-                        sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
-
-                        if (rsize >= dma_len) {
-                                sge->length =
-                                        (tsize < dma_len) ? tsize : dma_len;
-                                tsize -= dma_len;
-                                rsize -= dma_len;
-
-                                if (tsize > 0) {
-                                        ++j;
-                                        if (j < count) {
-                                                sg = sg_next(sg);
-                                                dma_len = ib_sg_dma_len(
-                                                                dev, sg);
-                                                dma_addr = ib_sg_dma_address(
-                                                                dev, sg);
-                                        }
-                                }
-                        } else {
-                                sge->length = (tsize < rsize) ? tsize : rsize;
-                                tsize -= rsize;
-                                dma_len -= rsize;
-                                dma_addr += rsize;
-                                rsize = 0;
-                        }
-
-                        ++k;
-                        if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
-                                ++riu;
-                                sge = riu->wr.sg_list;
-                                k = 0;
-                        } else if (rsize > 0 && tsize > 0)
-                                ++sge;
-                }
-        }
-
-        return 0;
-
-free_mem:
-        srpt_unmap_sg_to_ib_sge(ch, ioctx);
-
-        return -ENOMEM;
-}
-
-/**
  * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator.
  */
 static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
@@ -1284,12 +1096,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
         BUG_ON(ioctx->ch != ch);
         spin_lock_init(&ioctx->spinlock);
         ioctx->state = SRPT_STATE_NEW;
-        ioctx->n_rbuf = 0;
-        ioctx->rbufs = NULL;
         ioctx->n_rdma = 0;
-        ioctx->n_rdma_wrs = 0;
-        ioctx->rdma_wrs = NULL;
-        ioctx->mapped_sg_count = 0;
+        ioctx->n_rw_ctx = 0;
         init_completion(&ioctx->tx_done);
         ioctx->queue_status_only = false;
         /*
@@ -1359,7 +1167,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
                  * SRP_RSP sending failed or the SRP_RSP send completion has
                  * not been received in time.
                  */
-                srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
                 transport_generic_free_cmd(&ioctx->cmd, 0);
                 break;
         case SRPT_STATE_MGMT_RSP_SENT:
@@ -1387,6 +1194,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 
         WARN_ON(ioctx->n_rdma <= 0);
         atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+        ioctx->n_rdma = 0;
 
         if (unlikely(wc->status != IB_WC_SUCCESS)) {
                 pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
@@ -1403,23 +1211,6 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
                        __LINE__, srpt_get_cmd_state(ioctx));
 }
 
-static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
-{
-        struct srpt_send_ioctx *ioctx =
-                container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
-
-        if (unlikely(wc->status != IB_WC_SUCCESS)) {
-                /*
-                 * Note: if an RDMA write error completion is received that
-                 * means that a SEND also has been posted. Defer further
-                 * processing of the associated command until the send error
-                 * completion has been received.
-                 */
-                pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
-                        ioctx, wc->status);
-        }
-}
-
 /**
  * srpt_build_cmd_rsp() - Build an SRP_RSP response.
  * @ch: RDMA channel through which the request has been received.
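/*
 * Illustrative aside (not part of this patch): srpt_rdma_read_done() above
 * relies on the struct ib_cqe completion scheme. The pattern in isolation,
 * with hypothetical names:
 */
struct example_ioctx {
        struct ib_cqe cqe;      /* set cqe.done before posting the WR */
        /* ... per-command state ... */
};

static void example_done(struct ib_cq *cq, struct ib_wc *wc)
{
        /* Recover the containing command from the completed WR. */
        struct example_ioctx *ioctx =
                container_of(wc->wr_cqe, struct example_ioctx, cqe);

        if (unlikely(wc->status != IB_WC_SUCCESS))
                pr_info("WR for ioctx %p failed with status %d\n",
                        ioctx, wc->status);
}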
@@ -1537,6 +1328,8 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
 {
         struct se_cmd *cmd;
         struct srp_cmd *srp_cmd;
+        struct scatterlist *sg = NULL;
+        unsigned sg_cnt = 0;
         u64 data_len;
         enum dma_data_direction dir;
         int rc;
@@ -1563,16 +1356,21 @@
                 break;
         }
 
-        if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) {
-                pr_err("0x%llx: parsing SRP descriptor table failed.\n",
-                       srp_cmd->tag);
+        rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt,
+                        &data_len);
+        if (rc) {
+                if (rc != -EAGAIN) {
+                        pr_err("0x%llx: parsing SRP descriptor table failed.\n",
+                               srp_cmd->tag);
+                }
                 goto release_ioctx;
         }
 
-        rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb,
+        rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
                                &send_ioctx->sense_data[0],
                                scsilun_to_int(&srp_cmd->lun), data_len,
-                               TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
+                               TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF,
+                               sg, sg_cnt, NULL, 0, NULL, 0);
         if (rc != 0) {
                 pr_debug("target_submit_cmd() returned %d for tag %#llx\n",
                          rc, srp_cmd->tag);
@@ -1664,23 +1462,21 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
                                    recv_ioctx->ioctx.dma, srp_max_req_size,
                                    DMA_FROM_DEVICE);
 
-        if (unlikely(ch->state == CH_CONNECTING)) {
-                list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
-                goto out;
-        }
+        if (unlikely(ch->state == CH_CONNECTING))
+                goto out_wait;
 
         if (unlikely(ch->state != CH_LIVE))
-                goto out;
+                return;
 
         srp_cmd = recv_ioctx->ioctx.buf;
         if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) {
-                if (!send_ioctx)
+                if (!send_ioctx) {
+                        if (!list_empty(&ch->cmd_wait_list))
+                                goto out_wait;
                         send_ioctx = srpt_get_send_ioctx(ch);
-                if (unlikely(!send_ioctx)) {
-                        list_add_tail(&recv_ioctx->wait_list,
-                                      &ch->cmd_wait_list);
-                        goto out;
                 }
+                if (unlikely(!send_ioctx))
+                        goto out_wait;
         }
 
         switch (srp_cmd->opcode) {
@@ -1709,8 +1505,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
         }
 
         srpt_post_recv(ch->sport->sdev, recv_ioctx);
-out:
         return;
+
+out_wait:
+        list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
 }
 
 static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1779,14 +1577,13 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
         WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
                 state != SRPT_STATE_MGMT_RSP_SENT);
 
-        atomic_inc(&ch->sq_wr_avail);
+        atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
 
         if (wc->status != IB_WC_SUCCESS)
                 pr_info("sending response for ioctx 0x%p failed"
                         " with status %d\n", ioctx, wc->status);
 
         if (state != SRPT_STATE_DONE) {
-                srpt_unmap_sg_to_ib_sge(ch, ioctx);
                 transport_generic_free_cmd(&ioctx->cmd, 0);
         } else {
                 pr_err("IB completion has been received too late for"
@@ -1832,8 +1629,18 @@ retry:
         qp_init->srq = sdev->srq;
         qp_init->sq_sig_type = IB_SIGNAL_REQ_WR;
         qp_init->qp_type = IB_QPT_RC;
-        qp_init->cap.max_send_wr = srp_sq_size;
-        qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+        /*
+         * We divide up our send queue size into half SEND WRs to send the
+         * completions, and half R/W contexts to actually do the RDMA
+         * READ/WRITE transfers. Note that we need to allocate CQ slots for
+         * both, as RDMA contexts will also post completions for the
+         * RDMA READ case.
+         */
+        qp_init->cap.max_send_wr = srp_sq_size / 2;
+        qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;
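/*
 * Illustrative sketch of the budgeting in the comment above (hypothetical
 * helper, not in the patch): both halves of the send queue can generate
 * completions, so the CQ must be sized for their sum.
 */
static void example_size_srpt_qp(struct ib_qp_init_attr *qp_init,
                u32 srp_sq_size, u32 *cq_size)
{
        qp_init->cap.max_send_wr = srp_sq_size / 2;     /* SRP_RSP SENDs */
        qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;   /* R/W transfers */
        *cq_size = srp_sq_size;                         /* covers both */
}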
+        qp_init->cap.max_send_sge = max(sdev->device->attrs.max_sge_rd,
+                                        sdev->device->attrs.max_sge);
+        qp_init->port_num = ch->sport->port;
 
         ch->qp = ib_create_qp(sdev->pd, qp_init);
         if (IS_ERR(ch->qp)) {
@@ -2386,95 +2193,6 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
         return ret;
 }
 
-/**
- * srpt_perform_rdmas() - Perform IB RDMA.
- *
- * Returns zero upon success or a negative number upon failure.
- */
-static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
-                              struct srpt_send_ioctx *ioctx)
-{
-        struct ib_send_wr *bad_wr;
-        int sq_wr_avail, ret, i;
-        enum dma_data_direction dir;
-        const int n_rdma = ioctx->n_rdma;
-
-        dir = ioctx->cmd.data_direction;
-        if (dir == DMA_TO_DEVICE) {
-                /* write */
-                ret = -ENOMEM;
-                sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail);
-                if (sq_wr_avail < 0) {
-                        pr_warn("IB send queue full (needed %d)\n",
-                                n_rdma);
-                        goto out;
-                }
-        }
-
-        for (i = 0; i < n_rdma; i++) {
-                struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr;
-
-                wr->opcode = (dir == DMA_FROM_DEVICE) ?
-                                IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
-
-                if (i == n_rdma - 1) {
-                        /* only get completion event for the last rdma read */
-                        if (dir == DMA_TO_DEVICE) {
-                                wr->send_flags = IB_SEND_SIGNALED;
-                                ioctx->rdma_cqe.done = srpt_rdma_read_done;
-                        } else {
-                                ioctx->rdma_cqe.done = srpt_rdma_write_done;
-                        }
-                        wr->wr_cqe = &ioctx->rdma_cqe;
-                        wr->next = NULL;
-                } else {
-                        wr->wr_cqe = NULL;
-                        wr->next = &ioctx->rdma_wrs[i + 1].wr;
-                }
-        }
-
-        ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr);
-        if (ret)
-                pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
-                         __func__, __LINE__, ret, i, n_rdma);
-out:
-        if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
-                atomic_add(n_rdma, &ch->sq_wr_avail);
-        return ret;
-}
-
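/*
 * Illustrative aside (not part of this patch): the deleted
 * srpt_perform_rdmas() above hand-built its WR chain and signaled only the
 * last WR. The manual pattern it implemented, reduced to its core:
 */
static void example_chain_wrs(struct ib_rdma_wr *wrs, int n_wrs,
                struct ib_cqe *cqe)
{
        int i;

        for (i = 0; i < n_wrs; i++) {
                if (i == n_wrs - 1) {
                        /* Only the final WR carries a completion. */
                        wrs[i].wr.wr_cqe = cqe;
                        wrs[i].wr.send_flags = IB_SEND_SIGNALED;
                        wrs[i].wr.next = NULL;
                } else {
                        wrs[i].wr.wr_cqe = NULL;
                        wrs[i].wr.next = &wrs[i + 1].wr;
                }
        }
}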
-/**
- * srpt_xfer_data() - Start data transfer from initiator to target.
- */
-static int srpt_xfer_data(struct srpt_rdma_ch *ch,
-                          struct srpt_send_ioctx *ioctx)
-{
-        int ret;
-
-        ret = srpt_map_sg_to_ib_sge(ch, ioctx);
-        if (ret) {
-                pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret);
-                goto out;
-        }
-
-        ret = srpt_perform_rdmas(ch, ioctx);
-        if (ret) {
-                if (ret == -EAGAIN || ret == -ENOMEM)
-                        pr_info("%s[%d] queue full -- ret=%d\n",
-                                __func__, __LINE__, ret);
-                else
-                        pr_err("%s[%d] fatal error -- ret=%d\n",
-                               __func__, __LINE__, ret);
-                goto out_unmap;
-        }
-
-out:
-        return ret;
-out_unmap:
-        srpt_unmap_sg_to_ib_sge(ch, ioctx);
-        goto out;
-}
-
 static int srpt_write_pending_status(struct se_cmd *se_cmd)
 {
         struct srpt_send_ioctx *ioctx;
@@ -2491,11 +2209,42 @@ static int srpt_write_pending(struct se_cmd *se_cmd)
         struct srpt_send_ioctx *ioctx =
                 container_of(se_cmd, struct srpt_send_ioctx, cmd);
         struct srpt_rdma_ch *ch = ioctx->ch;
+        struct ib_send_wr *first_wr = NULL, *bad_wr;
+        struct ib_cqe *cqe = &ioctx->rdma_cqe;
         enum srpt_command_state new_state;
+        int ret, i;
 
         new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
         WARN_ON(new_state == SRPT_STATE_DONE);
-        return srpt_xfer_data(ch, ioctx);
+
+        if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) {
+                pr_warn("%s: IB send queue full (needed %d)\n",
+                                __func__, ioctx->n_rdma);
+                ret = -ENOMEM;
+                goto out_undo;
+        }
+
+        cqe->done = srpt_rdma_read_done;
+        for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+                struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+                first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port,
+                                cqe, first_wr);
+                cqe = NULL;
+        }
+
+        ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+        if (ret) {
+                pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n",
+                         __func__, ret, ioctx->n_rdma,
+                         atomic_read(&ch->sq_wr_avail));
+                goto out_undo;
+        }
+
+        return 0;
+out_undo:
+        atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+        return ret;
 }
 
 static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
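/*
 * Illustrative sketch (not part of this patch): srpt_write_pending() above
 * chains the WR lists of all R/W contexts into one post and attaches the
 * completion only to the last context (the first loop iteration). The
 * reduced pattern, with hypothetical names:
 */
static struct ib_send_wr *example_chain_ctxs(struct srpt_rw_ctx *ctxs,
                int n_ctx, struct ib_qp *qp, u8 port, struct ib_cqe *cqe)
{
        struct ib_send_wr *first_wr = NULL;
        int i;

        for (i = n_ctx - 1; i >= 0; i--) {
                first_wr = rdma_rw_ctx_wrs(&ctxs[i].rw, qp, port,
                                cqe, first_wr);
                cqe = NULL;     /* only the last context is signaled */
        }
        return first_wr;        /* hand the whole chain to ib_post_send() */
}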
@@ -2517,17 +2266,17 @@
  */
 static void srpt_queue_response(struct se_cmd *cmd)
 {
-        struct srpt_rdma_ch *ch;
-        struct srpt_send_ioctx *ioctx;
+        struct srpt_send_ioctx *ioctx =
+                container_of(cmd, struct srpt_send_ioctx, cmd);
+        struct srpt_rdma_ch *ch = ioctx->ch;
+        struct srpt_device *sdev = ch->sport->sdev;
+        struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr;
+        struct ib_sge sge;
         enum srpt_command_state state;
         unsigned long flags;
-        int ret;
-        enum dma_data_direction dir;
-        int resp_len;
+        int resp_len, ret, i;
         u8 srp_tm_status;
 
-        ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
-        ch = ioctx->ch;
         BUG_ON(!ch);
 
         spin_lock_irqsave(&ioctx->spinlock, flags);
@@ -2554,17 +2303,19 @@ static void srpt_queue_response(struct se_cmd *cmd)
                 return;
         }
 
-        dir = ioctx->cmd.data_direction;
-
         /* For read commands, transfer the data to the initiator. */
-        if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length &&
+        if (ioctx->cmd.data_direction == DMA_FROM_DEVICE &&
+            ioctx->cmd.data_length &&
             !ioctx->queue_status_only) {
-                ret = srpt_xfer_data(ch, ioctx);
-                if (ret) {
-                        pr_err("xfer_data failed for tag %llu\n",
-                               ioctx->cmd.tag);
-                        return;
+                for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+                        struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+                        first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp,
+                                        ch->sport->port, NULL,
+                                        first_wr ? first_wr : &send_wr);
                 }
+        } else {
+                first_wr = &send_wr;
         }
 
         if (state != SRPT_STATE_MGMT)
@@ -2576,14 +2327,46 @@ static void srpt_queue_response(struct se_cmd *cmd)
                 resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status,
                                                   ioctx->cmd.tag);
         }
-        ret = srpt_post_send(ch, ioctx, resp_len);
-        if (ret) {
-                pr_err("sending cmd response failed for tag %llu\n",
-                       ioctx->cmd.tag);
-                srpt_unmap_sg_to_ib_sge(ch, ioctx);
-                srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-                target_put_sess_cmd(&ioctx->cmd);
+
+        atomic_inc(&ch->req_lim);
+
+        if (unlikely(atomic_sub_return(1 + ioctx->n_rdma,
+                        &ch->sq_wr_avail) < 0)) {
+                pr_warn("%s: IB send queue full (needed %d)\n",
+                                __func__, ioctx->n_rdma);
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len,
+                                      DMA_TO_DEVICE);
+
+        sge.addr = ioctx->ioctx.dma;
+        sge.length = resp_len;
+        sge.lkey = sdev->pd->local_dma_lkey;
+
+        ioctx->ioctx.cqe.done = srpt_send_done;
+        send_wr.next = NULL;
+        send_wr.wr_cqe = &ioctx->ioctx.cqe;
+        send_wr.sg_list = &sge;
+        send_wr.num_sge = 1;
+        send_wr.opcode = IB_WR_SEND;
+        send_wr.send_flags = IB_SEND_SIGNALED;
+
+        ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+        if (ret < 0) {
+                pr_err("%s: sending cmd response failed for tag %llu (%d)\n",
+                        __func__, ioctx->cmd.tag, ret);
+                goto out;
         }
+
+        return;
+
+out:
+        atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
+        atomic_dec(&ch->req_lim);
+        srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+        target_put_sess_cmd(&ioctx->cmd);
 }
 
 static int srpt_queue_data_in(struct se_cmd *cmd)
@@ -2599,10 +2382,6 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd)
 
 static void srpt_aborted_task(struct se_cmd *cmd)
 {
-        struct srpt_send_ioctx *ioctx = container_of(cmd,
-                                struct srpt_send_ioctx, cmd);
-
-        srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
 }
 
 static int srpt_queue_status(struct se_cmd *cmd)
@@ -2903,12 +2682,10 @@ static void srpt_release_cmd(struct se_cmd *se_cmd)
         unsigned long flags;
 
         WARN_ON(ioctx->state != SRPT_STATE_DONE);
-        WARN_ON(ioctx->mapped_sg_count != 0);
 
-        if (ioctx->n_rbuf > 1) {
-                kfree(ioctx->rbufs);
-                ioctx->rbufs = NULL;
-                ioctx->n_rbuf = 0;
+        if (ioctx->n_rw_ctx) {
+                srpt_free_rw_ctxs(ch, ioctx);
+                ioctx->n_rw_ctx = 0;
         }
 
         spin_lock_irqsave(&ch->spinlock, flags);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index af9b8b527340..fee6bfd7ca21 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -42,6 +42,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_sa.h>
 #include <rdma/ib_cm.h>
+#include <rdma/rw.h>
 
 #include <scsi/srp.h>
 
@@ -105,7 +106,6 @@ enum {
         SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
 
         SRPT_DEF_SG_TABLESIZE = 128,
-        SRPT_DEF_SG_PER_WQE = 16,
 
         MIN_SRPT_SQ_SIZE = 16,
         DEF_SRPT_SQ_SIZE = 4096,
@@ -174,21 +174,17 @@ struct srpt_recv_ioctx {
         struct srpt_ioctx       ioctx;
         struct list_head        wait_list;
 };
+
+struct srpt_rw_ctx {
+        struct rdma_rw_ctx      rw;
+        struct scatterlist      *sg;
+        unsigned int            nents;
+};
 
 /**
  * struct srpt_send_ioctx - SRPT send I/O context.
  * @ioctx:       See above.
  * @ch:          Channel pointer.
- * @free_list:   Node in srpt_rdma_ch.free_list.
- * @n_rbuf:      Number of data buffers in the received SRP command.
- * @rbufs:       Pointer to SRP data buffer array.
- * @single_rbuf: SRP data buffer if the command has only a single buffer.
- * @sg:          Pointer to sg-list associated with this I/O context.
- * @sg_cnt:      SG-list size.
- * @mapped_sg_count: ib_dma_map_sg() return value.
- * @n_rdma_wrs:  Number of elements in the rdma_wrs array.
- * @rdma_wrs:    Array with information about the RDMA mapping.
- * @tag:         Tag of the received SRP information unit.
- * @rdma_wrs: Array with information about the RDMA mapping. - * @tag: Tag of the received SRP information unit. * @spinlock: Protects 'state'. * @state: I/O context state. * @cmd: Target core command data structure. @@ -197,21 +193,18 @@ struct srpt_recv_ioctx { struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct ib_rdma_wr *rdma_wrs; + + struct srpt_rw_ctx s_rw_ctx; + struct srpt_rw_ctx *rw_ctxs; + struct ib_cqe rdma_cqe; - struct srp_direct_buf *rbufs; - struct srp_direct_buf single_rbuf; - struct scatterlist *sg; struct list_head free_list; spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; struct completion tx_done; - int sg_cnt; - int mapped_sg_count; - u16 n_rdma_wrs; u8 n_rdma; - u8 n_rbuf; + u8 n_rw_ctx; bool queue_status_only; u8 sense_data[TRANSPORT_SENSE_BUFFER]; }; |