From 39e487faaf706fa94bab4d0cf9f543a3430c746e Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:32:25 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_add_mqh_4 i40iw_add_mqh_4() is never called in atomic context, because it calls rtnl_lock() that can sleep. Despite never getting called from atomic context, i40iw_add_mqh_4() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This is found by a static analysis tool named DCNS written by myself. I also manually checked it. Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 4cfa8f4647e2..8310d2488681 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1788,7 +1788,7 @@ static enum i40iw_status_code i40iw_add_mqh_4( &ifa->ifa_address, rdma_vlan_dev_vlan_id(dev), dev->dev_addr); - child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC); + child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_KERNEL); cm_parent_listen_node->cm_core->stats_listen_nodes_created++; i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, -- cgit v1.2.3 From f9af8730143a0fdc572f90b8a388795ee812cd74 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:32:48 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_make_listen_node i40iw_make_listen_node() is never called in atomic context. i40iw_make_listen_node() is only called by i40iw_create_listen, which is set as ".create_listen" in struct iw_cm_verbs. 
Despite never getting called from atomic context, i40iw_make_listen_node() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This is found by a static analysis tool named DCNS written by myself. I also manually checked it. Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 8310d2488681..0243ec48e4b5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -2872,7 +2872,7 @@ static struct i40iw_cm_listener *i40iw_make_listen_node( if (!listener) { /* create a CM listen node (1/2 node to compare incoming traffic to) */ - listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + listener = kzalloc(sizeof(*listener), GFP_KERNEL); if (!listener) return NULL; cm_core->stats_listen_nodes_created++; -- cgit v1.2.3 From 4e56569cee1505846b3dcb15fbf400f6a7e9f015 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:33:06 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_l2param_change i40iw_l2param_change() is never called in atomic context. i40iw_l2param_change() is only set as ".l2_param_change" in struct i40e_client_ops, and this function pointer is not called in atomic context. Despite never getting called from atomic context, i40iw_l2param_change() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This is found by a static analysis tool named DCNS written by myself. I also manually checked it. 
Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 9cd0d3ef9057..a220794dcdb0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1758,7 +1758,7 @@ static void i40iw_l2param_change(struct i40e_info *ldev, struct i40e_client *cli return; - work = kzalloc(sizeof(*work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_KERNEL); if (!work) return; -- cgit v1.2.3 From d819734126ce705784ca2cd847ad7623825f1a08 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 17 Apr 2018 19:53:58 +0530 Subject: infiniband: hw: hfi1: Change return type to vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. 
Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index da4aa1a95b11..1b778fd16a32 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -110,7 +110,7 @@ static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg); static int ctxt_reset(struct hfi1_ctxtdata *uctxt); static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, unsigned long arg); -static int vma_fault(struct vm_fault *vmf); +static vm_fault_t vma_fault(struct vm_fault *vmf); static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); @@ -591,7 +591,7 @@ done: * Local (non-chip) user memory is not mapped right away but as it is * accessed by the user-level code. */ -static int vma_fault(struct vm_fault *vmf) +static vm_fault_t vma_fault(struct vm_fault *vmf) { struct page *page; -- cgit v1.2.3 From 7991d96dd137408385f425cdf8ff815738ea2b49 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 17 Apr 2018 20:04:28 +0530 Subject: infiniband: hw: qib: Change return type to vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. 
Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 6a8800b65047..bbb720bfd030 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -868,7 +868,7 @@ bail: /* * qib_file_vma_fault - handle a VMA page fault. */ -static int qib_file_vma_fault(struct vm_fault *vmf) +static vm_fault_t qib_file_vma_fault(struct vm_fault *vmf) { struct page *page; -- cgit v1.2.3 From ecb238f6a7f369b5e0eece4e913c9d671208860c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Apr 2018 15:31:06 +0800 Subject: IB/cxgb4: use skb_put_zero()/__skb_put_zero Use the recently introduced helpers to replace the pattern of skb_put()/__skb_put() && memset(). 
Signed-off-by: YueHaibing Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/qp.c | 9 +++------ drivers/infiniband/sw/rxe/rxe_net.c | 4 +--- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index de77b6027d69..2dc94997ea11 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1297,8 +1297,7 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32(FW_WR_OP_V(FW_RI_INIT_WR)); wqe->flowid_len16 = cpu_to_be32( FW_WR_FLOWID_V(qhp->ep->hwtid) | @@ -1421,8 +1420,7 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32( FW_WR_OP_V(FW_RI_INIT_WR) | FW_WR_COMPL_F); @@ -1487,8 +1485,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) } set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32( FW_WR_OP_V(FW_RI_INIT_WR) | FW_WR_COMPL_F); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index fca13a6281f0..95e52b3ec757 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -565,11 +565,9 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, pkt->rxe = rxe; pkt->port_num = port_num; - pkt->hdr = skb_put(skb, paylen); + pkt->hdr = skb_put_zero(skb, paylen); pkt->mask |= RXE_GRH_MASK; - memset(pkt->hdr, 0, paylen); - dev_put(ndev); return skb; } 
-- cgit v1.2.3 From ffab8c89ba59c4e01f9c277f1baaad12bd5a3c0c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 1 May 2018 09:25:49 +0100 Subject: RDMA/qedr: fix spelling mistake: "failes" -> "fails" Trivial fix to spelling mistake in DP_ERR error message Signed-off-by: Colin Ian King Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 7d3763b2e01c..35f3b6f8fd45 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2579,7 +2579,7 @@ static int qedr_set_page(struct ib_mr *ibmr, u64 addr) u32 pbes_in_page; if (unlikely(mr->npages == mr->info.pbl_info.num_pbes)) { - DP_ERR(mr->dev, "qedr_set_page failes when %d\n", mr->npages); + DP_ERR(mr->dev, "qedr_set_page fails when %d\n", mr->npages); return -ENOMEM; } -- cgit v1.2.3 From 056f9c7f39bf517d58f32797f1eb1465bb6f6ef2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 3 May 2018 08:41:49 -0700 Subject: iw_cxgb4: dump detailed driver-specific QP information Provide a cxgb4-specific function to fill in qp state details. This allows dumping important c4iw_qp state useful for debugging. Included in the dump are the t4_sq, t4_rq structs, plus a dump of the t4_swsqe and t4swrqe descriptors for the first and last pending entries. 
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/Makefile | 3 +- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 5 + drivers/infiniband/hw/cxgb4/provider.c | 8 ++ drivers/infiniband/hw/cxgb4/restrack.c | 248 +++++++++++++++++++++++++++++++++ 4 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/cxgb4/restrack.c (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile index fa40b685831b..9edd92023e18 100644 --- a/drivers/infiniband/hw/cxgb4/Makefile +++ b/drivers/infiniband/hw/cxgb4/Makefile @@ -3,4 +3,5 @@ ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o -iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o +iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o \ + restrack.o diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index cc929002c05e..5eec8772468c 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -55,6 +55,7 @@ #include #include #include +#include #include "cxgb4.h" #include "cxgb4_uld.h" @@ -1078,4 +1079,8 @@ extern int use_dsgl; void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey); struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp); +typedef int c4iw_restrack_func(struct sk_buff *msg, + struct rdma_restrack_entry *res); +extern c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX]; + #endif diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0b9cc73c3ded..1feade8bb4b3 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -551,6 +551,13 @@ static struct net_device *get_netdev(struct ib_device *dev, u8 port) return ndev; } +static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) +{ + return (res->type < 
ARRAY_SIZE(c4iw_restrack_funcs) && + c4iw_restrack_funcs[res->type]) ? + c4iw_restrack_funcs[res->type](msg, res) : 0; +} + void c4iw_register_device(struct work_struct *work) { int ret; @@ -645,6 +652,7 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; dev->ibdev.iwcm->get_qp = c4iw_get_qp; + dev->ibdev.res.fill_res_entry = fill_res_entry; memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c new file mode 100644 index 000000000000..a677940b164a --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2018 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iw_cxgb4.h" +#include +#include + +static int fill_sq(struct sk_buff *msg, struct t4_wq *wq) +{ + /* WQ+SQ */ + if (rdma_nl_put_driver_u32(msg, "sqid", wq->sq.qid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flushed", wq->flushed)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", wq->sq.memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", wq->sq.cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "pidx", wq->sq.pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "wq_pidx", wq->sq.wq_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flush_cidx", wq->sq.flush_cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "in_use", wq->sq.in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", wq->sq.size)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "flags", wq->sq.flags)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +static int fill_rq(struct sk_buff *msg, struct t4_wq *wq) +{ + /* RQ */ + if (rdma_nl_put_driver_u32(msg, "rqid", wq->rq.qid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", wq->rq.memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", wq->rq.cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "pidx", wq->rq.pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "wq_pidx", wq->rq.wq_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "msn", wq->rq.msn)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "rqt_hwaddr", wq->rq.rqt_hwaddr)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rqt_size", wq->rq.rqt_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "in_use", wq->rq.in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", wq->rq.size)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + 
+static int fill_swsqe(struct sk_buff *msg, struct t4_sq *sq, u16 idx, + struct t4_swsqe *sqe) +{ + if (rdma_nl_put_driver_u32(msg, "idx", idx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "opcode", sqe->opcode)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "wr_id", sqe->wr_id)) + goto err; + if (rdma_nl_put_driver_u32(msg, "complete", sqe->complete)) + goto err; + if (sqe->complete && + rdma_nl_put_driver_u32(msg, "cqe_status", CQE_STATUS(&sqe->cqe))) + goto err; + if (rdma_nl_put_driver_u32(msg, "signaled", sqe->signaled)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flushed", sqe->flushed)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +/* + * Dump the first and last pending sqes. + */ +static int fill_swsqes(struct sk_buff *msg, struct t4_sq *sq, + u16 first_idx, struct t4_swsqe *first_sqe, + u16 last_idx, struct t4_swsqe *last_sqe) +{ + if (!first_sqe) + goto out; + if (fill_swsqe(msg, sq, first_idx, first_sqe)) + goto err; + if (!last_sqe) + goto out; + if (fill_swsqe(msg, sq, last_idx, last_sqe)) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_swrqe(struct sk_buff *msg, struct t4_rq *rq, u16 idx, + struct t4_swrqe *rqe) +{ + if (rdma_nl_put_driver_u32(msg, "idx", idx)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "wr_id", rqe->wr_id)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +/* + * Dump the first and last pending rqes. 
+ */ +static int fill_swrqes(struct sk_buff *msg, struct t4_rq *rq, + u16 first_idx, struct t4_swrqe *first_rqe, + u16 last_idx, struct t4_swrqe *last_rqe) +{ + if (!first_rqe) + goto out; + if (fill_swrqe(msg, rq, first_idx, first_rqe)) + goto err; + if (!last_rqe) + goto out; + if (fill_swrqe(msg, rq, last_idx, last_rqe)) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_res_qp_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_qp *ibqp = container_of(res, struct ib_qp, res); + struct t4_swsqe *fsp = NULL, *lsp = NULL; + struct t4_swrqe *frp = NULL, *lrp = NULL; + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + struct t4_swsqe first_sqe, last_sqe; + struct t4_swrqe first_rqe, last_rqe; + u16 first_sq_idx, last_sq_idx; + u16 first_rq_idx, last_rq_idx; + struct nlattr *table_attr; + struct t4_wq wq; + + /* User qp state is not available, so don't dump user qps */ + if (qhp->ucontext) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + /* Get a consistent snapshot */ + spin_lock_irq(&qhp->lock); + wq = qhp->wq; + + /* If there are any pending sqes, copy the first and last */ + if (wq.sq.cidx != wq.sq.pidx) { + first_sq_idx = wq.sq.cidx; + first_sqe = qhp->wq.sq.sw_sq[first_sq_idx]; + fsp = &first_sqe; + last_sq_idx = wq.sq.pidx; + if (last_sq_idx-- == 0) + last_sq_idx = wq.sq.size - 1; + if (last_sq_idx != first_sq_idx) { + last_sqe = qhp->wq.sq.sw_sq[last_sq_idx]; + lsp = &last_sqe; + } + } + + /* If there are any pending rqes, copy the first and last */ + if (wq.rq.cidx != wq.rq.pidx) { + first_rq_idx = wq.rq.cidx; + first_rqe = qhp->wq.rq.sw_rq[first_rq_idx]; + frp = &first_rqe; + last_rq_idx = wq.rq.pidx; + if (last_rq_idx-- == 0) + last_rq_idx = wq.rq.size - 1; + if (last_rq_idx != first_rq_idx) { + last_rqe = qhp->wq.rq.sw_rq[last_rq_idx]; + lrp = &last_rqe; + } + } + spin_unlock_irq(&qhp->lock); + + if (fill_sq(msg, &wq)) + goto err_cancel_table; + + if 
(fill_swsqes(msg, &wq.sq, first_sq_idx, fsp, last_sq_idx, lsp)) + goto err_cancel_table; + + if (fill_rq(msg, &wq)) + goto err_cancel_table; + + if (fill_swrqes(msg, &wq.rq, first_rq_idx, frp, last_rq_idx, lrp)) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + +c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_QP] = fill_res_qp_entry, +}; -- cgit v1.2.3 From ed3dd9b017b85e00a459c35bd4d3fe2b83b0d092 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 2 May 2018 13:15:24 +0300 Subject: RDMA/hns: Drop local zgid in favor of core defined variable The zgid is already provided by IB/core, so there is no need in locally defined variable, let's drop it and reuse common one. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 9d48bc07a9e6..1b79a388e9d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -99,7 +99,6 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) { struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); struct ib_gid_attr zattr = { }; - union ib_gid zgid = { {0} }; u8 port = attr->port_num - 1; unsigned long flags; int ret; -- cgit v1.2.3 From 064e526247070c79aa3063d93384db378649a640 Mon Sep 17 00:00:00 2001 From: Idan Burstein Date: Wed, 2 May 2018 13:16:39 +0300 Subject: IB/mlx5: posting klm/mtt list inline in the send queue for reg_wr As most kernel RDMA ULPs, (e.g. NVMe over Fabrics in its default "register_always=Y" mode) registers and invalidates user buffer upon each IO. Today the mlx5 driver is posting the registration work request using scatter/gather entry for the MTT/KLM list. 
The fetch of the MTT/KLM list becomes the bottleneck in the number of IO operations that can be done by the NVMe over Fabrics host driver on a single adapter, as shown below. This patch adds support for inline registration work requests for MTT/KLM lists of size <=64B. The result for NVMe over Fabrics is an increase of > x3.5 for small IOs, as shown below; I expect the performance of other ULPs (e.g. iSER, SRP, NFS over RDMA) to be enhanced as well. The following results were taken against a single NVMe-oF (RoCE link layer) subsystem with a single namespace backed by null_blk using fio benchmark (with rw=randread, numjobs=48, iodepth={16,64}, ioengine=libaio direct=1): ConnectX-5 (pci Width x16) --------------------------- Block Size s/g reg_wr inline reg_wr ++++++++++ +++++++++++++++ ++++++++++++++++ 512B 1302.8K/34.82% 4951.9K/99.02% 1KB 1284.3K/33.86% 4232.7K/98.09% 2KB 1238.6K/34.1% 2797.5K/80.04% 4KB 1169.3K/32.46% 1941.3K/61.35% 8KB 1013.4K/30.08% 1236.6K/39.47% 16KB 695.7K/20.19% 696.9K/20.59% 32KB 350.3K/9.64% 350.6K/10.3% 64KB 175.86K/5.27% 175.9K/5.28% ConnectX-4 (pci Width x8) --------------------------- Block Size s/g reg_wr inline reg_wr ++++++++++ +++++++++++++++ ++++++++++++++++ 512B 1285.8K/42.66% 4242.7K/98.18% 1KB 1254.1K/41.74% 3569.2K/96.00% 2KB 1185.9K/39.83% 2173.9K/75.58% 4KB 1069.4K/36.46% 1343.3K/47.47% 8KB 755.1K/27.77% 748.7K/29.14% Tested-by: Nitzan Carmi Signed-off-by: Idan Burstein Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 43 ++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7ed4b70f6447..7a9870a4823f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -54,6 +54,7 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, }; static const u32 
mlx5_ib_opcode[] = { @@ -298,7 +299,9 @@ static int sq_overhead(struct ib_qp_init_attr *attr) max(sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg), sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg)); + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / + MLX5_IB_UMR_OCTOWORD); break; case IB_QPT_XRC_TGT: @@ -3633,13 +3636,15 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr) + struct mlx5_ib_mr *mr, bool umr_inline) { int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + umr->flags |= MLX5_UMR_INLINE; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -3823,6 +3828,24 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); } +static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp, + struct mlx5_ib_mr *mr, int mr_list_size) +{ + void *qend = qp->sq.qend; + void *addr = mr->descs; + int copy; + + if (unlikely(seg + mr_list_size > qend)) { + copy = qend - seg; + memcpy(seg, addr, copy); + addr += copy; + mr_list_size -= copy; + seg = mlx5_get_send_wqe(qp, 0); + } + memcpy(seg, addr, mr_list_size); + seg += mr_list_size; +} + static __be32 send_ieth(struct ib_send_wr *wr) { switch (wr->opcode) { @@ -4217,6 +4240,8 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + int mr_list_size = mr->ndescs * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), @@ -4224,7 +4249,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, return -EINVAL; } - set_reg_umr_seg(*seg, mr); + set_reg_umr_seg(*seg, mr, umr_inline); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); 
*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) @@ -4236,10 +4261,14 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_reg_data_seg(*seg, mr, pd); - *seg += sizeof(struct mlx5_wqe_data_seg); - *size += (sizeof(struct mlx5_wqe_data_seg) / 16); - + if (umr_inline) { + set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size); + *size += get_xlt_octo(mr_list_size); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } return 0; } -- cgit v1.2.3 From 254361c1890e67486cd957e9072e518b1c464e27 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:42:21 -0700 Subject: IB/hfi1: Prevent LNI hang when LCB can't obtain lanes When the LCB isn't able to get any lanes operational on the first transition into mission mode, the link transfer active never happens and the LNI stays in the polling state indefinitely. Reset LCB upon receiving an 8051 interrupt for LCB to try to obtain lanes with firmware version 1.25.0 or later. Also, update the LCB reset value in other parts of the code with a macro defined to make the code more maintainable and rename functions with the link_width label to link_mode to reflect the fact that those functions set and read link related data not just the link width. Reviewed-by: Michael J. 
Ruhl Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 51 ++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/chip.h | 15 +++++++-- drivers/infiniband/hw/hfi1/chip_registers.h | 7 ++-- 3 files changed, 53 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e6a60fa59f2b..cb9095d2cbc9 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1032,8 +1032,8 @@ static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, u8 *vcu, u16 *vl15buf, u8 *crc_sizes); static void read_vc_remote_link_width(struct hfi1_devdata *dd, u8 *remote_tx_rate, u16 *link_widths); -static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, - u8 *flag_bits, u16 *link_widths); +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, u8 *device_rev); static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); @@ -6350,6 +6350,18 @@ static void handle_8051_request(struct hfi1_pportdata *ppd) dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + case HREQ_LCB_RESET: + /* Put the LCB, RX FPE and TX FPE into reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_INTO_RESET); + /* Make sure the write completed */ + (void)read_csr(dd, DCC_CFG_RESET); + /* Hold the reset long enough to take effect */ + udelay(1); + /* Take the LCB, RX FPE and TX FPE out of reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); + hreq_response(dd, HREQ_SUCCESS, 0); + break; case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); @@ -6461,8 +6473,7 @@ static void lcb_shutdown(struct hfi1_devdata *dd, 
int abort) dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); reg = read_csr(dd, DCC_CFG_RESET); write_csr(dd, DCC_CFG_RESET, reg | - (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) | - (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + DCC_CFG_RESET_RESET_LCB | DCC_CFG_RESET_RESET_RX_FPE); (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ if (!abort) { udelay(1); /* must hold for the longer of 16cclks or 20ns */ @@ -6527,7 +6538,7 @@ static void _dc_start(struct hfi1_devdata *dd) __func__); /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ - write_csr(dd, DCC_CFG_RESET, 0x10); + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); /* lcb_shutdown() with abort=1 does not restore these */ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); dd->dc_shutdown = 0; @@ -7348,7 +7359,7 @@ static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, u8 misc_bits, local_flags; u16 active_tx, active_rx; - read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths); + read_vc_local_link_mode(dd, &misc_bits, &local_flags, &widths); tx = widths >> 12; rx = (widths >> 8) & 0xf; @@ -8820,29 +8831,29 @@ static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, GENERAL_CONFIG, frame); } -static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, - u8 *flag_bits, u16 *link_widths) +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) { u32 frame; - read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, &frame); *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; } -static int write_vc_local_link_width(struct hfi1_devdata *dd, - u8 misc_bits, - u8 flag_bits, - u16 link_widths) +static int write_vc_local_link_mode(struct 
hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) { u32 frame; frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT | (u32)link_widths << LINK_WIDTH_SHIFT; - return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, frame); } @@ -9312,8 +9323,16 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) if (loopback == LOOPBACK_SERDES) misc_bits |= 1 << LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT; - ret = write_vc_local_link_width(dd, misc_bits, 0, - opa_to_vc_link_widths( + /* + * An external device configuration request is used to reset the LCB + * to retry to obtain operational lanes when the first attempt is + * unsuccesful. + */ + if (dd->dc8051_ver >= dc8051_ver(1, 25, 0)) + misc_bits |= 1 << EXT_CFG_LCB_RESET_SUPPORTED_SHIFT; + + ret = write_vc_local_link_mode(dd, misc_bits, 0, + opa_to_vc_link_widths( ppd->link_width_enabled)); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index c0d70f255050..fdf389e46e19 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -196,6 +196,15 @@ #define LSTATE_ARMED 0x3 #define LSTATE_ACTIVE 0x4 +/* DCC_CFG_RESET reset states */ +#define LCB_RX_FPE_TX_FPE_INTO_RESET (DCC_CFG_RESET_RESET_LCB | \ + DCC_CFG_RESET_RESET_TX_FPE | \ + DCC_CFG_RESET_RESET_RX_FPE | \ + DCC_CFG_RESET_ENABLE_CCLK_BCC) + /* 0x17 */ + +#define LCB_RX_FPE_TX_FPE_OUT_OF_RESET DCC_CFG_RESET_ENABLE_CCLK_BCC /* 0x10 */ + /* DC8051_STS_CUR_STATE port values (physical link states) */ #define PLS_DISABLED 0x30 #define PLS_OFFLINE 0x90 @@ -283,6 +292,7 @@ #define HREQ_SET_TX_EQ_ABS 0x04 #define HREQ_SET_TX_EQ_REL 0x05 #define HREQ_ENABLE 0x06 +#define HREQ_LCB_RESET 0x07 #define HREQ_CONFIG_DONE 0xfe #define HREQ_INTERFACE_TEST 0xff @@ -383,7 +393,7 @@ #define TX_SETTINGS 0x06 #define 
VERIFY_CAP_LOCAL_PHY 0x07 #define VERIFY_CAP_LOCAL_FABRIC 0x08 -#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 +#define VERIFY_CAP_LOCAL_LINK_MODE 0x09 #define LOCAL_DEVICE_ID 0x0a #define RESERVED_REGISTERS 0x0b #define LOCAL_LNI_INFO 0x0c @@ -584,8 +594,9 @@ enum { #define LOOPBACK_LCB 2 #define LOOPBACK_CABLE 3 /* external cable */ -/* set up serdes bit in MISC_CONFIG_BITS */ +/* set up bits in MISC_CONFIG_BITS */ #define LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT 0 +#define EXT_CFG_LCB_RESET_SUPPORTED_SHIFT 3 /* read and write hardware registers */ u64 read_csr(const struct hfi1_devdata *dd, u32 offset); diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index 793514f1d15f..da598b5fe8f6 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -97,8 +97,11 @@ #define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 #define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull #define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) -#define DCC_CFG_RESET_RESET_LCB_SHIFT 0 -#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2 +#define DCC_CFG_RESET_RESET_LCB BIT_ULL(0) +#define DCC_CFG_RESET_RESET_TX_FPE BIT_ULL(1) +#define DCC_CFG_RESET_RESET_RX_FPE BIT_ULL(2) +#define DCC_CFG_RESET_RESET_8051 BIT_ULL(3) +#define DCC_CFG_RESET_ENABLE_CCLK_BCC BIT_ULL(4) #define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) #define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 #define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 -- cgit v1.2.3 From 48e0a6559dd8e6aa87841270868423b23076220e Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 2 May 2018 06:42:29 -0700 Subject: IB/hfi1: Return actual error value from program_rcvarray() A failure of program_rcvarray() is treated inconsistently by the calling function. In one case the error is returned, in a second case, the error is overwritten with EFAULT. In both cases the code path is doing the same thing, allocating memory for groups, so it should be consistent. 
Make the error path consistent and return the error generated by program_rcvarray(). Reviewed-by: Harish Chegondi Fixes: 7e7a436ecb6e ("staging/hfi1: Add TID entry program function body") Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 0d5330b7353d..6a4c5142515a 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -437,7 +437,6 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, hfi1_cdbg(TID, "Failed to program RcvArray entries %d", ret); - ret = -EFAULT; goto unlock; } else if (ret > 0) { if (grp->used == grp->size) -- cgit v1.2.3 From 959f2d172daa0133e4d5b7a64344f8b2c2d87fc1 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 2 May 2018 06:42:36 -0700 Subject: IB/hfi1: Complete check for locally terminated smp For lid routed packets 'hop_cnt' is zero, therefore current test is incomplete. Fix it by using local mad check for both lid routed and direct routed MADs. 
Reviewed-by: Mike Mariciniszyn Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index e9962c65c68f..983b5794a660 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1238,7 +1238,7 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, } static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, - u32 logical_state, u32 phys_state) + u32 logical_state, u32 phys_state, int local_mad) { struct hfi1_devdata *dd = ppd->dd; u32 link_state; @@ -1314,7 +1314,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, * Don't send a reply if the response would be sent * through the disabled port. */ - if (link_state == HLS_DN_DISABLE && smp->hop_cnt) + if (link_state == HLS_DN_DISABLE && !local_mad) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; break; case IB_PORT_ARMED: @@ -1350,7 +1350,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, */ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 max_len) + u32 *resp_len, u32 max_len, int local_mad) { struct opa_port_info *pi = (struct opa_port_info *)data; struct ib_event event; @@ -1634,7 +1634,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, */ if (!invalid) { - ret = set_port_states(ppd, smp, ls_new, ps_new); + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); if (ret) return ret; } @@ -2085,7 +2085,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 
max_len) + u32 *resp_len, u32 max_len, int local_mad) { u32 nports = OPA_AM_NPORT(am); u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); @@ -2122,7 +2122,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, } if (!invalid) { - ret = set_port_states(ppd, smp, ls_new, ps_new); + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); if (ret) return ret; } @@ -4190,7 +4190,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 max_len) + u32 *resp_len, u32 max_len, int local_mad) { int ret; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4198,7 +4198,7 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, switch (attr_id) { case IB_SMP_ATTR_PORT_INFO: ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, - resp_len, max_len); + resp_len, max_len, local_mad); break; case IB_SMP_ATTR_PKEY_TABLE: ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, @@ -4222,7 +4222,7 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_set_opa_psi(smp, am, data, ibdev, port, - resp_len, max_len); + resp_len, max_len, local_mad); break; case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: ret = __subn_set_opa_bct(smp, am, data, ibdev, port, @@ -4314,7 +4314,7 @@ static int subn_get_opa_aggregate(struct opa_smp *smp, static int subn_set_opa_aggregate(struct opa_smp *smp, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, int local_mad) { int i; u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; @@ -4344,7 +4344,9 @@ static int subn_set_opa_aggregate(struct opa_smp *smp, } (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, - ibdev, port, NULL, (u32)agg_data_len); + ibdev, port, NULL, (u32)agg_data_len, + local_mad); + if (smp->status & IB_SMP_INVALID_FIELD) break; if 
(smp->status & ~IB_SMP_DIRECTION) { @@ -4519,7 +4521,7 @@ static int hfi1_pkey_validation_pma(struct hfi1_ibport *ibp, static int process_subn_opa(struct ib_device *ibdev, int mad_flags, u8 port, const struct opa_mad *in_mad, struct opa_mad *out_mad, - u32 *resp_len) + u32 *resp_len, int local_mad) { struct opa_smp *smp = (struct opa_smp *)out_mad; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4588,11 +4590,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, default: ret = subn_set_opa_sma(attr_id, smp, am, data, ibdev, port, resp_len, - data_size); + data_size, local_mad); break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_set_opa_aggregate(smp, ibdev, port, - resp_len); + resp_len, local_mad); break; } break; @@ -4832,6 +4834,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, { int ret; int pkey_idx; + int local_mad = 0; u32 resp_len = 0; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4846,13 +4849,14 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: case IB_MGMT_CLASS_SUBN_LID_ROUTED: - if (is_local_mad(ibp, in_mad, in_wc)) { + local_mad = is_local_mad(ibp, in_mad, in_wc); + if (local_mad) { ret = opa_local_smp_check(ibp, in_wc); if (ret) return IB_MAD_RESULT_FAILURE; } ret = process_subn_opa(ibdev, mad_flags, port, in_mad, - out_mad, &resp_len); + out_mad, &resp_len, local_mad); goto bail; case IB_MGMT_CLASS_PERF_MGMT: ret = hfi1_pkey_validation_pma(ibp, in_mad, in_wc); -- cgit v1.2.3 From 8c79d8223bb11b2f005695a32ddd3985de97727c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 2 May 2018 06:42:44 -0700 Subject: IB/hfi1: Fix fault injection init/exit issues There are config dependent code paths that expose panics in unload paths both in this file and in debugfs_remove_recursive() because CONFIG_FAULT_INJECTION and CONFIG_FAULT_INJECTION_DEBUG_FS can be set independently. 
Having CONFIG_FAULT_INJECTION set and CONFIG_FAULT_INJECTION_DEBUG_FS reset causes fault_create_debugfs_attr() to return an error. The debugfs.c routines tolerate failures, but the module unload panics dereferencing a NULL in the two exit routines. If that is fixed, the dir passed to debugfs_remove_recursive comes from a memory location that was freed and potentially reused causing a segfault or corrupting memory. Here is an example of the NULL deref panic: [66866.286829] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 [66866.295602] IP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.301138] PGD 858496067 P4D 858496067 PUD 8433a7067 PMD 0 [66866.307452] Oops: 0000 [#1] SMP [66866.310953] Modules linked in: hfi1(-) rdmavt rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm iw_cm ib_cm ib_core rpcsec_gss_krb5 nfsv4 dns_resolver nfsv3 nfs fscache sb_edac x86_pkg_temp_thermal intel_powerclamp vfat fat coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel iTCO_wdt iTCO_vendor_support crypto_simd mei_me glue_helper cryptd mxm_wmi ipmi_si pcspkr lpc_ich sg mei ioatdma ipmi_devintf i2c_i801 mfd_core shpchp ipmi_msghandler wmi acpi_power_meter acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt igb fb_sys_fops ttm ahci ptp crc32c_intel libahci pps_core drm dca libata i2c_algo_bit i2c_core [last unloaded: opa_vnic] [66866.385551] CPU: 8 PID: 7470 Comm: rmmod Not tainted 4.14.0-mam-tid-rdma #2 [66866.393317] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016 [66866.405252] task: ffff88084f28c380 task.stack: ffffc90008454000 [66866.411866] RIP: 0010:hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.417984] RSP: 0018:ffffc90008457da0 EFLAGS: 00010202 [66866.423812] RAX: 0000000000000000 RBX: ffff880857de0000 RCX: 0000000180040001 [66866.431773] RDX: 0000000180040002 RSI: 
ffffea0021088200 RDI: 0000000040000000 [66866.439734] RBP: ffffc90008457da8 R08: ffff88084220e000 R09: 0000000180040001 [66866.447696] R10: 000000004220e001 R11: ffff88084220e000 R12: ffff88085a31c000 [66866.455657] R13: ffffffffa07c9820 R14: ffffffffa07c9890 R15: ffff881059d78100 [66866.463618] FS: 00007f6876047740(0000) GS:ffff88085f800000(0000) knlGS:0000000000000000 [66866.472644] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [66866.479053] CR2: 0000000000000088 CR3: 0000000856357006 CR4: 00000000001606e0 [66866.487013] Call Trace: [66866.489747] remove_one+0x1f/0x220 [hfi1] [66866.494221] pci_device_remove+0x39/0xc0 [66866.498596] device_release_driver_internal+0x141/0x210 [66866.504424] driver_detach+0x3f/0x80 [66866.508409] bus_remove_driver+0x55/0xd0 [66866.512784] driver_unregister+0x2c/0x50 [66866.517164] pci_unregister_driver+0x2a/0xa0 [66866.521934] hfi1_mod_cleanup+0x10/0xaa2 [hfi1] [66866.526988] SyS_delete_module+0x171/0x250 [66866.531558] do_syscall_64+0x67/0x1b0 [66866.535644] entry_SYSCALL64_slow_path+0x25/0x25 [66866.540792] RIP: 0033:0x7f6875525c27 [66866.544777] RSP: 002b:00007ffd48528e78 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [66866.553224] RAX: ffffffffffffffda RBX: 0000000001cc01d0 RCX: 00007f6875525c27 [66866.561185] RDX: 00007f6875596000 RSI: 0000000000000800 RDI: 0000000001cc0238 [66866.569146] RBP: 0000000000000000 R08: 00007f68757e9060 R09: 00007f6875596000 [66866.577120] R10: 00007ffd48528c00 R11: 0000000000000206 R12: 00007ffd48529db4 [66866.585080] R13: 0000000000000000 R14: 0000000001cc01d0 R15: 0000000001cc0010 [66866.593040] Code: 90 0f 1f 44 00 00 48 83 3d a3 8b 03 00 00 55 48 89 e5 53 48 89 fb 74 4e 48 8d bf 18 0c 00 00 e8 9d f2 ff ff 48 8b 83 20 0c 00 00 <48> 8b b8 88 00 00 00 e8 2a 21 b3 e0 48 8b bb 20 0c 00 00 e8 0e [66866.614127] RIP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] RSP: ffffc90008457da0 [66866.621885] CR2: 0000000000000088 [66866.625618] ---[ end trace c4817425783fb092 ]--- Fix by insuring that upon failure 
from fault_create_debugfs_attr() the parent pointer for the routines is always set to NULL and guards added in the exit routines to ensure that debugfs_remove_recursive() is not called when the parent pointer is NULL. Fixes: 0181ce31b260 ("IB/hfi1: Add receive fault injection feature") Cc: # 4.14.x Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/debugfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 852173bf05d0..5343960610fe 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1227,7 +1227,8 @@ DEBUGFS_FILE_OPS(fault_stats); static void fault_exit_opcode_debugfs(struct hfi1_ibdev *ibd) { - debugfs_remove_recursive(ibd->fault_opcode->dir); + if (ibd->fault_opcode) + debugfs_remove_recursive(ibd->fault_opcode->dir); kfree(ibd->fault_opcode); ibd->fault_opcode = NULL; } @@ -1255,6 +1256,7 @@ static int fault_init_opcode_debugfs(struct hfi1_ibdev *ibd) &ibd->fault_opcode->attr); if (IS_ERR(ibd->fault_opcode->dir)) { kfree(ibd->fault_opcode); + ibd->fault_opcode = NULL; return -ENOENT; } @@ -1278,7 +1280,8 @@ fail: static void fault_exit_packet_debugfs(struct hfi1_ibdev *ibd) { - debugfs_remove_recursive(ibd->fault_packet->dir); + if (ibd->fault_packet) + debugfs_remove_recursive(ibd->fault_packet->dir); kfree(ibd->fault_packet); ibd->fault_packet = NULL; } @@ -1304,6 +1307,7 @@ static int fault_init_packet_debugfs(struct hfi1_ibdev *ibd) &ibd->fault_opcode->attr); if (IS_ERR(ibd->fault_packet->dir)) { kfree(ibd->fault_packet); + ibd->fault_packet = NULL; return -ENOENT; } -- cgit v1.2.3 From e4607073ffa5c72279370ba91113b76e70f62e16 Mon Sep 17 00:00:00 2001 From: "Michael J.
Ruhl" Date: Wed, 2 May 2018 06:42:59 -0700 Subject: IB/hfi1: Return correct value for device state The driver_pstate() function is used to map internal driver state information to externally defined states. The VERIFY_CAP and GOING_UP states are config/training states, but the mapping routine returns the POLLING value. Update the return values for VERIFY_CAP and GOING_UP to return the correct value: TRAINING. Reviewed-by: Sebastian Sanchez Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index cb9095d2cbc9..4cd422ff92f8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -10510,9 +10510,9 @@ u32 driver_pstate(struct hfi1_pportdata *ppd) case HLS_DN_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_VERIFY_CAP: - return IB_PORTPHYSSTATE_POLLING; + return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_UP: - return IB_PORTPHYSSTATE_POLLING; + return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_LINK_COOLDOWN: -- cgit v1.2.3 From a93a0a31111231bb1949f4a83b17238f0fa32d6a Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 2 May 2018 06:43:07 -0700 Subject: IB/hfi1: Reorder incorrect send context disable User send context integrity bits are cleared before the context is disabled. If the send context is still processing data, any packets that need those integrity bits will cause an error and halt the send context. During the disable handling, the driver waits for the context to drain. If the context is halted, the driver will eventually time out because the context won't drain and then incorrectly bounce the link. Reorder the bit clearing and the context disable.
Examine the software state and send context status as well as the egress status to determine if a send context is in the halted state. Promote the check macros to static functions for consistency with the new check and to follow kernel style. Remove an unused define that refers to the egress timeout. Cc: # 4.9.x Reviewed-by: Mitko Haralanov Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/pio.c | 44 +++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1b778fd16a32..c9d23c37a371 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -689,8 +689,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) * checks to default and disable the send context. 
*/ if (uctxt->sc) { - set_pio_integrity(uctxt->sc); sc_disable(uctxt->sc); + set_pio_integrity(uctxt->sc); } hfi1_free_ctxt_rcv_groups(uctxt); diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 40dac4d16eb8..9cac15d10c4f 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -50,8 +50,6 @@ #include "qp.h" #include "trace.h" -#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */ - #define SC(name) SEND_CTXT_##name /* * Send Context functions @@ -961,15 +959,40 @@ void sc_disable(struct send_context *sc) } /* return SendEgressCtxtStatus.PacketOccupancy */ -#define packet_occupancy(r) \ - (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\ - >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT) +static u64 packet_occupancy(u64 reg) +{ + return (reg & + SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK) + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT; +} /* is egress halted on the context? */ -#define egress_halted(r) \ - ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK) +static bool egress_halted(u64 reg) +{ + return !!(reg & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK); +} -/* wait for packet egress, optionally pause for credit return */ +/* is the send context halted? */ +static bool is_sc_halted(struct hfi1_devdata *dd, u32 hw_context) +{ + return !!(read_kctxt_csr(dd, hw_context, SC(STATUS)) & + SC(STATUS_CTXT_HALTED_SMASK)); +} + +/** + * sc_wait_for_packet_egress + * @sc: valid send context + * @pause: wait for credit return + * + * Wait for packet egress, optionally pause for credit return + * + * Egress halt and Context halt are not necessarily the same thing, so + * check for both. + * + * NOTE: The context halt bit may not be set immediately. Because of this, + * it is necessary to check the SW SFC_HALTED bit (set in the IRQ) and the HW + * context bit to determine if the context is halted. 
+ */ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) { struct hfi1_devdata *dd = sc->dd; @@ -981,8 +1004,9 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) reg_prev = reg; reg = read_csr(dd, sc->hw_context * 8 + SEND_EGRESS_CTXT_STATUS); - /* done if egress is stopped */ - if (egress_halted(reg)) + /* done if any halt bits, SW or HW are set */ + if (sc->flags & SCF_HALTED || + is_sc_halted(dd, sc->hw_context) || egress_halted(reg)) break; reg = packet_occupancy(reg); if (reg == 0) -- cgit v1.2.3 From 8d3e71136a080d007620472f50c7b3e63ba0f5cf Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 2 May 2018 06:43:15 -0700 Subject: IB/{hfi1, qib}: Add handling of kernel restart A warm restart will fail to unload the driver, leaving link state potentially flapping up to the point the BIOS resets the adapter. Correct the issue by hooking the shutdown pci method, which will bring port down. Cc: # 4.9.x Reviewed-by: Mike Marciniszyn Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/init.c | 13 +++++++++++++ drivers/infiniband/hw/qib/qib.h | 1 + drivers/infiniband/hw/qib/qib_init.c | 13 +++++++++++++ 4 files changed, 28 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index cac2c62bc42d..9c97c180c35e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1856,6 +1856,7 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) #define HFI1_HAS_SDMA_TIMEOUT 0x8 #define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ #define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ +#define HFI1_SHUTDOWN 0x100 /* device is shutting down */ /* IB dword length mask in PBC (lower 11 bits); same for all chips */ #define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) diff --git 
a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 6309edf811df..790542ce89a5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1058,6 +1058,10 @@ static void shutdown_device(struct hfi1_devdata *dd) unsigned pidx; int i; + if (dd->flags & HFI1_SHUTDOWN) + return; + dd->flags |= HFI1_SHUTDOWN; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1391,6 +1395,7 @@ void hfi1_disable_after_error(struct hfi1_devdata *dd) static void remove_one(struct pci_dev *); static int init_one(struct pci_dev *, const struct pci_device_id *); +static void shutdown_one(struct pci_dev *); #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " #define PFX DRIVER_NAME ": " @@ -1407,6 +1412,7 @@ static struct pci_driver hfi1_pci_driver = { .name = DRIVER_NAME, .probe = init_one, .remove = remove_one, + .shutdown = shutdown_one, .id_table = hfi1_pci_tbl, .err_handler = &hfi1_pci_err_handler, }; @@ -1816,6 +1822,13 @@ static void remove_one(struct pci_dev *pdev) postinit_cleanup(dd); } +static void shutdown_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + shutdown_device(dd); +} + /** * hfi1_create_rcvhdrq - create a receive header queue * @dd: the hfi1_ib device diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 46072455130c..43a68d7b51bb 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1228,6 +1228,7 @@ static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port) #define QIB_BADINTR 0x8000 /* severe interrupt problems */ #define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */ #define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */ +#define QIB_SHUTDOWN 0x40000 /* device is shutting down */ /* * values for ppd->lflags (_ib_port_ related flags) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 
6c68f8a97018..015520289735 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -841,6 +841,10 @@ static void qib_shutdown_device(struct qib_devdata *dd) struct qib_pportdata *ppd; unsigned pidx; + if (dd->flags & QIB_SHUTDOWN) + return; + dd->flags |= QIB_SHUTDOWN; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1182,6 +1186,7 @@ void qib_disable_after_error(struct qib_devdata *dd) static void qib_remove_one(struct pci_dev *); static int qib_init_one(struct pci_dev *, const struct pci_device_id *); +static void qib_shutdown_one(struct pci_dev *); #define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " #define PFX QIB_DRV_NAME ": " @@ -1199,6 +1204,7 @@ static struct pci_driver qib_driver = { .name = QIB_DRV_NAME, .probe = qib_init_one, .remove = qib_remove_one, + .shutdown = qib_shutdown_one, .id_table = qib_pci_tbl, .err_handler = &qib_pci_err_handler, }; @@ -1549,6 +1555,13 @@ static void qib_remove_one(struct pci_dev *pdev) qib_postinit_cleanup(dd); } +static void qib_shutdown_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + + qib_shutdown_device(dd); +} + /** * qib_create_rcvhdrq - create a receive header queue * @dd: the qlogic_ib device -- cgit v1.2.3 From a74d5307caba42fe9bbc180feb03003f14f9f45c Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Wed, 2 May 2018 06:43:24 -0700 Subject: IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues which not only fragment the code but also created user confusion. Furthermore, it suffered from the following issues: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets is fault_opcode, which did not allow for random packet drops on all egressing packets. 2. 
The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating an opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of method used the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 3. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value. When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allows for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5.
A new control file - "direction" - is provided to allow the user to control the direction of packets, which are subject to fault injection. 6. A new control file - "skip_usec" - is added that would allow the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being set up. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A large enough packet drop percentage would cause the kernel to crash because the driver would be stuck printing errors.
Reviewed-by: Dennis Dalessandro Reviewed-by: Don Hiatt Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 10 +- drivers/infiniband/hw/hfi1/chip.c | 3 +- drivers/infiniband/hw/hfi1/debugfs.c | 296 +-------------------------- drivers/infiniband/hw/hfi1/debugfs.h | 93 +++++---- drivers/infiniband/hw/hfi1/driver.c | 20 +- drivers/infiniband/hw/hfi1/fault.c | 375 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/fault.h | 109 ++++++++++ drivers/infiniband/hw/hfi1/hfi.h | 10 +- drivers/infiniband/hw/hfi1/verbs.c | 13 +- drivers/infiniband/hw/hfi1/verbs.h | 6 +- 10 files changed, 577 insertions(+), 358 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/fault.c create mode 100644 drivers/infiniband/hw/hfi1/fault.h (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index ce4010bad982..f451ba912f47 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -14,7 +14,15 @@ hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ verbs_txreq.o vnic_main.o vnic_sdma.o -hfi1-$(CONFIG_DEBUG_FS) += debugfs.o + +ifdef CONFIG_DEBUG_FS +hfi1-y += debugfs.o +ifdef CONFIG_FAULT_INJECTION +ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +hfi1-y += fault.o +endif +endif +endif CFLAGS_trace.o = -I$(src) ifdef MVERSION diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 4cd422ff92f8..582cf7eb779f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. 
When using or * redistributing this file, you may do so under either license. @@ -65,6 +65,7 @@ #include "aspm.h" #include "affinity.h" #include "debugfs.h" +#include "fault.h" #define NUM_IB_PORTS 1 diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 5343960610fe..9f992ae36c89 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -60,15 +60,13 @@ #include "device.h" #include "qp.h" #include "sdma.h" +#include "fault.h" static struct dentry *hfi1_dbg_root; /* wrappers to enforce srcu in seq file */ -static ssize_t hfi1_seq_read( - struct file *file, - char __user *buf, - size_t size, - loff_t *ppos) +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) { struct dentry *d = file->f_path.dentry; ssize_t r; @@ -81,10 +79,7 @@ static ssize_t hfi1_seq_read( return r; } -static loff_t hfi1_seq_lseek( - struct file *file, - loff_t offset, - int whence) +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence) { struct dentry *d = file->f_path.dentry; loff_t r; @@ -100,48 +95,6 @@ static loff_t hfi1_seq_lseek( #define private2dd(file) (file_inode(file)->i_private) #define private2ppd(file) (file_inode(file)->i_private) -#define DEBUGFS_SEQ_FILE_OPS(name) \ -static const struct seq_operations _##name##_seq_ops = { \ - .start = _##name##_seq_start, \ - .next = _##name##_seq_next, \ - .stop = _##name##_seq_stop, \ - .show = _##name##_seq_show \ -} - -#define DEBUGFS_SEQ_FILE_OPEN(name) \ -static int _##name##_open(struct inode *inode, struct file *s) \ -{ \ - struct seq_file *seq; \ - int ret; \ - ret = seq_open(s, &_##name##_seq_ops); \ - if (ret) \ - return ret; \ - seq = s->private_data; \ - seq->private = inode->i_private; \ - return 0; \ -} - -#define DEBUGFS_FILE_OPS(name) \ -static const struct file_operations _##name##_file_ops = { \ - .owner = THIS_MODULE, \ - .open = _##name##_open, \ - .read = hfi1_seq_read, \ - .llseek = 
hfi1_seq_lseek, \ - .release = seq_release \ -} - -#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ -do { \ - struct dentry *ent; \ - ent = debugfs_create_file(name, mode, parent, \ - data, ops); \ - if (!ent) \ - pr_warn("create of %s failed\n", name); \ -} while (0) - -#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ - DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) - static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) { struct hfi1_opcode_stats_perctx *opstats; @@ -1160,236 +1113,6 @@ DEBUGFS_SEQ_FILE_OPS(sdma_cpu_list); DEBUGFS_SEQ_FILE_OPEN(sdma_cpu_list) DEBUGFS_FILE_OPS(sdma_cpu_list); -#ifdef CONFIG_FAULT_INJECTION -static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) -{ - struct hfi1_opcode_stats_perctx *opstats; - - if (*pos >= ARRAY_SIZE(opstats->stats)) - return NULL; - return pos; -} - -static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct hfi1_opcode_stats_perctx *opstats; - - ++*pos; - if (*pos >= ARRAY_SIZE(opstats->stats)) - return NULL; - return pos; -} - -static void _fault_stats_seq_stop(struct seq_file *s, void *v) -{ -} - -static int _fault_stats_seq_show(struct seq_file *s, void *v) -{ - loff_t *spos = v; - loff_t i = *spos, j; - u64 n_packets = 0, n_bytes = 0; - struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; - struct hfi1_devdata *dd = dd_from_dev(ibd); - struct hfi1_ctxtdata *rcd; - - for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { - rcd = hfi1_rcd_get_by_index(dd, j); - if (rcd) { - n_packets += rcd->opstats->stats[i].n_packets; - n_bytes += rcd->opstats->stats[i].n_bytes; - } - hfi1_rcd_put(rcd); - } - for_each_possible_cpu(j) { - struct hfi1_opcode_stats_perctx *sp = - per_cpu_ptr(dd->tx_opstats, j); - - n_packets += sp->stats[i].n_packets; - n_bytes += sp->stats[i].n_bytes; - } - if (!n_packets && !n_bytes) - return SEQ_SKIP; - if (!ibd->fault_opcode->n_rxfaults[i] && - !ibd->fault_opcode->n_txfaults[i]) - return 
SEQ_SKIP; - seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, - (unsigned long long)n_packets, - (unsigned long long)n_bytes, - (unsigned long long)ibd->fault_opcode->n_rxfaults[i], - (unsigned long long)ibd->fault_opcode->n_txfaults[i]); - return 0; -} - -DEBUGFS_SEQ_FILE_OPS(fault_stats); -DEBUGFS_SEQ_FILE_OPEN(fault_stats); -DEBUGFS_FILE_OPS(fault_stats); - -static void fault_exit_opcode_debugfs(struct hfi1_ibdev *ibd) -{ - if (ibd->fault_opcode) - debugfs_remove_recursive(ibd->fault_opcode->dir); - kfree(ibd->fault_opcode); - ibd->fault_opcode = NULL; -} - -static int fault_init_opcode_debugfs(struct hfi1_ibdev *ibd) -{ - struct dentry *parent = ibd->hfi1_ibdev_dbg; - - ibd->fault_opcode = kzalloc(sizeof(*ibd->fault_opcode), GFP_KERNEL); - if (!ibd->fault_opcode) - return -ENOMEM; - - ibd->fault_opcode->attr.interval = 1; - ibd->fault_opcode->attr.require_end = ULONG_MAX; - ibd->fault_opcode->attr.stacktrace_depth = 32; - ibd->fault_opcode->attr.dname = NULL; - ibd->fault_opcode->attr.verbose = 0; - ibd->fault_opcode->fault_by_opcode = false; - ibd->fault_opcode->opcode = 0; - ibd->fault_opcode->mask = 0xff; - - ibd->fault_opcode->dir = - fault_create_debugfs_attr("fault_opcode", - parent, - &ibd->fault_opcode->attr); - if (IS_ERR(ibd->fault_opcode->dir)) { - kfree(ibd->fault_opcode); - ibd->fault_opcode = NULL; - return -ENOENT; - } - - DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault_opcode->dir, ibd); - if (!debugfs_create_bool("fault_by_opcode", 0600, - ibd->fault_opcode->dir, - &ibd->fault_opcode->fault_by_opcode)) - goto fail; - if (!debugfs_create_x8("opcode", 0600, ibd->fault_opcode->dir, - &ibd->fault_opcode->opcode)) - goto fail; - if (!debugfs_create_x8("mask", 0600, ibd->fault_opcode->dir, - &ibd->fault_opcode->mask)) - goto fail; - - return 0; -fail: - fault_exit_opcode_debugfs(ibd); - return -ENOMEM; -} - -static void fault_exit_packet_debugfs(struct hfi1_ibdev *ibd) -{ - if (ibd->fault_packet) - 
debugfs_remove_recursive(ibd->fault_packet->dir); - kfree(ibd->fault_packet); - ibd->fault_packet = NULL; -} - -static int fault_init_packet_debugfs(struct hfi1_ibdev *ibd) -{ - struct dentry *parent = ibd->hfi1_ibdev_dbg; - - ibd->fault_packet = kzalloc(sizeof(*ibd->fault_packet), GFP_KERNEL); - if (!ibd->fault_packet) - return -ENOMEM; - - ibd->fault_packet->attr.interval = 1; - ibd->fault_packet->attr.require_end = ULONG_MAX; - ibd->fault_packet->attr.stacktrace_depth = 32; - ibd->fault_packet->attr.dname = NULL; - ibd->fault_packet->attr.verbose = 0; - ibd->fault_packet->fault_by_packet = false; - - ibd->fault_packet->dir = - fault_create_debugfs_attr("fault_packet", - parent, - &ibd->fault_opcode->attr); - if (IS_ERR(ibd->fault_packet->dir)) { - kfree(ibd->fault_packet); - ibd->fault_packet = NULL; - return -ENOENT; - } - - if (!debugfs_create_bool("fault_by_packet", 0600, - ibd->fault_packet->dir, - &ibd->fault_packet->fault_by_packet)) - goto fail; - if (!debugfs_create_u64("fault_stats", 0400, - ibd->fault_packet->dir, - &ibd->fault_packet->n_faults)) - goto fail; - - return 0; -fail: - fault_exit_packet_debugfs(ibd); - return -ENOMEM; -} - -static void fault_exit_debugfs(struct hfi1_ibdev *ibd) -{ - fault_exit_opcode_debugfs(ibd); - fault_exit_packet_debugfs(ibd); -} - -static int fault_init_debugfs(struct hfi1_ibdev *ibd) -{ - int ret = 0; - - ret = fault_init_opcode_debugfs(ibd); - if (ret) - return ret; - - ret = fault_init_packet_debugfs(ibd); - if (ret) - fault_exit_opcode_debugfs(ibd); - - return ret; -} - -bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return ibd->fault_suppress_err; -} - -bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx) -{ - bool ret = false; - struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); - - if (!ibd->fault_opcode || !ibd->fault_opcode->fault_by_opcode) - return false; - if (ibd->fault_opcode->opcode != (opcode & ibd->fault_opcode->mask)) - return false; - ret = 
should_fail(&ibd->fault_opcode->attr, 1); - if (ret) { - trace_hfi1_fault_opcode(qp, opcode); - if (rx) - ibd->fault_opcode->n_rxfaults[opcode]++; - else - ibd->fault_opcode->n_txfaults[opcode]++; - } - return ret; -} - -bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) -{ - struct rvt_dev_info *rdi = &packet->rcd->ppd->dd->verbs_dev.rdi; - struct hfi1_ibdev *ibd = dev_from_rdi(rdi); - bool ret = false; - - if (!ibd->fault_packet || !ibd->fault_packet->fault_by_packet) - return false; - - ret = should_fail(&ibd->fault_packet->attr, 1); - if (ret) { - ++ibd->fault_packet->n_faults; - trace_hfi1_fault_packet(packet); - } - return ret; -} -#endif - void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) { char name[sizeof("port0counters") + 1]; @@ -1442,21 +1165,14 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) S_IRUGO : S_IRUGO | S_IWUSR); } -#ifdef CONFIG_FAULT_INJECTION - debugfs_create_bool("fault_suppress_err", 0600, - ibd->hfi1_ibdev_dbg, - &ibd->fault_suppress_err); - fault_init_debugfs(ibd); -#endif + hfi1_fault_init_debugfs(ibd); } void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) { if (!hfi1_dbg_root) goto out; -#ifdef CONFIG_FAULT_INJECTION - fault_exit_debugfs(ibd); -#endif + hfi1_fault_exit_debugfs(ibd); debugfs_remove(ibd->hfi1_ibdev_link); debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); out: diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h index 38c38a98156d..1c91461b108f 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.h +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -1,7 +1,7 @@ #ifndef _HFI1_DEBUGFS_H #define _HFI1_DEBUGFS_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015, 2016, 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -48,51 +48,59 @@ */ struct hfi1_ibdev; -#ifdef CONFIG_DEBUG_FS -void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); -void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); -void hfi1_dbg_init(void); -void hfi1_dbg_exit(void); -#ifdef CONFIG_FAULT_INJECTION -#include -struct fault_opcode { - struct fault_attr attr; - struct dentry *dir; - bool fault_by_opcode; - u64 n_rxfaults[256]; - u64 n_txfaults[256]; - u8 opcode; - u8 mask; -}; - -struct fault_packet { - struct fault_attr attr; - struct dentry *dir; - bool fault_by_packet; - u64 n_faults; -}; - -bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx); -bool hfi1_dbg_fault_packet(struct hfi1_packet *packet); -bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); -#else -static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) -{ - return false; +#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ +do { \ + struct dentry *ent; \ + const char *__name = name; \ + ent = debugfs_create_file(__name, mode, parent, \ + data, ops); \ + if (!ent) \ + pr_warn("create of %s failed\n", __name); \ +} while (0) + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ } -static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, - u32 opcode, bool rx) -{ - return false; +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ } -static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return false; +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = hfi1_seq_read, 
\ + .llseek = hfi1_seq_lseek, \ + .release = seq_release \ } -#endif + +#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ + DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444) + +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos); +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence); + +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); #else static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) @@ -111,13 +119,12 @@ static inline void hfi1_dbg_exit(void) { } -static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) +static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) { return false; } -static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, - u32 opcode, bool rx) +static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) { return false; } diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index bd837a048bf4..e5a57ebd8da4 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -61,6 +61,7 @@ #include "sdma.h" #include "debugfs.h" #include "vnic.h" +#include "fault.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1565,10 +1566,10 @@ void handle_eflags(struct hfi1_packet *packet) */ int process_receive_ib(struct hfi1_packet *packet) { - if (unlikely(hfi1_dbg_fault_packet(packet))) + if (hfi1_setup_9B_packet(packet)) return RHF_RCV_CONTINUE; - if (hfi1_setup_9B_packet(packet)) + if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; trace_hfi1_rcvhdr(packet); @@ -1642,7 +1643,8 @@ int process_receive_error(struct hfi1_packet *packet) /* KHdrHCRCErr -- KDETH packet with a bad HCRC */ if (unlikely( hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) && - rhf_rcv_type_err(packet->rhf) == 3)) + (rhf_rcv_type_err(packet->rhf) == RHF_RCV_TYPE_ERROR || + packet->rhf & RHF_DC_ERR))) return RHF_RCV_CONTINUE; hfi1_setup_ib_header(packet); @@ -1657,10 +1659,10 @@ int process_receive_error(struct hfi1_packet *packet) int kdeth_process_expected(struct hfi1_packet *packet) { - if (unlikely(hfi1_dbg_fault_packet(packet))) + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - hfi1_setup_ib_header(packet); if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); @@ -1671,11 +1673,11 @@ int kdeth_process_expected(struct hfi1_packet *packet) int kdeth_process_eager(struct hfi1_packet *packet) { - hfi1_setup_ib_header(packet); + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) + return RHF_RCV_CONTINUE; if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); - if (unlikely(hfi1_dbg_fault_packet(packet))) - return RHF_RCV_CONTINUE; dd_dev_err(packet->rcd->dd, "Unhandled eager packet received. 
Dropping.\n"); diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c new file mode 100644 index 000000000000..e2290f32c8d9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -0,0 +1,375 @@ +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include + +#include "debugfs.h" +#include "fault.h" +#include "trace.h" + +#define HFI1_FAULT_DIR_TX BIT(0) +#define HFI1_FAULT_DIR_RX BIT(1) +#define HFI1_FAULT_DIR_TXRX (HFI1_FAULT_DIR_TX | HFI1_FAULT_DIR_RX) + +static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _fault_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _fault_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *sp = + per_cpu_ptr(dd->tx_opstats, j); + + n_packets += sp->stats[i].n_packets; + n_bytes += sp->stats[i].n_bytes; + } + if 
(!n_packets && !n_bytes) + return SEQ_SKIP; + if (!ibd->fault->n_rxfaults[i] && !ibd->fault->n_txfaults[i]) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, + (unsigned long long)n_packets, + (unsigned long long)n_bytes, + (unsigned long long)ibd->fault->n_rxfaults[i], + (unsigned long long)ibd->fault->n_txfaults[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(fault_stats); +DEBUGFS_SEQ_FILE_OPEN(fault_stats); +DEBUGFS_FILE_OPS(fault_stats); + +static int fault_opcodes_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t fault_opcodes_write(struct file *file, const char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + /* 1280 = 256 opcodes * 4 chars/opcode + 255 commas + NULL */ + size_t copy, datalen = 1280; + char *data, *token, *ptr, *end; + struct fault *fault = file->private_data; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + copy = min(len, datalen - 1); + if (copy_from_user(data, buf, copy)) + return -EFAULT; + + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + return ret; + ptr = data; + token = ptr; + for (ptr = data; *ptr; ptr = end + 1, token = ptr) { + char *dash; + unsigned long range_start, range_end, i; + bool remove = false; + + end = strchr(ptr, ','); + if (end) + *end = '\0'; + if (token[0] == '-') { + remove = true; + token++; + } + dash = strchr(token, '-'); + if (dash) + *dash = '\0'; + if (kstrtoul(token, 0, &range_start)) + break; + if (dash) { + token = dash + 1; + if (kstrtoul(token, 0, &range_end)) + break; + } else { + range_end = range_start; + } + if (range_start == range_end && range_start == -1UL) { + bitmap_zero(fault->opcodes, sizeof(fault->opcodes) * + BITS_PER_BYTE); + break; + } + for (i = range_start; i <= range_end; i++) { + if (remove) + clear_bit(i, fault->opcodes); + else + set_bit(i, fault->opcodes); + } + if 
(!end) + break; + } + ret = len; + + debugfs_file_put(file->f_path.dentry); + kfree(data); + return ret; +} + +static ssize_t fault_opcodes_read(struct file *file, char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + char *data; + size_t datalen = 1280, size = 0; /* see fault_opcodes_write() */ + unsigned long bit = 0, zero = 0; + struct fault *fault = file->private_data; + size_t bitsize = sizeof(fault->opcodes) * BITS_PER_BYTE; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + return ret; + bit = find_first_bit(fault->opcodes, bitsize); + while (bit < bitsize) { + zero = find_next_zero_bit(fault->opcodes, bitsize, bit); + if (zero - 1 != bit) + size += snprintf(data + size, + datalen - size - 1, + "0x%lx-0x%lx,", bit, zero - 1); + else + size += snprintf(data + size, + datalen - size - 1, "0x%lx,", + bit); + bit = find_next_bit(fault->opcodes, bitsize, zero); + } + debugfs_file_put(file->f_path.dentry); + data[size - 1] = '\n'; + data[size] = '\0'; + ret = simple_read_from_buffer(buf, len, pos, data, size); + kfree(data); + return ret; +} + +static const struct file_operations __fault_opcodes_fops = { + .owner = THIS_MODULE, + .open = fault_opcodes_open, + .read = fault_opcodes_read, + .write = fault_opcodes_write, + .llseek = no_llseek +}; + +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + debugfs_remove_recursive(ibd->fault->dir); + kfree(ibd->fault); + ibd->fault = NULL; +} + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + struct dentry *parent = ibd->hfi1_ibdev_dbg; + + ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL); + if (!ibd->fault) + return -ENOMEM; + + ibd->fault->attr.interval = 1; + ibd->fault->attr.require_end = ULONG_MAX; + ibd->fault->attr.stacktrace_depth = 32; + ibd->fault->attr.dname = NULL; + ibd->fault->attr.verbose = 0; + ibd->fault->enable = false; + ibd->fault->opcode = false; 
+ ibd->fault->fault_skip = 0; + ibd->fault->skip = 0; + ibd->fault->direction = HFI1_FAULT_DIR_TXRX; + ibd->fault->suppress_err = false; + bitmap_zero(ibd->fault->opcodes, + sizeof(ibd->fault->opcodes) * BITS_PER_BYTE); + + ibd->fault->dir = + fault_create_debugfs_attr("fault", parent, + &ibd->fault->attr); + if (IS_ERR(ibd->fault->dir)) { + kfree(ibd->fault); + ibd->fault = NULL; + return -ENOENT; + } + + DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd); + if (!debugfs_create_bool("enable", 0600, ibd->fault->dir, + &ibd->fault->enable)) + goto fail; + if (!debugfs_create_bool("suppress_err", 0600, + ibd->fault->dir, + &ibd->fault->suppress_err)) + goto fail; + if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir, + &ibd->fault->opcode)) + goto fail; + if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir, + ibd->fault, &__fault_opcodes_fops)) + goto fail; + if (!debugfs_create_u64("skip_pkts", 0600, + ibd->fault->dir, + &ibd->fault->fault_skip)) + goto fail; + if (!debugfs_create_u64("skip_usec", 0600, + ibd->fault->dir, + &ibd->fault->fault_skip_usec)) + goto fail; + if (!debugfs_create_u8("direction", 0600, ibd->fault->dir, + &ibd->fault->direction)) + goto fail; + + return 0; +fail: + hfi1_fault_exit_debugfs(ibd); + return -ENOMEM; +} + +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + return ibd->fault->suppress_err; + return false; +} + +static bool __hfi1_should_fault(struct hfi1_ibdev *ibd, u32 opcode, + u8 direction) +{ + bool ret = false; + + if (!ibd->fault || !ibd->fault->enable) + return false; + if (!(ibd->fault->direction & direction)) + return false; + if (ibd->fault->opcode) { + if (bitmap_empty(ibd->fault->opcodes, + (sizeof(ibd->fault->opcodes) * + BITS_PER_BYTE))) + return false; + if (!(test_bit(opcode, ibd->fault->opcodes))) + return false; + } + if (ibd->fault->fault_skip_usec && + time_before(jiffies, ibd->fault->skip_usec)) + return false; + if (ibd->fault->fault_skip && 
ibd->fault->skip) { + ibd->fault->skip--; + return false; + } + ret = should_fail(&ibd->fault->attr, 1); + if (ret) { + ibd->fault->skip = ibd->fault->fault_skip; + ibd->fault->skip_usec = jiffies + + usecs_to_jiffies(ibd->fault->fault_skip_usec); + } + return ret; +} + +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) +{ + struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); + + if (__hfi1_should_fault(ibd, opcode, HFI1_FAULT_DIR_TX)) { + trace_hfi1_fault_opcode(qp, opcode); + ibd->fault->n_txfaults[opcode]++; + return true; + } + return false; +} + +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + struct hfi1_ibdev *ibd = &packet->rcd->dd->verbs_dev; + + if (__hfi1_should_fault(ibd, packet->opcode, HFI1_FAULT_DIR_RX)) { + trace_hfi1_fault_packet(packet); + ibd->fault->n_rxfaults[packet->opcode]++; + return true; + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/fault.h b/drivers/infiniband/hw/hfi1/fault.h new file mode 100644 index 000000000000..a83382700a7c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.h @@ -0,0 +1,109 @@ +#ifndef _HFI1_FAULT_H +#define _HFI1_FAULT_H +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#include +#include +#include +#include +#include + +#include "hfi.h" + +struct hfi1_ibdev; + +#if defined(CONFIG_FAULT_INJECTION) && defined(CONFIG_FAULT_INJECTION_DEBUG_FS) +struct fault { + struct fault_attr attr; + struct dentry *dir; + u64 n_rxfaults[(1U << BITS_PER_BYTE)]; + u64 n_txfaults[(1U << BITS_PER_BYTE)]; + u64 fault_skip; + u64 skip; + u64 fault_skip_usec; + unsigned long skip_usec; + unsigned long opcodes[(1U << BITS_PER_BYTE) / BITS_PER_LONG]; + bool enable; + bool suppress_err; + bool opcode; + u8 direction; +}; + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd); +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode); +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet); +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd); + +#else + +static inline int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + return 0; +} + +static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + return false; +} + +static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, + u32 opcode) +{ + return false; +} + +static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return false; +} + +static inline void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ +} +#endif +#endif /* _HFI1_FAULT_H */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9c97c180c35e..9cd758ce7764 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1,7 +1,7 @@ #ifndef _HFI1_KERNEL_H #define _HFI1_KERNEL_H /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -2049,7 +2049,9 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK +#ifndef CONFIG_FAULT_INJECTION | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK +#endif | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK @@ -2062,7 +2064,11 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; if (ctxt_type == SC_USER) - base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + base_sc_integrity |= +#ifndef CONFIG_FAULT_INJECTION + SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK | +#endif + HFI1_PKT_USER_SC_INTEGRITY; else base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c8cf4d4984d3..9554e912af98 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -63,6 +63,7 @@ #include "verbs_txreq.h" #include "debugfs.h" #include "vnic.h" +#include "fault.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -624,10 +625,6 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet, if (hfi1_do_pkey_check(packet)) goto unlock_drop; - if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode, - true))) - goto unlock_drop; - spin_lock_irqsave(&packet->qp->r_lock, flags); packet_handler = qp_ok(packet); if (likely(packet_handler)) @@ -934,8 +931,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, - false))) + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, @@ -1088,7 +1084,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false))) + + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); } diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 2d787b8346ca..081ca52e6621 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -227,9 +227,7 @@ struct hfi1_ibdev { /* per HFI symlinks to above */ struct dentry *hfi1_ibdev_link; #ifdef CONFIG_FAULT_INJECTION - struct fault_opcode *fault_opcode; - struct fault_packet *fault_packet; - bool fault_suppress_err; + struct fault *fault; #endif #endif }; -- cgit v1.2.3 From c872a1f9e3aaf51b091ff19ef6cb1e1a298f3c90 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Wed, 2 May 2018 06:43:31 -0700 Subject: IB/Hfi1: Read CCE Revision register to verify the device is responsive When the Hfi1 device is unresponsive, reading the RcvArrayCnt register will return all 1's. This value is then used to remap the chip's RcvArray. The incorrect all-ones value used in remapping RcvArray will cause a warning, as shown by the trace below: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x70/0xb0 [] warn_slowpath_fmt+0x5c/0x80 [] __ioremap_caller+0x279/0x320 [] ? _dev_info+0x6c/0x90 [] ? hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [] ioremap_wc+0x32/0x40 [] hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [] hfi1_init_dd+0x1d1/0x2440 [hfi1] [] ? pci_write_config_word+0x1c/0x20 Read the CCE revision register first to verify that the WFR device is responsive. If the read returns "all ones", bail out of init and fail the driver load. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J.
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 7 ------- drivers/infiniband/hw/hfi1/pcie.c | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 582cf7eb779f..0fab6df0a345 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15038,13 +15038,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret < 0) goto bail_cleanup; - /* verify that reads actually work, save revision for reset check */ - dd->revision = read_csr(dd, CCE_REVISION); - if (dd->revision == ~(u64)0) { - dd_dev_err(dd, "cannot read chip CSRs\n"); - ret = -EINVAL; - goto bail_cleanup; - } dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) & CCE_REVISION_CHIP_REV_MAJOR_MASK; dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index c1c982908b4b..87bd6b60cb53 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -183,6 +183,14 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) return -ENOMEM; } dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY); + + /* verify that reads actually work, save revision for reset check */ + dd->revision = readq(dd->kregbase1 + CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "Cannot read chip CSRs\n"); + goto nomem; + } + dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count); dd->base2_start = RCV_ARRAY + dd->chip_rcv_array_count * 8; -- cgit v1.2.3 From cf38ea100edfcc0ec0a5797966d69ec4e10fe4f1 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:43:47 -0700 Subject: IB/hfi1: Create common functions for affinity 
CPU mask operations CPU masks are used to keep track of affinity assignments for IRQs and processes. Operations performed on these affinity CPU masks are duplicated throughout the code. Create common functions for affinity CPU mask operations to remove duplicate code. Reviewed-by: Michael J. Ruhl Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 83 +++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index b5fab55cc275..eca9e6354017 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -77,6 +77,58 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) set->gen = 0; } +/* Increment generation of CPU set if needed */ +static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set) +{ + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } +} + +static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set) +{ + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } +} + +/* Get the first CPU from the list of unused CPUs in a CPU set data structure */ +static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff) +{ + int cpu; + + if (!diff || !set) + return -EINVAL; + + _cpu_mask_set_gen_inc(set); + + /* Find out CPUs left in CPU mask */ + cpumask_andnot(diff, &set->mask, &set->used); + + cpu = cpumask_first(diff); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -EINVAL; + else + cpumask_set_cpu(cpu, &set->used); + + return cpu; +} + +static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu) +{ + if (!set) + return; + + cpumask_clear_cpu(cpu, &set->used); + 
_cpu_mask_set_gen_dec(set); +} + /* Initialize non-HT cpu cores mask */ void init_real_cpu_mask(void) { @@ -456,17 +508,12 @@ static int get_irq_affinity(struct hfi1_devdata *dd, if (!zalloc_cpumask_var(&diff, GFP_KERNEL)) return -ENOMEM; - if (cpumask_equal(&set->mask, &set->used)) { - /* - * We've used up all the CPUs, bump up the generation - * and reset the 'used' map - */ - set->gen++; - cpumask_clear(&set->used); + cpu = cpu_mask_set_get_first(set, diff); + if (cpu < 0) { + free_cpumask_var(diff); + dd_dev_err(dd, "Failure to obtain CPU for IRQ\n"); + return cpu; } - cpumask_andnot(diff, &set->mask, &set->used); - cpu = cpumask_first(diff); - cpumask_set_cpu(cpu, &set->used); free_cpumask_var(diff); } @@ -526,10 +573,7 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, if (set) { cpumask_andnot(&set->used, &set->used, &msix->mask); - if (cpumask_empty(&set->used) && set->gen) { - set->gen--; - cpumask_copy(&set->used, &set->mask); - } + _cpu_mask_set_gen_dec(set); } irq_set_affinity_hint(msix->irq, NULL); @@ -640,10 +684,7 @@ int hfi1_get_proc_affinity(int node) * If we've used all available HW threads, clear the mask and start * overloading. 
*/ - if (cpumask_equal(&set->mask, &set->used)) { - set->gen++; - cpumask_clear(&set->used); - } + _cpu_mask_set_gen_inc(set); /* * If NUMA node has CPUs used by interrupt handlers, include them in the @@ -767,11 +808,7 @@ void hfi1_put_proc_affinity(int cpu) return; mutex_lock(&affinity->lock); - cpumask_clear_cpu(cpu, &set->used); + cpu_mask_set_put(set, cpu); hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); - if (cpumask_empty(&set->used) && set->gen) { - set->gen--; - cpumask_copy(&set->used, &set->mask); - } mutex_unlock(&affinity->lock); } -- cgit v1.2.3 From 5d18ee67d4c1735f5c1f757e89228ec68e4f4ef3 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:43:55 -0700 Subject: IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on different CPUs, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; Therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; Therefore, there would only be one completion vector available. 
* For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the passed completion vector modulo the number of device completion vectors available. Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 414 +++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/affinity.h | 10 +- drivers/infiniband/hw/hfi1/chip.c | 5 + drivers/infiniband/hw/hfi1/hfi.h | 3 + drivers/infiniband/hw/hfi1/init.c | 15 +- drivers/infiniband/hw/hfi1/trace.c | 3 +- drivers/infiniband/hw/hfi1/trace_dbg.h | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 7 +- drivers/infiniband/hw/qib/qib_verbs.c | 6 +- drivers/infiniband/sw/rdmavt/cq.c | 81 +++---- drivers/infiniband/sw/rdmavt/cq.h | 6 +- drivers/infiniband/sw/rdmavt/trace_cq.h | 35 ++- drivers/infiniband/sw/rdmavt/vt.c | 35 +-- include/rdma/rdma_vt.h | 7 +- include/rdma/rdmavt_cq.h | 5 +- 15 files changed, 534 insertions(+), 101 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index eca9e6354017..fbe7198a715a 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license.
@@ -208,7 +208,13 @@ int node_affinity_init(void) return 0; } -void node_affinity_destroy(void) +static void node_affinity_destroy(struct hfi1_affinity_node *entry) +{ + free_percpu(entry->comp_vect_affinity); + kfree(entry); +} + +void node_affinity_destroy_all(void) { struct list_head *pos, *q; struct hfi1_affinity_node *entry; @@ -218,7 +224,7 @@ void node_affinity_destroy(void) entry = list_entry(pos, struct hfi1_affinity_node, list); list_del(pos); - kfree(entry); + node_affinity_destroy(entry); } mutex_unlock(&node_affinity.lock); kfree(hfi1_per_node_cntr); @@ -232,6 +238,7 @@ static struct hfi1_affinity_node *node_affinity_allocate(int node) if (!entry) return NULL; entry->node = node; + entry->comp_vect_affinity = alloc_percpu(u16); INIT_LIST_HEAD(&entry->list); return entry; @@ -261,6 +268,341 @@ static struct hfi1_affinity_node *node_affinity_lookup(int node) return NULL; } +static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + u16 cntr; + u16 prev_cntr; + int ret_cpu; + + if (!possible_cpumask) { + ret_cpu = -EINVAL; + goto fail; + } + + if (!comp_vect_affinity) { + ret_cpu = -EINVAL; + goto fail; + } + + ret_cpu = cpumask_first(possible_cpumask); + if (ret_cpu >= nr_cpu_ids) { + ret_cpu = -EINVAL; + goto fail; + } + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr < prev_cntr) { + ret_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; + +fail: + return ret_cpu; +} + +static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + int max_cpu; + u16 cntr; + u16 prev_cntr; + + if (!possible_cpumask) + return -EINVAL; + + if (!comp_vect_affinity) + return -EINVAL; + + max_cpu = cpumask_first(possible_cpumask); + if (max_cpu >= nr_cpu_ids) + return -EINVAL; + + prev_cntr 
= *per_cpu_ptr(comp_vect_affinity, max_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr > prev_cntr) { + max_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; + + return max_cpu; +} + +/* + * Non-interrupt CPUs are used first, then interrupt CPUs. + * Two already allocated cpu masks must be passed. + */ +static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + cpumask_var_t non_intr_cpus, + cpumask_var_t available_cpus) + __must_hold(&node_affinity.lock) +{ + int cpu; + struct cpu_mask_set *set = dd->comp_vect; + + lockdep_assert_held(&node_affinity.lock); + if (!non_intr_cpus) { + cpu = -1; + goto fail; + } + + if (!available_cpus) { + cpu = -1; + goto fail; + } + + /* Available CPUs for pinning completion vectors */ + _cpu_mask_set_gen_inc(set); + cpumask_andnot(available_cpus, &set->mask, &set->used); + + /* Available CPUs without SDMA engine interrupts */ + cpumask_andnot(non_intr_cpus, available_cpus, + &entry->def_intr.used); + + /* If there are non-interrupt CPUs available, use them first */ + if (!cpumask_empty(non_intr_cpus)) + cpu = cpumask_first(non_intr_cpus); + else /* Otherwise, use interrupt CPUs */ + cpu = cpumask_first(available_cpus); + + if (cpu >= nr_cpu_ids) { /* empty */ + cpu = -1; + goto fail; + } + cpumask_set_cpu(cpu, &set->used); + +fail: + return cpu; +} + +static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = dd->comp_vect; + + if (cpu < 0) + return; + + cpu_mask_set_put(set, cpu); +} + +/* _dev_comp_vect_mappings_destroy() is reentrant */ +static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) +{ + int i, cpu; + + if (!dd->comp_vect_mappings) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = dd->comp_vect_mappings[i]; + _dev_comp_vect_cpu_put(dd, cpu); + dd->comp_vect_mappings[i] = -1; + 
hfi1_cdbg(AFFINITY, + "[%s] Release CPU %d from completion vector %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); + } + + kfree(dd->comp_vect_mappings); + dd->comp_vect_mappings = NULL; +} + +/* + * This function creates the table for looking up CPUs for completion vectors. + * num_comp_vectors needs to have been initialized before calling this function. + */ +static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu, ret; + cpumask_var_t non_intr_cpus; + cpumask_var_t available_cpus; + + lockdep_assert_held(&node_affinity.lock); + + if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { + free_cpumask_var(non_intr_cpus); + return -ENOMEM; + } + + dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus, + sizeof(*dd->comp_vect_mappings), + GFP_KERNEL); + if (!dd->comp_vect_mappings) { + ret = -ENOMEM; + goto fail; + } + for (i = 0; i < dd->comp_vect_possible_cpus; i++) + dd->comp_vect_mappings[i] = -1; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, + available_cpus); + if (cpu < 0) { + ret = -EINVAL; + goto fail; + } + + dd->comp_vect_mappings[i] = cpu; + hfi1_cdbg(AFFINITY, + "[%s] Completion Vector %d -> CPU %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); + } + + return 0; + +fail: + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); + _dev_comp_vect_mappings_destroy(dd); + + return ret; +} + +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) +{ + int ret; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) { + ret = -EINVAL; + goto unlock; + } + ret = _dev_comp_vect_mappings_create(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + + return ret; +} + +void hfi1_comp_vectors_clean_up(struct
hfi1_devdata *dd) +{ + _dev_comp_vect_mappings_destroy(dd); +} + +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + + if (!dd->comp_vect_mappings) + return -EINVAL; + if (comp_vect >= dd->comp_vect_possible_cpus) + return -EINVAL; + + return dd->comp_vect_mappings[comp_vect]; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + bool first_dev_init) + __must_hold(&node_affinity.lock) +{ + int i, j, curr_cpu; + int possible_cpus_comp_vect = 0; + struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; + + lockdep_assert_held(&node_affinity.lock); + /* + * If there's only one CPU available for completion vectors, then + * there will only be one completion vector available. Otherwise, + * the number of completion vectors available will be the number of + * available CPUs divided by the number of devices in the + * local NUMA node. + */ + if (cpumask_weight(&entry->comp_vect_mask) == 1) { + possible_cpus_comp_vect = 1; + dd_dev_warn(dd, + "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); + } else { + possible_cpus_comp_vect += + cpumask_weight(&entry->comp_vect_mask) / + hfi1_per_node_cntr[dd->node]; + + /* + * If the completion vector CPUs available doesn't divide + * evenly among devices, then the first device to be + * initialized gets an extra CPU.
+ */ + if (first_dev_init && + cpumask_weight(&entry->comp_vect_mask) % + hfi1_per_node_cntr[dd->node] != 0) + possible_cpus_comp_vect++; + } + + dd->comp_vect_possible_cpus = possible_cpus_comp_vect; + + /* Reserving CPUs for device completion vector */ + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, + entry->comp_vect_affinity); + if (curr_cpu < 0) + goto fail; + + cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); + } + + hfi1_cdbg(AFFINITY, + "[%s] Completion vector affinity CPU set(s) %*pbl", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), + cpumask_pr_args(dev_comp_vect_mask)); + + return 0; + +fail: + for (j = 0; j < i; j++) + per_cpu_affinity_put_max(&entry->comp_vect_mask, + entry->comp_vect_affinity); + + return curr_cpu; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu; + + lockdep_assert_held(&node_affinity.lock); + if (!dd->comp_vect_possible_cpus) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, + entry->comp_vect_affinity); + /* Clearing CPU in device completion vector cpu mask */ + if (cpu >= 0) + cpumask_clear_cpu(cpu, &dd->comp_vect->mask); + } + + dd->comp_vect_possible_cpus = 0; +} + /* * Interrupt affinity. 
* @@ -277,7 +619,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) int node = pcibus_to_node(dd->pcidev->bus); struct hfi1_affinity_node *entry; const struct cpumask *local_mask; - int curr_cpu, possible, i; + int curr_cpu, possible, i, ret; + bool new_entry = false; if (node < 0) node = numa_node_id(); @@ -299,11 +642,14 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) if (!entry) { dd_dev_err(dd, "Unable to allocate global affinity node\n"); - mutex_unlock(&node_affinity.lock); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } + new_entry = true; + init_cpu_mask_set(&entry->def_intr); init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->comp_vect_mask); cpumask_clear(&entry->general_intr_mask); /* Use the "real" cpu mask of this node as the default */ cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, @@ -356,10 +702,64 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) &entry->general_intr_mask); } - node_affinity_add_tail(entry); + /* Determine completion vector CPUs for the entire node */ + cpumask_and(&entry->comp_vect_mask, + &node_affinity.real_cpu_mask, local_mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->rcv_intr.mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->general_intr_mask); + + /* + * If there ends up being 0 CPU cores leftover for completion + * vectors, use the same CPU core as the general/control + * context. 
+ */ + if (cpumask_weight(&entry->comp_vect_mask) == 0) + cpumask_copy(&entry->comp_vect_mask, + &entry->general_intr_mask); } + + ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); + if (ret < 0) + goto fail; + + if (new_entry) + node_affinity_add_tail(entry); + mutex_unlock(&node_affinity.lock); + return 0; + +fail: + if (new_entry) + node_affinity_destroy(entry); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) +{ + struct hfi1_affinity_node *entry; + + if (dd->node < 0) + return; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + /* + * Free device completion vector CPUs to be used by future + * completion vectors + */ + _dev_comp_vect_cpu_mask_clean_up(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + dd->node = -1; } /* diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 2a1e374169c0..6a7e6ea4e426 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -98,9 +98,11 @@ void hfi1_put_proc_affinity(int cpu); struct hfi1_affinity_node { int node; + u16 __percpu *comp_vect_affinity; struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; struct cpumask general_intr_mask; + struct cpumask comp_vect_mask; struct list_head list; }; @@ -116,7 +118,11 @@ struct hfi1_affinity_node_list { }; int node_affinity_init(void); -void node_affinity_destroy(void); +void node_affinity_destroy_all(void); extern struct hfi1_affinity_node_list node_affinity; +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd); +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect); +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd); +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd); #endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 0fab6df0a345..46e9e4ffcba4 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15233,6 +15233,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + ret = hfi1_comp_vectors_set_up(dd); + if (ret) + goto bail_clear_intr; + /* set up LCB access - must be after set_up_interrupts() */ init_lcb_access(dd); @@ -15275,6 +15279,7 @@ bail_free_rcverr: bail_free_cntrs: free_cntrs(dd); bail_clear_intr: + hfi1_comp_vectors_clean_up(dd); hfi1_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9cd758ce7764..dd84238c1aac 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1263,6 +1263,9 @@ struct hfi1_devdata { /* Save the enabled LCB error bits */ u64 lcb_err_en; + struct cpu_mask_set *comp_vect; + int *comp_vect_mappings; + u32 comp_vect_possible_cpus; /* * Capability to have different send engines simply by changing a diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 
790542ce89a5..5d1adfc450d3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1244,6 +1244,8 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd) dd->rcv_limit = NULL; dd->send_schedule = NULL; dd->tx_opstats = NULL; + kfree(dd->comp_vect); + dd->comp_vect = NULL; sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1300,6 +1302,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) dd->unit = ret; list_add(&dd->list, &hfi1_dev_list); } + dd->node = -1; spin_unlock_irqrestore(&hfi1_devs_lock, flags); idr_preload_end(); @@ -1352,6 +1355,12 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) goto bail; } + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); + if (!dd->comp_vect) { + ret = -ENOMEM; + goto bail; + } + kobject_init(&dd->kobj, &hfi1_devdata_type); return dd; @@ -1521,7 +1530,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); - node_affinity_destroy(); + node_affinity_destroy_all(); hfi1_wss_exit(); hfi1_dbg_exit(); @@ -1605,6 +1614,8 @@ static void cleanup_device_data(struct hfi1_devdata *dd) static void postinit_cleanup(struct hfi1_devdata *dd) { hfi1_start_cleanup(dd); + hfi1_comp_vectors_clean_up(dd); + hfi1_dev_affinity_clean_up(dd); hfi1_pcie_ddcleanup(dd); hfi1_pcie_cleanup(dd->pcidev); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 89bd9851065b..332b9b7c554a 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. 
* * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -374,6 +374,7 @@ const char *print_u32_array( return ret; } +__hfi1_trace_fn(AFFINITY); __hfi1_trace_fn(PKT); __hfi1_trace_fn(PROC); __hfi1_trace_fn(SDMA); diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 0e7d929530c5..e62171fb7379 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -1,5 +1,5 @@ /* -* Copyright(c) 2015, 2016 Intel Corporation. +* Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -113,6 +113,7 @@ void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all * the debugfs stuff. */ +__hfi1_trace_def(AFFINITY); __hfi1_trace_def(PKT); __hfi1_trace_def(PROC); __hfi1_trace_def(SDMA); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9554e912af98..fc2e44cde161 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -64,6 +64,7 @@ #include "debugfs.h" #include "vnic.h" #include "fault.h" +#include "affinity.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -1934,11 +1935,11 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = + hfi1_comp_vect_mappings_lookup; /* completeion queue */ - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.ibdev.num_comp_vectors = 
dd->comp_vect_possible_cpus; dd->verbs_dev.rdi.dparms.node = dd->node; /* misc settings */ diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 3977abbc83ad..14b4057a2b8f 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -1631,10 +1631,6 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "qib_cq%d", dd->unit); - qib_fill_device_attr(dd); ppd = dd->pport; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 340c17aba3b0..4f1544ad4aff 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -47,11 +47,12 @@ #include #include -#include #include "cq.h" #include "vt.h" #include "trace.h" +static struct workqueue_struct *comp_vector_wq; + /** * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue @@ -120,27 +121,21 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && (solicited || entry->status != IB_WC_SUCCESS))) { - struct kthread_worker *worker; - /* * This will cause send_complete() to be called in * another thread. 
*/ - rcu_read_lock(); - worker = rcu_dereference(cq->rdi->worker); - if (likely(worker)) { - cq->notify = RVT_CQ_NONE; - cq->triggered++; - kthread_queue_work(worker, &cq->comptask); - } - rcu_read_unlock(); + cq->notify = RVT_CQ_NONE; + cq->triggered++; + queue_work_on(cq->comp_vector_cpu, comp_vector_wq, + &cq->comptask); } spin_unlock_irqrestore(&cq->lock, flags); } EXPORT_SYMBOL(rvt_cq_enter); -static void send_complete(struct kthread_work *work) +static void send_complete(struct work_struct *work) { struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); @@ -192,6 +187,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; + int comp_vector = attr->comp_vector; if (attr->flags) return ERR_PTR(-EINVAL); @@ -199,6 +195,11 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, if (entries < 1 || entries > rdi->dparms.props.max_cqe) return ERR_PTR(-EINVAL); + if (comp_vector < 0) + comp_vector = 0; + + comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; + /* Allocate the completion queue structure. */ cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); if (!cq) @@ -267,14 +268,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, * an error. 
*/ cq->rdi = rdi; + if (rdi->driver_f.comp_vect_cpu_lookup) + cq->comp_vector_cpu = + rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector); + else + cq->comp_vector_cpu = + cpumask_first(cpumask_of_node(rdi->dparms.node)); + cq->ibcq.cqe = entries; cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); - kthread_init_work(&cq->comptask, send_complete); + INIT_WORK(&cq->comptask, send_complete); cq->queue = wc; ret = &cq->ibcq; + trace_rvt_create_cq(cq, attr); goto done; bail_ip: @@ -300,7 +309,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; - kthread_flush_work(&cq->comptask); + flush_work(&cq->comptask); spin_lock_irq(&rdi->n_cqs_lock); rdi->n_cqs_allocated--; spin_unlock_irq(&rdi->n_cqs_lock); @@ -510,24 +519,13 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) * * Return: 0 on success */ -int rvt_driver_cq_init(struct rvt_dev_info *rdi) +int rvt_driver_cq_init(void) { - int cpu; - struct kthread_worker *worker; - - if (rcu_access_pointer(rdi->worker)) - return 0; - - spin_lock_init(&rdi->n_cqs_lock); - - cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); - worker = kthread_create_worker_on_cpu(cpu, 0, - "%s", rdi->dparms.cq_name); - if (IS_ERR(worker)) - return PTR_ERR(worker); + comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, + 0, "rdmavt_cq"); + if (!comp_vector_wq) + return -ENOMEM; - set_user_nice(worker->task, MIN_NICE); - RCU_INIT_POINTER(rdi->worker, worker); return 0; } @@ -535,23 +533,8 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) * rvt_cq_exit - tear down cq reources * @rdi: rvt dev structure */ -void rvt_cq_exit(struct rvt_dev_info *rdi) +void rvt_cq_exit(void) { - struct kthread_worker *worker; - - if (!rcu_access_pointer(rdi->worker)) - return; - - spin_lock(&rdi->n_cqs_lock); - worker = rcu_dereference_protected(rdi->worker, - lockdep_is_held(&rdi->n_cqs_lock)); - if (!worker) { - spin_unlock(&rdi->n_cqs_lock); - 
return; - } - RCU_INIT_POINTER(rdi->worker, NULL); - spin_unlock(&rdi->n_cqs_lock); - synchronize_rcu(); - - kthread_destroy_worker(worker); + destroy_workqueue(comp_vector_wq); + comp_vector_wq = NULL; } diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 6182c29eff66..72184b1c176b 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -2,7 +2,7 @@ #define DEF_RVTCQ_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -59,6 +59,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); -int rvt_driver_cq_init(struct rvt_dev_info *rdi); -void rvt_cq_exit(struct rvt_dev_info *rdi); +int rvt_driver_cq_init(void); +void rvt_cq_exit(void); #endif /* DEF_RVTCQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h index a315850aa9bb..df8e1adbef9d 100644 --- a/drivers/infiniband/sw/rdmavt/trace_cq.h +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -71,6 +71,39 @@ __print_symbolic(opcode, \ wc_opcode_name(RECV), \ wc_opcode_name(RECV_RDMA_WITH_IMM)) +#define CQ_ATTR_PRINT \ +"[%s] user cq %s cqe %u comp_vector %d comp_vector_cpu %d flags %x" + +DECLARE_EVENT_CLASS(rvt_cq_template, + TP_PROTO(struct rvt_cq *cq, + const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr), + TP_STRUCT__entry(RDI_DEV_ENTRY(cq->rdi) + __field(struct rvt_mmap_info *, ip) + __field(unsigned int, cqe) + __field(int, comp_vector) + __field(int, comp_vector_cpu) + __field(u32, flags) + ), + TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi) + __entry->ip = cq->ip; + __entry->cqe = attr->cqe; + __entry->comp_vector = attr->comp_vector; + __entry->comp_vector_cpu = + cq->comp_vector_cpu; + __entry->flags = attr->flags; + ), + TP_printk(CQ_ATTR_PRINT, __get_str(dev), + __entry->ip ? "true" : "false", __entry->cqe, + __entry->comp_vector, __entry->comp_vector_cpu, + __entry->flags + ) +); + +DEFINE_EVENT(rvt_cq_template, rvt_create_cq, + TP_PROTO(struct rvt_cq *cq, const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr)); + #define CQ_PRN \ "[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x" diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 434199d0bc96..17e4abc067af 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include #include #include "vt.h" +#include "cq.h" #include "trace.h" #define RVT_UVERBS_ABI_VERSION 2 @@ -58,21 +59,18 @@ MODULE_DESCRIPTION("RDMA Verbs Transport Library"); static int rvt_init(void) { - /* - * rdmavt does not need to do anything special when it starts up. All it - * needs to do is sit and wait until a driver attempts registration. 
- */ - return 0; + int ret = rvt_driver_cq_init(); + + if (ret) + pr_err("Error in driver CQ init.\n"); + + return ret; } module_init(rvt_init); static void rvt_cleanup(void) { - /* - * Nothing to do at exit time either. The module won't be able to be - * removed until all drivers are gone which means all the dev structs - * are gone so there is really nothing to do. - */ + rvt_cq_exit(); } module_exit(rvt_cleanup); @@ -777,11 +775,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) } /* Completion queues */ - ret = rvt_driver_cq_init(rdi); - if (ret) { - pr_err("Error in driver CQ init.\n"); - goto bail_mr; - } + spin_lock_init(&rdi->n_cqs_lock); /* DMA Operations */ rdi->ibdev.dev.dma_ops = rdi->ibdev.dev.dma_ops ? : &dma_virt_ops; @@ -829,14 +823,15 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); rdi->ibdev.node_type = RDMA_NODE_IB_CA; - rdi->ibdev.num_comp_vectors = 1; + if (!rdi->ibdev.num_comp_vectors) + rdi->ibdev.num_comp_vectors = 1; rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_cq; + goto bail_mr; } rvt_create_mad_agents(rdi); @@ -844,9 +839,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; -bail_cq: - rvt_cq_exit(rdi); - bail_mr: rvt_mr_exit(rdi); @@ -870,7 +862,6 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); - rvt_cq_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index eec495e68823..e79229a0cf01 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -2,7 +2,7 @@ #define DEF_RDMA_VT_H /* - * Copyright(c) 2016 Intel 
Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -167,7 +167,6 @@ struct rvt_driver_params { int qpn_res_end; int nports; int npkeys; - char cq_name[RVT_CQN_MAX]; int node; int psn_mask; int psn_shift; @@ -347,6 +346,9 @@ struct rvt_driver_provided { /* Notify driver to restart rc */ void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait); + + /* Get and return CPU to pin CQ processing thread */ + int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); }; struct rvt_dev_info { @@ -402,7 +404,6 @@ struct rvt_dev_info { spinlock_t pending_lock; /* protect pending mmap list */ /* CQ */ - struct kthread_worker __rcu *worker; /* per device cq worker */ u32 n_cqs_allocated; /* number of CQs allocated for device */ spinlock_t n_cqs_lock; /* protect count of in use cqs */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 51fd00b243d0..75dc65c0bfb8 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -8,7 +8,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -80,10 +80,11 @@ struct rvt_cq_wc { */ struct rvt_cq { struct ib_cq ibcq; - struct kthread_work comptask; + struct work_struct comptask; spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; + int comp_vector_cpu; struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; -- cgit v1.2.3 From 832369fa6410c93547264ad449ebbf16567bbccd Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Wed, 2 May 2018 06:44:03 -0700 Subject: IB/{hfi1, qib, rdmavt}: Move logic to allocate receive WQE into rdmavt Moving receive-side WQE allocation logic into rdmavt will allow further code reuse between qib and hfi1 drivers. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 8 +- drivers/infiniband/hw/hfi1/ruc.c | 154 +--------------------------------- drivers/infiniband/hw/hfi1/uc.c | 4 +- drivers/infiniband/hw/hfi1/ud.c | 4 +- drivers/infiniband/hw/hfi1/verbs.h | 2 - drivers/infiniband/hw/qib/qib_rc.c | 8 +- drivers/infiniband/hw/qib/qib_ruc.c | 154 +--------------------------------- drivers/infiniband/hw/qib/qib_uc.c | 4 +- drivers/infiniband/hw/qib/qib_ud.c | 4 +- drivers/infiniband/hw/qib/qib_verbs.h | 2 - drivers/infiniband/sw/rdmavt/qp.c | 149 ++++++++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 1 + 12 files changed, 170 insertions(+), 324 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index da58046a02ea..79ee2b9e28c6 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2123,7 +2123,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) /* OK, process the packet. 
*/ switch (opcode) { case OP(SEND_FIRST): - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -2149,7 +2149,7 @@ send_middle: case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): /* consume RWQE */ - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) @@ -2159,7 +2159,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): case OP(SEND_ONLY_WITH_INVALIDATE): - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -2271,7 +2271,7 @@ send_last: goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) goto no_immediate_data; - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) { diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index c0071ca4147a..ef4c566e206f 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -53,156 +53,6 @@ #include "verbs_txreq.h" #include "trace.h" -/* - * Validate a RWQE and fill in the SGE state. - * Return 1 if OK. - */ -static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) -{ - int i, j, ret; - struct ib_wc wc; - struct rvt_lkey_table *rkt; - struct rvt_pd *pd; - struct rvt_sge_state *ss; - - rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; - pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); - ss = &qp->r_sge; - ss->sg_list = qp->r_sg_list; - qp->r_len = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE); - if (unlikely(ret <= 0)) - goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ss->total_len = qp->r_len; - ret = 1; - goto bail; - -bad_lkey: - while (j) { - struct rvt_sge *sge = --j ? 
&ss->sg_list[j - 1] : &ss->sge; - - rvt_put_mr(sge->mr); - } - ss->num_sge = 0; - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.qp = &qp->ibqp; - /* Signal solicited completion event. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - -/** - * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE - * @qp: the QP - * @wr_id_only: update qp->r_wr_id only, not qp->r_sge - * - * Return -1 if there is a local error, 0 if no RWQE is available, - * otherwise return 1. - * - * Can be called from interrupt level. - */ -int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only) -{ - unsigned long flags; - struct rvt_rq *rq; - struct rvt_rwq *wq; - struct rvt_srq *srq; - struct rvt_rwqe *wqe; - void (*handler)(struct ib_event *, void *); - u32 tail; - int ret; - - if (qp->ibqp.srq) { - srq = ibsrq_to_rvtsrq(qp->ibqp.srq); - handler = srq->ibsrq.event_handler; - rq = &srq->rq; - } else { - srq = NULL; - handler = NULL; - rq = &qp->r_rq; - } - - spin_lock_irqsave(&rq->lock, flags); - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { - ret = 0; - goto unlock; - } - - wq = rq->wq; - tail = wq->tail; - /* Validate tail before using it since it is user writable. */ - if (tail >= rq->size) - tail = 0; - if (unlikely(tail == wq->head)) { - ret = 0; - goto unlock; - } - /* Make sure entry is read after head index is read. */ - smp_rmb(); - wqe = rvt_get_rwqe_ptr(rq, tail); - /* - * Even though we update the tail index in memory, the verbs - * consumer is not supposed to post more entries until a - * completion is generated. - */ - if (++tail >= rq->size) - tail = 0; - wq->tail = tail; - if (!wr_id_only && !init_sge(qp, wqe)) { - ret = -1; - goto unlock; - } - qp->r_wr_id = wqe->wr_id; - - ret = 1; - set_bit(RVT_R_WRID_VALID, &qp->r_aflags); - if (handler) { - u32 n; - - /* - * Validate head pointer value and compute - * the number of remaining WQEs. 
- */ - n = wq->head; - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; - } - } -unlock: - spin_unlock_irqrestore(&rq->lock, flags); -bail: - return ret; -} - static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) { return (gid->global.interface_id == id && @@ -423,7 +273,7 @@ again: /* FALLTHROUGH */ case IB_WR_SEND: send: - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -435,7 +285,7 @@ send: goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 9d7a3110c14c..b7b671017e59 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -397,7 +397,7 @@ send_first: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { qp->r_sge = qp->s_rdma_read_sge; } else { - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -542,7 +542,7 @@ rdma_last_imm: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { rvt_put_ss(&qp->s_rdma_read_sge); } else { - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 69c17a5ef038..6ad203f6da88 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -163,7 +163,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } else { int ret; - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if 
(ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto bail_unlock; @@ -974,7 +974,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else { int ret; - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 081ca52e6621..a16fe5d3f7c4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -328,8 +328,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet); int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); -int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only); - void hfi1_migrate_qp(struct rvt_qp *qp); int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index c9955d48c50f..f35fdeb14347 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1828,7 +1828,7 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, /* OK, process the packet. 
*/ switch (opcode) { case OP(SEND_FIRST): - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -1849,7 +1849,7 @@ send_middle: case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): /* consume RWQE */ - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) @@ -1858,7 +1858,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -1949,7 +1949,7 @@ send_last: goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) goto no_immediate_data; - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) { diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 4662cc7bde92..f8a7de795beb 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -37,156 +37,6 @@ #include "qib.h" #include "qib_mad.h" -/* - * Validate a RWQE and fill in the SGE state. - * Return 1 if OK. - */ -static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) -{ - int i, j, ret; - struct ib_wc wc; - struct rvt_lkey_table *rkt; - struct rvt_pd *pd; - struct rvt_sge_state *ss; - - rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; - pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); - ss = &qp->r_sge; - ss->sg_list = qp->r_sg_list; - qp->r_len = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE); - if (unlikely(ret <= 0)) - goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ss->total_len = qp->r_len; - ret = 1; - goto bail; - -bad_lkey: - while (j) { - struct rvt_sge *sge = --j ? 
&ss->sg_list[j - 1] : &ss->sge; - - rvt_put_mr(sge->mr); - } - ss->num_sge = 0; - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.qp = &qp->ibqp; - /* Signal solicited completion event. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - -/** - * qib_get_rwqe - copy the next RWQE into the QP's RWQE - * @qp: the QP - * @wr_id_only: update qp->r_wr_id only, not qp->r_sge - * - * Return -1 if there is a local error, 0 if no RWQE is available, - * otherwise return 1. - * - * Can be called from interrupt level. - */ -int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only) -{ - unsigned long flags; - struct rvt_rq *rq; - struct rvt_rwq *wq; - struct rvt_srq *srq; - struct rvt_rwqe *wqe; - void (*handler)(struct ib_event *, void *); - u32 tail; - int ret; - - if (qp->ibqp.srq) { - srq = ibsrq_to_rvtsrq(qp->ibqp.srq); - handler = srq->ibsrq.event_handler; - rq = &srq->rq; - } else { - srq = NULL; - handler = NULL; - rq = &qp->r_rq; - } - - spin_lock_irqsave(&rq->lock, flags); - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { - ret = 0; - goto unlock; - } - - wq = rq->wq; - tail = wq->tail; - /* Validate tail before using it since it is user writable. */ - if (tail >= rq->size) - tail = 0; - if (unlikely(tail == wq->head)) { - ret = 0; - goto unlock; - } - /* Make sure entry is read after head index is read. */ - smp_rmb(); - wqe = rvt_get_rwqe_ptr(rq, tail); - /* - * Even though we update the tail index in memory, the verbs - * consumer is not supposed to post more entries until a - * completion is generated. - */ - if (++tail >= rq->size) - tail = 0; - wq->tail = tail; - if (!wr_id_only && !qib_init_sge(qp, wqe)) { - ret = -1; - goto unlock; - } - qp->r_wr_id = wqe->wr_id; - - ret = 1; - set_bit(RVT_R_WRID_VALID, &qp->r_aflags); - if (handler) { - u32 n; - - /* - * Validate head pointer value and compute - * the number of remaining WQEs. 
- */ - n = wq->head; - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; - } - } -unlock: - spin_unlock_irqrestore(&rq->lock, flags); -bail: - return ret; -} - /* * Switch to alternate path. * The QP s_lock should be held and interrupts disabled. @@ -419,7 +269,7 @@ again: wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -431,7 +281,7 @@ again: goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 840eec6ebc33..3e54bc11e0ae 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -335,7 +335,7 @@ send_first: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) qp->r_sge = qp->s_rdma_read_sge; else { - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -471,7 +471,7 @@ rdma_last_imm: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) rvt_put_ss(&qp->s_rdma_read_sge); else { - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 3e4ff77260c2..f8d029a2390f 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -139,7 +139,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) else { int ret; - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if 
(ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto bail_unlock; @@ -534,7 +534,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, else { int ret; - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index f887737ac142..f9a46768a19a 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -321,8 +321,6 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, void mr_rcu_callback(struct rcu_head *list); -int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index c82e6bb3d77c..6e9a351f45fb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1987,6 +1987,155 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, return 0; } +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + rkt = &rdi->lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + ret = rvt_lkey_ok(rkt, pd, j ? 
&ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + return 1; + +bad_lkey: + while (j) { + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + rvt_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + return 0; +} + +/** + * rvt_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) +{ + unsigned long flags; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = rvt_get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. 
+ */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} +EXPORT_SYMBOL(rvt_get_rwqe); + /** * qp_comm_est - handle trap with QP established * @qp: the QP diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 89ab88c342b6..1145a4c154b2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -663,6 +663,7 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) extern const int ib_rvt_state_ops[]; struct rvt_dev_info; +int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only); void rvt_comm_est(struct rvt_qp *qp); int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err); -- cgit v1.2.3 From 009669cfad50beb10b3f63434863de231c6b408e Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Tue, 15 May 2018 14:19:19 -0400 Subject: RDMA/hfi1: Fix build error with debugfs disabled A recent patch set to rework the usage of debugfs and to add fault injection capabilities via debugfs files to the hfi1 driver introduced a build error that only shows up when debugfs is fully disabled. The patchset mistakenly defines some empty stub functions in two different headers when debugfs is disabled. Remove the set that shouldn't have been there to resolve the issue. 
Fixes: a74d5307caba ("IB/hfi1: Rework fault injection machinery") Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/debugfs.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h index 1c91461b108f..d5d824459fcc 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.h +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -118,21 +118,6 @@ static inline void hfi1_dbg_init(void) static inline void hfi1_dbg_exit(void) { } - -static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) -{ - return false; -} - -static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) -{ - return false; -} - -static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return false; -} #endif #endif /* _HFI1_DEBUGFS_H */ -- cgit v1.2.3 From 1ea62e816407987fc27a1bb2d011ea6d81338933 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 14 May 2018 11:12:26 -0700 Subject: iw_cxgb4: fix uninitialized variable warnings Fixes: 056f9c7f39bf ("iw_cxgb4: dump detailed driver-specific QP information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/restrack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index a677940b164a..8d1106befc5c 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -174,10 +174,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct t4_swsqe *fsp = NULL, *lsp = NULL; struct t4_swrqe *frp = NULL, *lrp = NULL; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + u16 first_sq_idx = 0, last_sq_idx = 0; + u16 first_rq_idx = 0, last_rq_idx = 0; struct t4_swsqe first_sqe, last_sqe; struct t4_swrqe first_rqe, last_rqe; - u16 first_sq_idx, last_sq_idx; - u16 first_rq_idx, last_rq_idx; struct nlattr 
*table_attr; struct t4_wq wq; -- cgit v1.2.3 From 2d478b28596f4fa6efd10a696f05e354be05de45 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 11:19:21 -0700 Subject: iw_cxgb4: remove wr_id attributes Remove sq/rq wr_id attributes because typically they are pointers and we don't want to pass up kernel pointers. Fixes: 056f9c7f39bf ("iw_cxgb4: dump detailed driver-specific QP information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/restrack.c | 55 ---------------------------------- 1 file changed, 55 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 8d1106befc5c..b9724d0b32e0 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -97,8 +97,6 @@ static int fill_swsqe(struct sk_buff *msg, struct t4_sq *sq, u16 idx, goto err; if (rdma_nl_put_driver_u32(msg, "opcode", sqe->opcode)) goto err; - if (rdma_nl_put_driver_u64_hex(msg, "wr_id", sqe->wr_id)) - goto err; if (rdma_nl_put_driver_u32(msg, "complete", sqe->complete)) goto err; if (sqe->complete && @@ -134,50 +132,14 @@ err: return -EMSGSIZE; } -static int fill_swrqe(struct sk_buff *msg, struct t4_rq *rq, u16 idx, - struct t4_swrqe *rqe) -{ - if (rdma_nl_put_driver_u32(msg, "idx", idx)) - goto err; - if (rdma_nl_put_driver_u64_hex(msg, "wr_id", rqe->wr_id)) - goto err; - return 0; -err: - return -EMSGSIZE; -} - -/* - * Dump the first and last pending rqes. 
- */ -static int fill_swrqes(struct sk_buff *msg, struct t4_rq *rq, - u16 first_idx, struct t4_swrqe *first_rqe, - u16 last_idx, struct t4_swrqe *last_rqe) -{ - if (!first_rqe) - goto out; - if (fill_swrqe(msg, rq, first_idx, first_rqe)) - goto err; - if (!last_rqe) - goto out; - if (fill_swrqe(msg, rq, last_idx, last_rqe)) - goto err; -out: - return 0; -err: - return -EMSGSIZE; -} - static int fill_res_qp_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) { struct ib_qp *ibqp = container_of(res, struct ib_qp, res); struct t4_swsqe *fsp = NULL, *lsp = NULL; - struct t4_swrqe *frp = NULL, *lrp = NULL; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); u16 first_sq_idx = 0, last_sq_idx = 0; - u16 first_rq_idx = 0, last_rq_idx = 0; struct t4_swsqe first_sqe, last_sqe; - struct t4_swrqe first_rqe, last_rqe; struct nlattr *table_attr; struct t4_wq wq; @@ -206,20 +168,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, lsp = &last_sqe; } } - - /* If there are any pending rqes, copy the first and last */ - if (wq.rq.cidx != wq.rq.pidx) { - first_rq_idx = wq.rq.cidx; - first_rqe = qhp->wq.rq.sw_rq[first_rq_idx]; - frp = &first_rqe; - last_rq_idx = wq.rq.pidx; - if (last_rq_idx-- == 0) - last_rq_idx = wq.rq.size - 1; - if (last_rq_idx != first_rq_idx) { - last_rqe = qhp->wq.rq.sw_rq[last_rq_idx]; - lrp = &last_rqe; - } - } spin_unlock_irq(&qhp->lock); if (fill_sq(msg, &wq)) @@ -231,9 +179,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, if (fill_rq(msg, &wq)) goto err_cancel_table; - if (fill_swrqes(msg, &wq.rq, first_rq_idx, frp, last_rq_idx, lrp)) - goto err_cancel_table; - nla_nest_end(msg, table_attr); return 0; -- cgit v1.2.3 From f43c00c04bbf01be0822ef9f0281cc69b56c4e40 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Sat, 12 May 2018 07:50:30 -0500 Subject: i40iw: Extend port reuse support for listeners If two listeners are created with different IP's but same port, the second rdma_listen fails due to a duplicate port entry being added from the CQP add APBVT 
OP. commit f16dc0aa5ea2 ("i40iw: Add support for port reuse on active side connections") does not account for listener side port reuse. Check for duplicate port before invoking the CQP command to add APBVT entry and delete the entry only if the port is not in use. Additionally, consolidate all port-reuse logic into i40iw_manage_apbvt. Fixes: f16dc0aa5ea2 ("i40iw: Add support for port reuse on active side connections") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 63 ++++++++++++++-------------------- drivers/infiniband/hw/i40iw/i40iw_cm.h | 4 ++- drivers/infiniband/hw/i40iw/i40iw_hw.c | 34 ++++++++++++++++-- 3 files changed, 59 insertions(+), 42 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 0243ec48e4b5..a24daac719c3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1519,18 +1519,13 @@ static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, /** * i40iw_find_port - find port that matches reference port - * @port: port number + * @hte: ptr to accelerated or non-accelerated list * @accelerated_list: flag for accelerated vs non-accelerated list */ -static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, - bool accelerated_list) +static bool i40iw_find_port(struct list_head *hte, u16 port) { - struct list_head *hte; struct i40iw_cm_node *cm_node; - hte = accelerated_list ? 
- &cm_core->accelerated_list : &cm_core->non_accelerated_list; - list_for_each_entry(cm_node, hte, list) { if (cm_node->loc_port == port) return true; @@ -1540,35 +1535,32 @@ static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, /** * i40iw_port_in_use - determine if port is in use + * @cm_core: cm's core * @port: port number - * @active_side: flag for listener side vs active side */ -static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) +bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port) { struct i40iw_cm_listener *listen_node; unsigned long flags; - bool ret = false; - if (active_side) { - spin_lock_irqsave(&cm_core->ht_lock, flags); - ret = i40iw_find_port(cm_core, port, true); - if (!ret) - ret = i40iw_find_port(cm_core, port, false); - if (!ret) - clear_bit(port, cm_core->active_side_ports); + spin_lock_irqsave(&cm_core->ht_lock, flags); + if (i40iw_find_port(&cm_core->accelerated_list, port) || + i40iw_find_port(&cm_core->non_accelerated_list, port)) { spin_unlock_irqrestore(&cm_core->ht_lock, flags); - } else { - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { - if (listen_node->loc_port == port) { - ret = true; - break; - } + return true; + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (listen_node->loc_port == port) { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return true; } - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - return ret; + return false; } /** @@ -1917,7 +1909,7 @@ static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); if (listener->iwdev) { - if (apbvt_del && !i40iw_port_in_use(cm_core, listener->loc_port, 
false)) + if (apbvt_del) i40iw_manage_apbvt(listener->iwdev, listener->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -2298,7 +2290,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_port_in_use(cm_core, cm_node->loc_port, true) && cm_node->apbvt_set) { + if (cm_node->apbvt_set) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -3244,6 +3236,7 @@ void i40iw_setup_cm_core(struct i40iw_device *iwdev) spin_lock_init(&cm_core->ht_lock); spin_lock_init(&cm_core->listen_list_lock); + spin_lock_init(&cm_core->apbvt_lock); cm_core->event_wq = alloc_ordered_workqueue("iwewq", WQ_MEM_RECLAIM); @@ -3811,7 +3804,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; int ret = 0; - unsigned long flags; ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3882,15 +3874,10 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->qhash_set = true; } - spin_lock_irqsave(&iwdev->cm_core.ht_lock, flags); - if (!test_and_set_bit(cm_info.loc_port, iwdev->cm_core.active_side_ports)) { - spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); - if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD)) { - ret = -EINVAL; - goto err; - } - } else { - spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, + I40IW_MANAGE_APBVT_ADD)) { + ret = -EINVAL; + goto err; } cm_node->apbvt_set = true; diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 78ba36ae2bbe..66dc1ba03389 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -413,8 +413,9 @@ struct i40iw_cm_core { spinlock_t ht_lock; /* manage hash table */ spinlock_t listen_list_lock; /* listen list */ + 
spinlock_t apbvt_lock; /*manage apbvt entries*/ - unsigned long active_side_ports[BITS_TO_LONGS(MAX_PORTS)]; + unsigned long ports_in_use[BITS_TO_LONGS(MAX_PORTS)]; u64 stats_nodes_created; u64 stats_nodes_destroyed; @@ -457,4 +458,5 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, struct i40iw_cm_info *nfo, bool disconnect_all); +bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port); #endif /* I40IW_CM_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 6139836fb533..414a36ce16af 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -443,13 +443,37 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool add_port) { struct i40iw_apbvt_info *info; - enum i40iw_status_code status; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; + unsigned long flags; + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + enum i40iw_status_code status = 0; + bool in_use; + + /* apbvt_lock is held across CQP delete APBVT OP (non-waiting) to + * protect against race where add APBVT CQP can race ahead of the delete + * APBVT for same port. 
+ */ + spin_lock_irqsave(&cm_core->apbvt_lock, flags); + + if (!add_port) { + in_use = i40iw_port_in_use(cm_core, accel_local_port); + if (in_use) + goto exit; + clear_bit(accel_local_port, cm_core->ports_in_use); + } else { + in_use = test_and_set_bit(accel_local_port, + cm_core->ports_in_use); + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + if (in_use) + return 0; + } cqp_request = i40iw_get_cqp_request(&iwdev->cqp, add_port); - if (!cqp_request) - return -ENOMEM; + if (!cqp_request) { + status = -ENOMEM; + goto exit; + } cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; @@ -465,6 +489,10 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad status = i40iw_handle_cqp_op(iwdev, cqp_request); if (status) i40iw_pr_err("CQP-OP Manage APBVT entry fail"); +exit: + if (!add_port) + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + return status; } -- cgit v1.2.3 From da2f22ae7707b6ed254983aa3b23e013e07cd532 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:33 +0300 Subject: IB/mlx5: Add support for GRE flow specification This patch introduces support for the GRE flow spec and allows the creation of rules based on the protocol and key fields that are part of the GRE protocol header.
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a42c6b1cdb5a..81f696b21356 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2448,6 +2448,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) #define LAST_TUNNEL_FIELD tunnel_id #define LAST_FLOW_TAG_FIELD tag_id #define LAST_DROP_FIELD size +#define LAST_DROP_FIELD size /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ @@ -2689,6 +2690,29 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; + case IB_FLOW_SPEC_GRE: + if (ib_spec->gre.mask.c_ks_res0_ver) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_GRE); + + MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, + 0xffff); + MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, + ntohs(ib_spec->gre.val.protocol)); + + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + gre_key_h), + &ib_spec->gre.mask.key, + sizeof(ib_spec->gre.mask.key)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v, + gre_key_h), + &ib_spec->gre.val.key, + sizeof(ib_spec->gre.val.key)); + break; case IB_FLOW_SPEC_VXLAN_TUNNEL: if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, LAST_TUNNEL_FIELD)) -- cgit v1.2.3 From 71c6e8638ce3baf9ec16d64d263aab74beac912d Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:34 +0300 Subject: IB/mlx5: Add support for MPLS flow specification This patch introduces support for the MPLS flow spec and allows the 
creation of rules that are matching on the MPLS label. Applying the rule matching depends on the flow specs order and the location of the MPLS in the spec list as there are different configurations to be made in the device in the cases of MPLSoGRE and MPLSoUDP vs. non-encapsulated MPLS. Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 102 +++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- include/linux/mlx5/device.h | 7 ++ include/linux/mlx5/mlx5_ifc.h | 45 ++++++++-- 5 files changed, 159 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81f696b21356..8792248034cb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2386,7 +2386,8 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) enum { MATCH_CRITERIA_ENABLE_OUTER_BIT, MATCH_CRITERIA_ENABLE_MISC_BIT, - MATCH_CRITERIA_ENABLE_INNER_BIT + MATCH_CRITERIA_ENABLE_INNER_BIT, + MATCH_CRITERIA_ENABLE_MISC2_BIT }; #define HEADER_IS_ZERO(match_criteria, headers) \ @@ -2406,6 +2407,9 @@ static u8 get_match_criteria_enable(u32 *match_criteria) match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, inner_headers)) << MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MATCH_CRITERIA_ENABLE_MISC2_BIT; return match_criteria_enable; } @@ -2440,6 +2444,27 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); } +static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) +{ + if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, 
set_mask, mpls_exp) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL)) + return -EOPNOTSUPP; + + return 0; +} + #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos @@ -2480,12 +2505,16 @@ static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec, const struct ib_flow_attr *flow_attr, - struct mlx5_flow_act *action) + struct mlx5_flow_act *action, u32 prev_type) { void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); + void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters_2); + void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters_2); void *headers_c; void *headers_v; int match_ipv; @@ -2713,6 +2742,70 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, &ib_spec->gre.val.key, sizeof(ib_spec->gre.val.key)); break; + case IB_FLOW_SPEC_MPLS: + switch (prev_type) { + case IB_FLOW_SPEC_UDP: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_udp), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + case IB_FLOW_SPEC_GRE: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_gre), + 
&ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_gre), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + default: + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + inner_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + inner_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } else { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } + } + break; case IB_FLOW_SPEC_VXLAN_TUNNEL: if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, LAST_TUNNEL_FIELD)) @@ -3044,6 +3137,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_flow_destination *rule_dst = dst; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; + u32 prev_type = 0; int err = 0; int dest_num = 1; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; @@ -3063,10 +3157,12 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(dev->mdev, spec->match_criteria, 
spec->match_value, - ib_flow, flow_attr, &flow_act); + ib_flow, flow_attr, &flow_act, + prev_type); if (err < 0) goto free; + prev_type = ((union ib_flow_spec *)ib_flow)->type; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index de51e7c39bc8..556202b9256a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -324,7 +324,8 @@ static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria if (match_criteria_enable & ~( (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) | (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) | - (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS))) + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) | + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2))) return false; if (!(match_criteria_enable & @@ -360,6 +361,17 @@ static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria return false; } + if (!(match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2)) { + char *fg_type_mask = MLX5_ADDR_OF(fte_match_param, + match_criteria, misc_parameters_2); + + if (fg_type_mask[0] || + memcmp(fg_type_mask, fg_type_mask + 1, + MLX5_ST_SZ_BYTES(fte_match_set_misc2) - 1)) + return false; + } + return check_last_reserved(match_criteria); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index e26d3e9d5f9f..b6da322a8016 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -159,7 +159,7 @@ struct mlx5_ft_underlay_qp { u32 qpn; }; -#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_600 +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_800 /* Calculate the 
fte_match_param length and without the reserved length. * Make sure the reserved field is the last. */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2bc27f8c5b87..fd1a9341edfa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -994,6 +994,13 @@ enum mlx5_wol_mode { MLX5_WOL_PHY_ACTIVITY = 1 << 7, }; +enum mlx5_mpls_supported_fields { + MLX5_FIELD_SUPPORT_MPLS_LABEL = 1 << 0, + MLX5_FIELD_SUPPORT_MPLS_EXP = 1 << 1, + MLX5_FIELD_SUPPORT_MPLS_S_BOS = 1 << 2, + MLX5_FIELD_SUPPORT_MPLS_TTL = 1 << 3 +}; + /* MLX5 DEV CAPs */ /* TODO: EAT.ME */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1aad455538f4..3fee2f74d09d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -298,9 +298,15 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; - u8 reserved_at_40[0x17]; + + u8 reserved_at_40[0x5]; + u8 outer_first_mpls_over_udp[0x4]; + u8 outer_first_mpls_over_gre[0x4]; + u8 inner_first_mpls[0x4]; + u8 outer_first_mpls[0x4]; + u8 reserved_at_55[0x2]; u8 outer_esp_spi[0x1]; - u8 reserved_at_58[0x2]; + u8 reserved_at_58[0x2]; u8 bth_dst_qp[0x1]; u8 reserved_at_5b[0x25]; @@ -450,6 +456,29 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_1a0[0x60]; }; +struct mlx5_ifc_fte_match_mpls_bits { + u8 mpls_label[0x14]; + u8 mpls_exp[0x3]; + u8 mpls_s_bos[0x1]; + u8 mpls_ttl[0x8]; +}; + +struct mlx5_ifc_fte_match_set_misc2_bits { + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls; + + struct mlx5_ifc_fte_match_mpls_bits inner_first_mpls; + + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_gre; + + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_udp; + + u8 reserved_at_80[0x100]; + + u8 metadata_reg_a[0x20]; + + u8 reserved_at_1a0[0x60]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1170,7 +1199,9 @@ struct mlx5_ifc_fte_match_param_bits { struct 
mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers; - u8 reserved_at_600[0xa00]; + struct mlx5_ifc_fte_match_set_misc2_bits misc_parameters_2; + + u8 reserved_at_800[0x800]; }; enum { @@ -4579,6 +4610,7 @@ enum { MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0X3, }; struct mlx5_ifc_query_flow_group_out_bits { @@ -6969,9 +7001,10 @@ struct mlx5_ifc_create_flow_group_out_bits { }; enum { - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, }; struct mlx5_ifc_create_flow_group_in_bits { -- cgit v1.2.3 From e818e255a58d64e86c8c93e3aa52498b1a3d1760 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:35 +0300 Subject: IB/mlx5: Expose MPLS related tunneling offloads This patch reports the device's capabilities to offload encapsulated MPLS tunnel protocols to user-space: - Capability to offload MPLS over GRE. - Capability to offload MPLS over UDP.
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- include/uapi/rdma/mlx5-abi.h | 4 +++- 4 files changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8792248034cb..ab8cd5c034a2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1084,6 +1084,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_GRE; + if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_CW_MPLS_GRE) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; + if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_CW_MPLS_UDP) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } if (uhw->outlen) { diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fd1a9341edfa..5004ddc702e3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1001,6 +1001,11 @@ enum mlx5_mpls_supported_fields { MLX5_FIELD_SUPPORT_MPLS_TTL = 1 << 3 }; +enum mlx5_flex_parser_protos { + MLX5_FLEX_PROTO_CW_MPLS_GRE = 1 << 4, + MLX5_FLEX_PROTO_CW_MPLS_UDP = 1 << 5, +}; + /* MLX5 DEV CAPs */ /* TODO: EAT.ME */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3fee2f74d09d..68f756ea550d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1138,7 +1138,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_500[0x20]; u8 num_of_uars_per_page[0x20]; - u8 reserved_at_540[0x40]; + + u8 flex_parser_protocols[0x20]; + u8 reserved_at_560[0x20]; u8 reserved_at_580[0x3d]; u8 cqe_128_always[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h 
b/include/uapi/rdma/mlx5-abi.h index fdaf00e20649..508ea8c82da7 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -233,7 +233,9 @@ enum mlx5_ib_query_dev_resp_flags { enum mlx5_ib_tunnel_offloads { MLX5_IB_TUNNELED_OFFLOADS_VXLAN = 1 << 0, MLX5_IB_TUNNELED_OFFLOADS_GRE = 1 << 1, - MLX5_IB_TUNNELED_OFFLOADS_GENEVE = 1 << 2 + MLX5_IB_TUNNELED_OFFLOADS_GENEVE = 1 << 2, + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE = 1 << 3, + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP = 1 << 4, }; struct mlx5_ib_query_device_resp { -- cgit v1.2.3 From 5e6e78dbd3b93414ca53af5d51c090878d1c9e5d Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 11 May 2018 16:31:23 +0800 Subject: RDMA/hns: Add 64KB page size support for hip08 This patch adds the support of 64KB page size for hip08 in kernel. Signed-off-by: Yixian Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 36 ++++++++++++++++------------- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 ----- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index fb305b7f99a8..53c2f1b8d068 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -100,6 +100,9 @@ #define SERV_TYPE_UC 2 #define SERV_TYPE_UD 3 +/* Configure to HW for PAGE_SIZE larger than 4KB */ +#define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) + #define PAGES_SHIFT_8 8 #define PAGES_SHIFT_16 16 #define PAGES_SHIFT_24 24 diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 25916e8522ed..e0ab672e1c0a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1024,40 +1024,40 @@ static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev) roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M, 
CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S, - hr_dev->caps.qpc_ba_pg_sz); + hr_dev->caps.qpc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S, - hr_dev->caps.qpc_buf_pg_sz); + hr_dev->caps.qpc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S, qpc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : qpc_hop_num); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S, - hr_dev->caps.srqc_ba_pg_sz); + hr_dev->caps.srqc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S, - hr_dev->caps.srqc_buf_pg_sz); + hr_dev->caps.srqc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S, srqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : srqc_hop_num); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S, - hr_dev->caps.cqc_ba_pg_sz); + hr_dev->caps.cqc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S, - hr_dev->caps.cqc_buf_pg_sz); + hr_dev->caps.cqc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S, cqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 
0 : cqc_hop_num); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S, - hr_dev->caps.mpt_ba_pg_sz); + hr_dev->caps.mpt_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S, - hr_dev->caps.mpt_buf_pg_sz); + hr_dev->caps.mpt_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S, mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : mpt_hop_num); @@ -1351,7 +1351,8 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, HNS_ROCE_HOP_NUM_0 ? 0 : mr->pbl_hop_num); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, mr->pbl_ba_pg_sz); + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, V2_MPT_BYTE_4_PD_S, mr->pd); mpt_entry->byte_4_pd_hop_st = cpu_to_le32(mpt_entry->byte_4_pd_hop_st); @@ -1425,7 +1426,8 @@ found: roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, mr->pbl_buf_pg_sz); + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); mpt_entry->byte_64_buf_pa1 = cpu_to_le32(mpt_entry->byte_64_buf_pa1); return 0; @@ -1606,11 +1608,11 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BA_PG_SZ_M, V2_CQC_BYTE_24_CQE_BA_PG_SZ_S, - hr_dev->caps.cqe_ba_pg_sz); + hr_dev->caps.cqe_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S, - hr_dev->caps.cqe_buf_pg_sz); + hr_dev->caps.cqe_buf_pg_sz + PG_SHIFT_OFFSET); cq_context->cqe_ba = (u32)(dma_handle >> 3); @@ -2707,7 +2709,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, 
V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, - hr_dev->caps.mtt_ba_pg_sz); + hr_dev->caps.mtt_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0); @@ -2715,7 +2717,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, - hr_dev->caps.mtt_buf_pg_sz); + hr_dev->caps.mtt_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0); @@ -4149,12 +4151,14 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* set eqe_ba_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BA_PG_SZ_M, - HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz); + HNS_ROCE_EQC_BA_PG_SZ_S, + eq->eqe_ba_pg_sz + PG_SHIFT_OFFSET); /* set eqe_buf_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BUF_PG_SZ_M, - HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz); + HNS_ROCE_EQC_BUF_PG_SZ_S, + eq->eqe_buf_pg_sz + PG_SHIFT_OFFSET); /* set eq_producer_idx */ roce_set_field(eqc->byte_8, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index f7256d88d38f..d1fe0e7957e3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1007,12 +1007,6 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } n = ib_umem_page_count(mr->umem); - if (mr->umem->page_shift != HNS_ROCE_HEM_PAGE_SHIFT) { - dev_err(dev, "Just support 4K page size but is 0x%lx now!\n", - BIT(mr->umem->page_shift)); - ret = -EINVAL; - goto err_umem; - } if (!hr_dev->caps.pbl_hop_num) { if (n > HNS_ROCE_MAX_MTPT_PBL_NUM) { -- cgit v1.2.3 From b06f2efd3bbe522ee0e118c3f29497c857d97f8b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:28 -0700 Subject: iw_cxgb4: always set 
iw_cm_id.provider_data In active side connections, the provider_data field is not getting set. This will be used in a subsequent patch to dump state, so always set it. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 4cf17c650c36..0912fa026327 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3210,6 +3210,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.cm_id = cm_id; ref_cm_id(&ep->com); + cm_id->provider_data = ep; ep->com.dev = dev; ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { -- cgit v1.2.3 From 116aeb8873712ea559d26b0d9d88147af5c88db5 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:43 -0700 Subject: iw_cxgb4: provide detailed provider-specific CM_ID information Add a table of important fields from the c4iw_ep* structures to the cm_id resource tracking table. This is helpful in debugging. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 84 ++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index b9724d0b32e0..463ef5813a59 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -30,6 +30,8 @@ * SOFTWARE. 
*/ +#include <rdma/rdma_cm.h> + #include "iw_cxgb4.h" #include <rdma/restrack.h> #include <uapi/rdma/rdma_netlink.h> @@ -188,6 +190,88 @@ err: return -EMSGSIZE; } +union union_ep { + struct c4iw_listen_ep lep; + struct c4iw_ep ep; +}; + +static int fill_res_ep_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct rdma_cm_id *cm_id = rdma_res_to_id(res); + struct nlattr *table_attr; + struct c4iw_ep_common *epcp; + struct c4iw_listen_ep *listen_ep = NULL; + struct c4iw_ep *ep = NULL; + struct iw_cm_id *iw_cm_id; + union union_ep *uep; + + iw_cm_id = rdma_iw_cm_id(cm_id); + if (!iw_cm_id) + return 0; + epcp = (struct c4iw_ep_common *)iw_cm_id->provider_data; + if (!epcp) + return 0; + uep = kcalloc(1, sizeof(*uep), GFP_KERNEL); + if (!uep) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err_free_uep; + + /* Get a consistent snapshot */ + mutex_lock(&epcp->mutex); + if (epcp->state == LISTEN) { + uep->lep = *(struct c4iw_listen_ep *)epcp; + mutex_unlock(&epcp->mutex); + listen_ep = &uep->lep; + epcp = &listen_ep->com; + } else { + uep->ep = *(struct c4iw_ep *)epcp; + mutex_unlock(&epcp->mutex); + ep = &uep->ep; + epcp = &ep->com; + } + + if (rdma_nl_put_driver_u32(msg, "state", epcp->state)) + goto err_cancel_table; + if (rdma_nl_put_driver_u64_hex(msg, "flags", epcp->flags)) + goto err_cancel_table; + if (rdma_nl_put_driver_u64_hex(msg, "history", epcp->history)) + goto err_cancel_table; + + if (epcp->state == LISTEN) { + if (rdma_nl_put_driver_u32(msg, "stid", listen_ep->stid)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "backlog", listen_ep->backlog)) + goto err_cancel_table; + } else { + if (rdma_nl_put_driver_u32(msg, "hwtid", ep->hwtid)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ord", ep->ord)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ird", ep->ird)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "emss", ep->emss)) + goto err_cancel_table; + + if (!ep->parent_ep &&
rdma_nl_put_driver_u32(msg, "atid", + ep->atid)) + goto err_cancel_table; + } + nla_nest_end(msg, table_attr); + kfree(uep); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err_free_uep: + kfree(uep); + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, + [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, }; -- cgit v1.2.3 From 54e7688e54bed5db5c7859cbdbf393e8b2b7ef0b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:51 -0700 Subject: iw_cxgb4: provide detailed driver-specific CQ information Add a table of important fields from the c4iw_cq* structures to the cq resource tracking table. This is helpful in debugging. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 163 +++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 463ef5813a59..013524a093cf 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -271,7 +271,170 @@ err_free_uep: return -EMSGSIZE; } +static int fill_cq(struct sk_buff *msg, struct t4_cq *cq) +{ + if (rdma_nl_put_driver_u32(msg, "cqid", cq->cqid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", cq->memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", cq->size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", cq->cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx_inc", cq->cidx_inc)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_cidx", cq->sw_cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_pidx", cq->sw_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_in_use", cq->sw_in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "vector", cq->vector)) + goto err; + if (rdma_nl_put_driver_u32(msg, "gen", cq->gen)) + goto err; + if 
(rdma_nl_put_driver_u32(msg, "error", cq->error)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "bits_type_ts", + be64_to_cpu(cq->bits_type_ts))) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "flags", cq->flags)) + goto err; + + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_cqe(struct sk_buff *msg, struct t4_cqe *cqe, u16 idx, + const char *qstr) +{ + if (rdma_nl_put_driver_u32(msg, qstr, idx)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "header", + be32_to_cpu(cqe->header))) + goto err; + if (rdma_nl_put_driver_u32(msg, "len", be32_to_cpu(cqe->len))) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "wrid_hi", + be32_to_cpu(cqe->u.gen.wrid_hi))) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "wrid_low", + be32_to_cpu(cqe->u.gen.wrid_low))) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "bits_type_ts", + be64_to_cpu(cqe->bits_type_ts))) + goto err; + + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_hwcqes(struct sk_buff *msg, struct t4_cq *cq, + struct t4_cqe *cqes) +{ + u16 idx; + + idx = (cq->cidx > 0) ? cq->cidx - 1 : cq->size - 1; + if (fill_cqe(msg, cqes, idx, "hwcq_idx")) + goto err; + idx = cq->cidx; + if (fill_cqe(msg, cqes + 1, idx, "hwcq_idx")) + goto err; + + return 0; +err: + return -EMSGSIZE; +} + +static int fill_swcqes(struct sk_buff *msg, struct t4_cq *cq, + struct t4_cqe *cqes) +{ + u16 idx; + + if (!cq->sw_in_use) + return 0; + + idx = cq->sw_cidx; + if (fill_cqe(msg, cqes, idx, "swcq_idx")) + goto err; + if (cq->sw_in_use == 1) + goto out; + idx = (cq->sw_pidx > 0) ? 
cq->sw_pidx - 1 : cq->size - 1; + if (fill_cqe(msg, cqes + 1, idx, "swcq_idx")) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_res_cq_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_cq *ibcq = container_of(res, struct ib_cq, res); + struct c4iw_cq *chp = to_c4iw_cq(ibcq); + struct nlattr *table_attr; + struct t4_cqe hwcqes[2]; + struct t4_cqe swcqes[2]; + struct t4_cq cq; + u16 idx; + + /* User cq state is not available, so don't dump user cqs */ + if (ibcq->uobject) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + /* Get a consistent snapshot */ + spin_lock_irq(&chp->lock); + + /* t4_cq struct */ + cq = chp->cq; + + /* get 2 hw cqes: cidx-1, and cidx */ + idx = (cq.cidx > 0) ? cq.cidx - 1 : cq.size - 1; + hwcqes[0] = chp->cq.queue[idx]; + + idx = cq.cidx; + hwcqes[1] = chp->cq.queue[idx]; + + /* get first and last sw cqes */ + if (cq.sw_in_use) { + swcqes[0] = chp->cq.sw_queue[cq.sw_cidx]; + if (cq.sw_in_use > 1) { + idx = (cq.sw_pidx > 0) ? cq.sw_pidx - 1 : cq.size - 1; + swcqes[1] = chp->cq.sw_queue[idx]; + } + } + + spin_unlock_irq(&chp->lock); + + if (fill_cq(msg, &cq)) + goto err_cancel_table; + + if (fill_swcqes(msg, &cq, swcqes)) + goto err_cancel_table; + + if (fill_hwcqes(msg, &cq, hwcqes)) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, + [RDMA_RESTRACK_CQ] = fill_res_cq_entry, }; -- cgit v1.2.3 From 013f64a88059fc65f01f1b967f1cf9c666a231a2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:32:01 -0700 Subject: iw_cxgb4: provide detailed driver-specific MR information Add a table of important fields from the fw_ri_tpte structure to the mr resource tracking table. 
This is helpful in debugging. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 013524a093cf..9a7520ee41e0 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -433,8 +433,69 @@ err: return -EMSGSIZE; } +static int fill_res_mr_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_mr *ibmr = container_of(res, struct ib_mr, res); + struct c4iw_mr *mhp = to_c4iw_mr(ibmr); + struct c4iw_dev *dev = mhp->rhp; + u32 stag = mhp->attr.stag; + struct nlattr *table_attr; + struct fw_ri_tpte tpte; + int ret; + + if (!stag) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag, (__be32 *)&tpte); + if (ret) { + dev_err(&dev->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return 0; + } + + if (rdma_nl_put_driver_u32_hex(msg, "idx", stag >> 8)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "valid", + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "key", stag & 0xff)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "state", + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "pdid", + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "perm", + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ps", + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u64(msg, "len", + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo))) + 
goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "pbl_addr", + FW_RI_TPTE_PBLADDR_G(ntohl(tpte.nosnoop_pbladdr)))) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, [RDMA_RESTRACK_CQ] = fill_res_cq_entry, + [RDMA_RESTRACK_MR] = fill_res_mr_entry, }; -- cgit v1.2.3 From 4171a693a5159e47f72eea3331bebf538dea9b83 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:07 -0700 Subject: IB/hfi1: Define 16B Management Packets Add 16B Management Packet definition. This optimized packet format replaces the ib_other_headers and BTH with a source and destination QP number. To support these packets we introduce struct opa_16b_mgmt into the struct hfi1_16b_header. This packet format is only used for MAD packets using the IB_OPCODE_UD_SEND_ONLY opcode on QP0/1. The original 16B implementation failed to use 16B management packets so now we add their definition. 
Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/verbs.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index dd84238c1aac..531ac89c8213 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -392,6 +392,7 @@ struct hfi1_packet { */ #define OPA_16B_L4_9B 0x00 #define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_FM 0x08 #define OPA_16B_L4_IB_LOCAL 0x09 #define OPA_16B_L4_IB_GLOBAL 0x0A #define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a16fe5d3f7c4..a4d06502f06d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -110,6 +110,12 @@ enum { #define LRH_9B_BYTES (FIELD_SIZEOF(struct ib_header, lrh)) #define LRH_9B_DWORDS (LRH_9B_BYTES / sizeof(u32)) +/* 24Bits for qpn, upper 8Bits reserved */ +struct opa_16b_mgmt { + __be32 dest_qpn; + __be32 src_qpn; +}; + struct hfi1_16b_header { u32 lrh[4]; union { @@ -118,6 +124,7 @@ struct hfi1_16b_header { struct ib_other_headers oth; } l; struct ib_other_headers oth; + struct opa_16b_mgmt mgmt; } u; } __packed; -- cgit v1.2.3 From 81cd3891f021b88319f7243715c30945aaabe9ea Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:15 -0700 Subject: IB/hfi1: Add support for 16B Management Packets 16B Management Packets (L4=0x08) replace the BTH and DETH of normal MAD packets with a header containing the source and destination queue pair numbers; fields that were originally retrieved from the BTH/DETH are now populated from this header as well as from the 16B LRH (e.g. pkey). 16B Management Packets are used as an optimized management format on 16B fabrics.
These management packets have an opcode of IB_OPCODE_UD_SEND_ONLY, a fixed 3Byte pad, and a header length of 24Bytes. The decision as to when we send a management packet is based upon either the source or destination queue pair number being 0 or 1. Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 33 ++++++++++++++------- drivers/infiniband/hw/hfi1/hfi.h | 28 ++++++++++++++++++ drivers/infiniband/hw/hfi1/ud.c | 57 +++++++++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/verbs.c | 25 +++++++++++----- 4 files changed, 110 insertions(+), 33 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index e5a57ebd8da4..94dca95db04f 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1483,38 +1483,51 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = &ppd->ibport_data; u8 l4; - u8 grh_len; packet->hdr = (struct hfi1_16b_header *) hfi1_get_16B_header(packet->rcd->dd, packet->rhf_addr); - packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; - l4 = hfi1_16B_get_l4(packet->hdr); if (l4 == OPA_16B_L4_IB_LOCAL) { - grh_len = 0; packet->ohdr = packet->ebuf; packet->grh = NULL; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES); + packet->migrated = opa_bth_is_migration(packet->ohdr); } else if (l4 == OPA_16B_L4_IB_GLOBAL) { u32 vtf; + u8 grh_len = sizeof(struct ib_grh); - grh_len = sizeof(struct ib_grh); packet->ohdr = packet->ebuf + grh_len; packet->grh = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = 
hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; + packet->migrated = opa_bth_is_migration(packet->ohdr); + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) goto drop; vtf = be32_to_cpu(packet->grh->version_tclass_flow); if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) goto drop; + } else if (l4 == OPA_16B_L4_FM) { + packet->mgmt = packet->ebuf; + packet->ohdr = NULL; + packet->grh = NULL; + packet->opcode = IB_OPCODE_UD_SEND_ONLY; + packet->pad = OPA_16B_L4_FM_PAD; + packet->hlen = OPA_16B_L4_FM_HLEN; + packet->migrated = false; } else { goto drop; } /* Query commonly used fields from packet header */ - packet->opcode = ib_bth_get_opcode(packet->ohdr); - /* hdr_len_by_opcode already has an IB LRH factored in */ - packet->hlen = hdr_len_by_opcode[packet->opcode] + - (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; packet->payload = packet->ebuf + packet->hlen - LRH_16B_BYTES; packet->slid = hfi1_16B_get_slid(packet->hdr); packet->dlid = hfi1_16B_get_dlid(packet->hdr); @@ -1524,10 +1537,8 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) 16B); packet->sc = hfi1_16B_get_sc(packet->hdr); packet->sl = ibp->sc_to_sl[packet->sc]; - packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); packet->extra_byte = SIZE_OF_LT; packet->pkey = hfi1_16B_get_pkey(packet->hdr); - packet->migrated = opa_bth_is_migration(packet->ohdr); if (hfi1_bypass_ingress_pkt_check(packet)) goto drop; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 531ac89c8213..f49cd80df557 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -333,6 +333,7 @@ struct hfi1_packet { struct rvt_qp *qp; struct ib_other_headers *ohdr; struct ib_grh *grh; + struct opa_16b_mgmt *mgmt; u64 rhf; u32 maxcnt; u32 rhqoff; @@ -397,6 +398,12 @@ struct hfi1_packet { #define OPA_16B_L4_IB_GLOBAL 0x0A #define 
OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR +/* + * OPA 16B Management + */ +#define OPA_16B_L4_FM_PAD 3 /* fixed 3B pad */ +#define OPA_16B_L4_FM_HLEN 24 /* 16B(16) + L4_FM(8) */ + static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) { return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); @@ -473,6 +480,27 @@ static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) OPA_16B_BTH_PAD_MASK); } +/* + * 16B Management + */ +#define OPA_16B_MGMT_QPN_MASK 0xFFFFFF +static inline u32 hfi1_16B_get_dest_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->dest_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline u32 hfi1_16B_get_src_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->src_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline void hfi1_16B_set_qpn(struct opa_16b_mgmt *mgmt, + u32 dest_qp, u32 src_qp) +{ + mgmt->dest_qpn = cpu_to_be32(dest_qp & OPA_16B_MGMT_QPN_MASK); + mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK); +} + struct rvt_sge_state; /* diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 6ad203f6da88..1ab332f1866e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -399,16 +399,30 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; + u32 dest_qp = wqe->ud_wr.remote_qpn; + u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; + bool is_mgmt = false; ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; - /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ - ps->s_txreq->hdr_dwords = 9; - if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) - ps->s_txreq->hdr_dwords++; + + /* + * Build 16B Management Packet if either the destination + * or source queue pair number is 0 or 1. 
+ */ + if (dest_qp == 0 || src_qp == 0 || dest_qp == 1 || src_qp == 1) { + /* header size in dwords 16B LRH+L4_FM = (16+8)/4. */ + ps->s_txreq->hdr_dwords = 6; + is_mgmt = true; + } else { + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + ps->s_txreq->hdr_dwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + } /* SW provides space for CRC and LT for bypass packets. */ extra_bytes = hfi1_get_16b_padding((ps->s_txreq->hdr_dwords << 2), @@ -453,7 +467,14 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); - hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + if (is_mgmt) { + l4 = OPA_16B_L4_FM; + pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, + dest_qp, src_qp); + } else { + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + } /* Convert dwords to flits */ len = (ps->s_txreq->hdr_dwords + nwords) >> 1; @@ -845,10 +866,8 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, */ void hfi1_ud_rcv(struct hfi1_packet *packet) { - struct ib_other_headers *ohdr = packet->ohdr; u32 hdrsize = packet->hlen; struct ib_wc wc; - u32 qkey; u32 src_qp; u16 pkey; int mgmt_pkey_idx = -1; @@ -864,27 +883,35 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) u32 dlid = packet->dlid; u32 slid = packet->slid; u8 extra_bytes; + u8 l4 = 0; bool dlid_is_permissive; bool slid_is_permissive; + bool solicited = false; extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); - qkey = ib_get_qkey(ohdr); - src_qp = ib_get_sqpn(ohdr); if (packet->etype == RHF_RCV_TYPE_BYPASS) { u32 permissive_lid = opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + l4 = hfi1_16B_get_l4(packet->hdr); pkey = hfi1_16B_get_pkey(packet->hdr); dlid_is_permissive = (dlid == permissive_lid); slid_is_permissive = (slid == permissive_lid); } else { - pkey = 
ib_bth_get_pkey(ohdr); + pkey = ib_bth_get_pkey(packet->ohdr); dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); } sl_from_sc = ibp->sc_to_sl[sc5]; + if (likely(l4 != OPA_16B_L4_FM)) { + src_qp = ib_get_sqpn(packet->ohdr); + solicited = ib_bth_is_solicited(packet->ohdr); + } else { + src_qp = hfi1_16B_get_src_qpn(packet->mgmt); + } + process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); /* * Get the number of bytes the message was padded by @@ -922,8 +949,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (mgmt_pkey_idx < 0) goto drop; } - if (unlikely(qkey != qp->qkey)) /* Silent drop */ - return; + if (unlikely(l4 != OPA_16B_L4_FM && + ib_get_qkey(packet->ohdr) != qp->qkey)) + return; /* Silent drop */ /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && @@ -950,7 +978,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (qp->ibqp.qp_num > 1 && opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { - wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.ex.imm_data = packet->ohdr->u.ud.imm_data; wc.wc_flags = IB_WC_WITH_IMM; tlen -= sizeof(u32); } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { @@ -1047,8 +1075,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited); return; drop: diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index fc2e44cde161..08991874c0e2 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -617,7 +617,12 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet, wake_up(&mcast->wait); } else { /* Get the destination QP number. 
*/ - qp_num = ib_bth_get_qpn(packet->ohdr); + if (packet->etype == RHF_RCV_TYPE_BYPASS && + hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM) + qp_num = hfi1_16B_get_dest_qpn(packet->mgmt); + else + qp_num = ib_bth_get_qpn(packet->ohdr); + rcu_read_lock(); packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); if (!packet->qp) @@ -1308,21 +1313,23 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; - struct ib_other_headers *ohdr; + struct ib_other_headers *ohdr = NULL; send_routine sr; int ret; u16 pkey; u32 slid; + u8 l4 = 0; /* locate the pkey within the headers */ if (ps->s_txreq->phdr.hdr.hdr_type) { struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; - u8 l4 = hfi1_16B_get_l4(hdr); - if (l4 == OPA_16B_L4_IB_GLOBAL) - ohdr = &hdr->u.l.oth; - else + l4 = hfi1_16B_get_l4(hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) ohdr = &hdr->u.oth; + else if (l4 == OPA_16B_L4_IB_GLOBAL) + ohdr = &hdr->u.l.oth; + slid = hfi1_16B_get_slid(hdr); pkey = hfi1_16B_get_pkey(hdr); } else { @@ -1337,7 +1344,11 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) pkey = ib_bth_get_pkey(ohdr); } - ps->opcode = ib_bth_get_opcode(ohdr); + if (likely(l4 != OPA_16B_L4_FM)) + ps->opcode = ib_bth_get_opcode(ohdr); + else + ps->opcode = IB_OPCODE_UD_SEND_ONLY; + sr = get_send_routine(qp, ps); ret = egress_pkey_check(dd->pport, slid, pkey, priv->s_sc, qp->s_pkey_index); -- cgit v1.2.3 From 43a68c35c7b1135ec05b8c84e7509a50925b00b6 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:22 -0700 Subject: IB/hfi1: Add 16B Management Packet trace support Add trace support for 16B Management Packets. 
Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/trace.c | 40 ++++++-- drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 160 +++++++++++++++++++----------- 2 files changed, 130 insertions(+), 70 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 332b9b7c554a..7c8aed0ffc07 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -63,13 +63,20 @@ static u8 __get_ib_hdr_len(struct ib_header *hdr) static u8 __get_16b_hdr_len(struct hfi1_16b_header *hdr) { - struct ib_other_headers *ohdr; + struct ib_other_headers *ohdr = NULL; u8 opcode; + u8 l4 = hfi1_16B_get_l4(hdr); + + if (l4 == OPA_16B_L4_FM) { + opcode = IB_OPCODE_UD_SEND_ONLY; + return (8 + 8); /* No BTH */ + } - if (hfi1_16B_get_l4(hdr) == OPA_16B_L4_IB_LOCAL) + if (l4 == OPA_16B_L4_IB_LOCAL) ohdr = &hdr->u.oth; else ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); return hdr_len_by_opcode[opcode] == 0 ? 
0 : hdr_len_by_opcode[opcode] - (12 + 8 + 8); @@ -234,17 +241,24 @@ const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, #define BTH_16B_PRN \ "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d " \ "qpn:0x%.6x a:%d psn:0x%.8x" -const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, - u8 ack, bool becn, bool fecn, u8 mig, - u8 se, u8 pad, u8 opcode, const char *opname, - u8 tver, u16 pkey, u32 psn, u32 qpn) +#define L4_FM_16B_PRN \ + "op:0x%.2x,%s dest_qpn:0x%.6x src_qpn:0x%.6x" +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn) { const char *ret = trace_seq_buffer_ptr(p); if (bypass) - trace_seq_printf(p, BTH_16B_PRN, - opcode, opname, - se, mig, pad, tver, qpn, ack, psn); + if (l4 == OPA_16B_L4_FM) + trace_seq_printf(p, L4_FM_16B_PRN, + opcode, opname, dest_qpn, src_qpn); + else + trace_seq_printf(p, BTH_16B_PRN, + opcode, opname, + se, mig, pad, tver, qpn, ack, psn); else trace_seq_printf(p, BTH_9B_PRN, @@ -258,12 +272,17 @@ const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, const char *parse_everbs_hdrs( struct trace_seq *p, - u8 opcode, + u8 opcode, u8 l4, u32 dest_qpn, u32 src_qpn, void *ehdrs) { union ib_ehdrs *eh = ehdrs; const char *ret = trace_seq_buffer_ptr(p); + if (l4 == OPA_16B_L4_FM) { + trace_seq_printf(p, "mgmt pkt"); + goto out; + } + switch (opcode) { /* imm */ case OP(RC, SEND_LAST_WITH_IMMEDIATE): @@ -334,6 +353,7 @@ const char *parse_everbs_hdrs( be32_to_cpu(eh->ieth)); break; } +out: trace_seq_putc(p, 0); return ret; } diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 2847626d3819..1dc2c28fc96e 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -96,7 +96,9 @@ __print_symbolic(opcode, \ ib_opcode_name(CNP)) u8 ibhdr_exhdr_len(struct 
ib_header *hdr); -const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, + u8 l4, u32 dest_qpn, u32 src_qpn, + void *ehdrs); u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opah); u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet); const char *hfi1_trace_get_packet_l4_str(u8 l4); @@ -123,14 +125,16 @@ const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, u8 rc, u8 sc, u8 sl, u16 entropy, u16 len, u16 pkey, u32 dlid, u32 slid); -const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, - u8 ack, bool becn, bool fecn, u8 mig, - u8 se, u8 pad, u8 opcode, const char *opname, - u8 tver, u16 pkey, u32 psn, u32 qpn); +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn); const char *hfi1_trace_get_packet_l2_str(u8 l2); -#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) +#define __parse_ib_ehdrs(op, l4, dest_qpn, src_qpn, ehdrs) \ + parse_everbs_hdrs(p, op, l4, dest_qpn, src_qpn, ehdrs) #define lrh_name(lrh) { HFI1_##lrh, #lrh } #define show_lnh(lrh) \ @@ -169,6 +173,8 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __field(u32, psn) __field(u32, qpn) __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) /* extended headers */ __dynamic_array(u8, ehdrs, hfi1_trace_packet_hdr_len(packet)) @@ -178,6 +184,8 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __entry->etype = packet->etype; __entry->l2 = hfi1_16B_get_l2(packet->hdr); + __entry->dest_qpn = 0; + __entry->src_qpn = 0; if (__entry->etype == RHF_RCV_TYPE_BYPASS) { hfi1_trace_parse_16b_hdr(packet->hdr, &__entry->age, @@ -192,16 +200,23 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, &__entry->dlid, &__entry->slid); - hfi1_trace_parse_16b_bth(packet->ohdr, - &__entry->ack, - &__entry->mig, - 
&__entry->opcode, - &__entry->pad, - &__entry->se, - &__entry->tver, - &__entry->psn, - &__entry->qpn); + if (__entry->l4 == OPA_16B_L4_FM) { + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(packet->mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(packet->mgmt); + } else { + hfi1_trace_parse_16b_bth(packet->ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } } else { + __entry->l4 = OPA_16B_L4_9B; hfi1_trace_parse_9b_hdr(packet->hdr, sc5, &__entry->lnh, &__entry->lver, @@ -223,8 +238,9 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, &__entry->pkey, &__entry->psn, &__entry->qpn); - } - /* extended headers */ + } + /* extended headers */ + if (__entry->l4 != OPA_16B_L4_FM) memcpy(__get_dynamic_array(ehdrs), &packet->ohdr->u, __get_dynamic_array_len(ehdrs)); @@ -253,25 +269,31 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __entry->pkey, __entry->dlid, __entry->slid), - hfi1_trace_fmt_bth(p, - __entry->etype == + hfi1_trace_fmt_rest(p, + __entry->etype == RHF_RCV_TYPE_BYPASS, - __entry->ack, - __entry->becn, - __entry->fecn, - __entry->mig, - __entry->se, - __entry->pad, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->tver, - __entry->pkey, - __entry->psn, - __entry->qpn), + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), /* extended headers */ __get_dynamic_array_len(ehdrs), __parse_ib_ehdrs( __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, (void *)__get_dynamic_array(ehdrs)) ) ); @@ -310,6 +332,8 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, __field(u32, psn) __field(u32, qpn) __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) /* extended 
headers */ __dynamic_array(u8, ehdrs, hfi1_trace_opa_hdr_len(opah)) @@ -320,6 +344,8 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, DD_DEV_ASSIGN(dd); __entry->hdr_type = opah->hdr_type; + __entry->dest_qpn = 0; + __entry->src_qpn = 0; if (__entry->hdr_type) { hfi1_trace_parse_16b_hdr(&opah->opah, &__entry->age, @@ -334,19 +360,26 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, &__entry->dlid, &__entry->slid); - if (__entry->l4 == OPA_16B_L4_IB_LOCAL) - ohdr = &opah->opah.u.oth; - else - ohdr = &opah->opah.u.l.oth; - hfi1_trace_parse_16b_bth(ohdr, - &__entry->ack, - &__entry->mig, - &__entry->opcode, - &__entry->pad, - &__entry->se, - &__entry->tver, - &__entry->psn, - &__entry->qpn); + if (__entry->l4 == OPA_16B_L4_FM) { + ohdr = NULL; + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(&opah->opah.u.mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(&opah->opah.u.mgmt); + } else { + if (__entry->l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &opah->opah.u.oth; + else + ohdr = &opah->opah.u.l.oth; + hfi1_trace_parse_16b_bth(ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } } else { __entry->l4 = OPA_16B_L4_9B; hfi1_trace_parse_9b_hdr(&opah->ibh, sc5, @@ -376,8 +409,9 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, } /* extended headers */ - memcpy(__get_dynamic_array(ehdrs), - &ohdr->u, __get_dynamic_array_len(ehdrs)); + if (__entry->l4 != OPA_16B_L4_FM) + memcpy(__get_dynamic_array(ehdrs), + &ohdr->u, __get_dynamic_array_len(ehdrs)); ), TP_printk("[%s] (%s) %s %s hlen:%d %s", __get_str(dev), @@ -399,24 +433,30 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, __entry->pkey, __entry->dlid, __entry->slid), - hfi1_trace_fmt_bth(p, - !!__entry->hdr_type, - __entry->ack, - __entry->becn, - __entry->fecn, - __entry->mig, - __entry->se, - __entry->pad, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->tver, - __entry->pkey, 
- __entry->psn, - __entry->qpn), + hfi1_trace_fmt_rest(p, + !!__entry->hdr_type, + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), /* extended headers */ __get_dynamic_array_len(ehdrs), __parse_ib_ehdrs( __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, (void *)__get_dynamic_array(ehdrs)) ) ); -- cgit v1.2.3 From c8314811f9b2068eb53728d7a06b1ea195579e79 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 15 May 2018 18:31:09 -0700 Subject: IB/hfi1: Cleanup of exp_rcv The knowledge of the internal workings of the expect receive is too distributed. Fix by: - right size several rcd fields associated with expect receive - making an init entrance to init all the lists - consolidate all the allocations into an array anchored in the rcd Reviewed-by: Michael J. 
Ruhl Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/exp_rcv.c | 39 ++++++++++++++++++++++-------------- drivers/infiniband/hw/hfi1/exp_rcv.h | 24 +++++++++++++++++++++- drivers/infiniband/hw/hfi1/hfi.h | 14 +++++++------ drivers/infiniband/hw/hfi1/init.c | 4 +--- 4 files changed, 56 insertions(+), 25 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c index 0af91675acc6..1be49a0d9c11 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -52,12 +52,23 @@ * exp_tid_group_init - initialize exp_tid_set * @set - the set */ -void hfi1_exp_tid_group_init(struct exp_tid_set *set) +static void hfi1_exp_tid_set_init(struct exp_tid_set *set) { INIT_LIST_HEAD(&set->list); set->count = 0; } +/** + * hfi1_exp_tid_group_init - initialize rcd expected receive + * @rcd - the rcd + */ +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd) +{ + hfi1_exp_tid_set_init(&rcd->tid_group_list); + hfi1_exp_tid_set_init(&rcd->tid_used_list); + hfi1_exp_tid_set_init(&rcd->tid_full_list); +} + /** * alloc_ctxt_rcv_groups - initialize expected receive groups * @rcd - the context to add the groupings to @@ -68,13 +79,17 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) u32 tidbase; struct tid_group *grp; int i; + u32 ngroups; + ngroups = rcd->expected_count / dd->rcv_entries.group_size; + rcd->groups = + kcalloc_node(ngroups, sizeof(*rcd->groups), + GFP_KERNEL, rcd->numa_id); + if (!rcd->groups) + return -ENOMEM; tidbase = rcd->expected_base; - for (i = 0; i < rcd->expected_count / - dd->rcv_entries.group_size; i++) { - grp = kzalloc(sizeof(*grp), GFP_KERNEL); - if (!grp) - goto bail; + for (i = 0; i < ngroups; i++) { + grp = &rcd->groups[i]; grp->size = dd->rcv_entries.group_size; grp->base = tidbase; tid_group_add_tail(grp, &rcd->tid_group_list); @@ -82,9 
+97,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) } return 0; -bail: - hfi1_free_ctxt_rcv_groups(rcd); - return -ENOMEM; } /** @@ -100,15 +112,12 @@ bail: */ void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) { - struct tid_group *grp, *gptr; - WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list)); WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list)); - list_for_each_entry_safe(grp, gptr, &rcd->tid_group_list.list, list) { - tid_group_remove(grp, &rcd->tid_group_list); - kfree(grp); - } + kfree(rcd->groups); + rcd->groups = NULL; + hfi1_exp_tid_group_init(rcd); hfi1_clear_tids(rcd); } diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h index 08719047628a..f25362015095 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -183,8 +183,30 @@ static inline u32 rcventry2tidinfo(u32 rcventry) EXP_TID_SET(CTRL, 1 << (rcventry - pair)); } +/** + * hfi1_tid_group_to_idx - convert an index to a group + * @rcd - the receive context + * @grp - the group pointer + */ +static inline u16 +hfi1_tid_group_to_idx(struct hfi1_ctxtdata *rcd, struct tid_group *grp) +{ + return grp - &rcd->groups[0]; +} + +/** + * hfi1_idx_to_tid_group - convert a group to an index + * @rcd - the receive context + * @idx - the index + */ +static inline struct tid_group * +hfi1_idx_to_tid_group(struct hfi1_ctxtdata *rcd, u16 idx) +{ + return &rcd->groups[idx]; +} + int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); -void hfi1_exp_tid_group_init(struct exp_tid_set *set); +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd); #endif /* _HFI1_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index f49cd80df557..5eb3bf0849c7 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -231,13 +231,15 @@ struct hfi1_ctxtdata { /* job key */ u16 jkey; /* number of RcvArray groups 
for this context. */ - u32 rcv_array_groups; + u16 rcv_array_groups; /* index of first eager TID entry. */ - u32 eager_base; + u16 eager_base; /* number of expected TID entries */ - u32 expected_count; + u16 expected_count; /* index of first expected TID entry. */ - u32 expected_base; + u16 expected_base; + /* array of tid_groups */ + struct tid_group *groups; struct exp_tid_set tid_group_list; struct exp_tid_set tid_used_list; @@ -282,7 +284,7 @@ struct hfi1_ctxtdata { /* interrupt handling */ u64 imask; /* clear interrupt mask */ int ireg; /* clear interrupt register */ - unsigned numa_id; /* numa node of this context */ + int numa_id; /* numa node of this context */ /* verbs rx_stats per rcd */ struct hfi1_opcode_stats_perctx *opstats; @@ -909,9 +911,9 @@ typedef void (*hfi1_make_req)(struct rvt_qp *qp, #define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ struct rcv_array_data { - u8 group_size; u16 ngroups; u16 nctxt_extra; + u8 group_size; }; struct per_vl_data { diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 5d1adfc450d3..3feecf926322 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -361,9 +361,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, } INIT_LIST_HEAD(&rcd->qp_wait_list); - hfi1_exp_tid_group_init(&rcd->tid_group_list); - hfi1_exp_tid_group_init(&rcd->tid_used_list); - hfi1_exp_tid_group_init(&rcd->tid_full_list); + hfi1_exp_tid_group_init(rcd); rcd->ppd = ppd; rcd->dd = dd; __set_bit(0, rcd->in_use_ctxts); -- cgit v1.2.3 From 5938d94cf0380bca9e6fbb910c010f4483906c11 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 15 May 2018 18:31:17 -0700 Subject: IB/hfi1: Set port number for errorinfo MAD response For errorinfo MAD requests, the response has a 0 port number left over from a memset. Instead we should always set the port number in the response. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 983b5794a660..0307405491e0 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3424,6 +3424,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } + rsp->port_number = port; /* PortRcvErrorInfo */ rsp->port_rcv_ei.status_and_code = -- cgit v1.2.3 From 3ce459cd684b7f18ca79838e62310ffdc930920b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 15 May 2018 18:31:24 -0700 Subject: IB/{rdmavt,hfi1}: Change hrtimer add to use pinned version Given we are dealing with nano-second level timers, when the timer pops, ensure it happens on the CPU which caused the timer to be set in the first place. This avoids excessive jitter from the desired expiration time by avoiding the cost of switching our context to another CPU that is cache cold for this given timer. 
Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 79ee2b9e28c6..1a1a47ac53c6 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2012,7 +2012,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, unsigned long nsec = 1024 * ccti_timer; hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); } spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 6e9a351f45fb..40046135c509 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2225,7 +2225,7 @@ void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth) to = rvt_aeth_to_usec(aeth); trace_rvt_rnrnak_add(qp, to); hrtimer_start(&qp->s_rnr_timer, - ns_to_ktime(1000 * to), HRTIMER_MODE_REL); + ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED); } EXPORT_SYMBOL(rvt_add_rnr_timer); -- cgit v1.2.3 From 0252f73334f9ef68868e4684200bea3565a4fcee Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 18 May 2018 17:07:01 -0700 Subject: IB/qib: Fix DMA api warning with debug kernel The following error occurs in a debug build when running MPI PSM: [ 307.415911] WARNING: CPU: 4 PID: 23867 at lib/dma-debug.c:1158 check_unmap+0x4ee/0xa20 [ 307.455661] ib_qib 0000:05:00.0: DMA-API: device driver failed to check map error[device address=0x00000000df82b000] [size=4096 bytes] [mapped as page] [ 307.517494] Modules linked in: [ 307.531584] ib_isert iscsi_target_mod ib_srpt target_core_mod rpcrdma sunrpc ib_srp scsi_transport_srp scsi_tgt ib_iser libiscsi ib_ipoib scsi_transport_iscsi rdma_ucm ib_ucm 
ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_qib intel_powerclamp coretemp rdmavt intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel ipmi_ssif ib_core aesni_intel sg ipmi_si lrw gf128mul dca glue_helper ipmi_devintf iTCO_wdt gpio_ich hpwdt iTCO_vendor_support ablk_helper hpilo acpi_power_meter cryptd ipmi_msghandler ie31200_edac shpchp pcc_cpufreq lpc_ich pcspkr ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ahci crct10dif_pclmul crct10dif_common drm crc32c_intel libahci tg3 libata serio_raw ptp i2c_core [ 307.846113] pps_core dm_mirror dm_region_hash dm_log dm_mod [ 307.866505] CPU: 4 PID: 23867 Comm: mpitests-IMB-MP Kdump: loaded Not tainted 3.10.0-862.el7.x86_64.debug #1 [ 307.911178] Hardware name: HP ProLiant DL320e Gen8, BIOS J05 11/09/2013 [ 307.944206] Call Trace: [ 307.956973] [] dump_stack+0x19/0x1b [ 307.982201] [] __warn+0xd8/0x100 [ 308.005999] [] warn_slowpath_fmt+0x5f/0x80 [ 308.034260] [] check_unmap+0x4ee/0xa20 [ 308.060801] [] ? page_add_file_rmap+0x2a/0x1d0 [ 308.090689] [] debug_dma_unmap_page+0x9d/0xb0 [ 308.120155] [] ? might_fault+0xa0/0xb0 [ 308.146656] [] qib_tid_free.isra.14+0x215/0x2a0 [ib_qib] [ 308.180739] [] qib_write+0x894/0x1280 [ib_qib] [ 308.210733] [] ? __inode_security_revalidate+0x70/0x80 [ 308.244837] [] ? security_file_permission+0x27/0xb0 [ 308.266025] qib_ib0.8006: multicast join failed for ff12:401b:8006:0000:0000:0000:ffff:ffff, status -22 [ 308.323421] [] vfs_write+0xc3/0x1f0 [ 308.347077] [] ? fget_light+0xfc/0x510 [ 308.372533] [] SyS_write+0x8a/0x100 [ 308.396456] [] system_call_fastpath+0x1c/0x21 The code calls a qib_map_page() which has never correctly tested for a mapping error. Fix by testing for pci_dma_mapping_error() in all cases and properly handling the failure in the caller. Additionally, streamline qib_map_page() arguments to satisfy just the single caller. 
Cc: Reviewed-by: Alex Estrin Tested-by: Don Dutile Reviewed-by: Don Dutile Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib.h | 3 +-- drivers/infiniband/hw/qib/qib_file_ops.c | 10 +++++++--- drivers/infiniband/hw/qib/qib_user_pages.c | 20 ++++++++++++-------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 43a68d7b51bb..3461df002f81 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1424,8 +1424,7 @@ u64 qib_sps_ints(void); /* * dma_addr wrappers - all 0's invalid for hw */ -dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, - size_t, int); +int qib_map_page(struct pci_dev *d, struct page *p, dma_addr_t *daddr); struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index bbb720bfd030..98e1ce14fa2a 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -364,6 +364,8 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, goto done; } for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + dma_addr_t daddr; + for (; ntids--; tid++) { if (tid == tidcnt) tid = 0; @@ -380,12 +382,14 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, ret = -ENOMEM; break; } + ret = qib_map_page(dd->pcidev, pagep[i], &daddr); + if (ret) + break; + tidlist[i] = tid + tidoff; /* we "know" system pages and TID pages are same size */ dd->pageshadow[ctxttid + tid] = pagep[i]; - dd->physshadow[ctxttid + tid] = - qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE, - PCI_DMA_FROMDEVICE); + dd->physshadow[ctxttid + tid] = daddr; /* * don't need atomic or it's overhead */ diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c 
b/drivers/infiniband/hw/qib/qib_user_pages.c index ce83ba9a12ef..16543d5e80c3 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -99,23 +99,27 @@ bail: * * I'm sure we won't be so lucky with other iommu's, so FIXME. */ -dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) +int qib_map_page(struct pci_dev *hwdev, struct page *page, dma_addr_t *daddr) { dma_addr_t phys; - phys = pci_map_page(hwdev, page, offset, size, direction); + phys = pci_map_page(hwdev, page, 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(hwdev, phys)) + return -ENOMEM; - if (phys == 0) { - pci_unmap_page(hwdev, phys, size, direction); - phys = pci_map_page(hwdev, page, offset, size, direction); + if (!phys) { + pci_unmap_page(hwdev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE); + phys = pci_map_page(hwdev, page, 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(hwdev, phys)) + return -ENOMEM; /* * FIXME: If we get 0 again, we should keep this page, * map another, then free the 0 page. */ } - - return phys; + *daddr = phys; + return 0; } /** -- cgit v1.2.3 From cc3391cb5356edad235555e5930723cb4c0ac9af Mon Sep 17 00:00:00 2001 From: oulijun Date: Tue, 22 May 2018 20:47:16 +0800 Subject: RDMA/hns: Rename the idx field of db Bits 15:0 of the parameter field of the db structure have different meanings depending on whether the db type is sq, rq or srq.
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e0ab672e1c0a..a25c3daaff20 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -480,8 +480,8 @@ out: V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn); roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB); - roce_set_field(sq_db.parameter, V2_DB_PARAMETER_CONS_IDX_M, - V2_DB_PARAMETER_CONS_IDX_S, + roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M, + V2_DB_PARAMETER_IDX_S, qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)); roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M, V2_DB_PARAMETER_SL_S, qp->sl); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 182b6726f783..983c0be2afd0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -897,8 +897,8 @@ struct hns_roce_v2_mpt_entry { #define V2_DB_BYTE_4_CMD_S 24 #define V2_DB_BYTE_4_CMD_M GENMASK(27, 24) -#define V2_DB_PARAMETER_CONS_IDX_S 0 -#define V2_DB_PARAMETER_CONS_IDX_M GENMASK(15, 0) +#define V2_DB_PARAMETER_IDX_S 0 +#define V2_DB_PARAMETER_IDX_M GENMASK(15, 0) #define V2_DB_PARAMETER_SL_S 16 #define V2_DB_PARAMETER_SL_M GENMASK(18, 16) -- cgit v1.2.3 From 8f06228733582bcaba5533a5dcae7e017bd18317 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 22 May 2018 08:31:03 +0300 Subject: RDMA/mlx5: Remove debug prints of VMA pointers Remove various prints of VMA pointers. 
Reported-by: Michal Kalderon Signed-off-by: Leon Romanovsky Reviewed-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ab8cd5c034a2..f3e7d7cd3d87 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2059,10 +2059,6 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (err) return err; - mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); - return mlx5_ib_set_vma_data(vma, context); } @@ -2157,15 +2153,14 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, err = io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot); if (err) { - mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", - err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + mlx5_ib_err(dev, + "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n", + err, mmap_cmd2str(cmd)); err = -EAGAIN; goto err; } pa = pfn << PAGE_SHIFT; - mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), - vma->vm_start, &pa); err = mlx5_ib_set_vma_data(vma, context); if (err) @@ -2251,10 +2246,6 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; - - mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); break; case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); -- cgit v1.2.3 From 7b74a83cf54a3747e22c57e25712bd70eef8acee Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Mon, 21 May 2018 11:41:01 +0300 Subject: IB/mlx5: Fetch soft WQE's on fatal error 
state On fatal error the driver simulates CQE's for ULPs that rely on completion of all their posted work requests. For the GSI traffic, the mlx5 has its own mechanism that sends the completions via software CQE's directly to the relevant CQ. This should be kept in fatal error too, so the driver should simulate such CQE's with the specified error state in order to complete GSI QP work requests. Without the fix the following deadlock might appear: schedule_timeout+0x274/0x350 wait_for_common+0xec/0x240 mcast_remove_one+0xd0/0x120 [ib_core] ib_unregister_device+0x12c/0x230 [ib_core] mlx5_ib_remove+0xc4/0x270 [mlx5_ib] mlx5_detach_device+0x184/0x1a0 [mlx5_core] mlx5_unload_one+0x308/0x340 [mlx5_core] mlx5_pci_err_detected+0x74/0xe0 [mlx5_core] Cc: # 4.7 Fixes: 89ea94a7b6c4 ("IB/mlx5: Reset flow support for IB kernel ULPs") Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 77d257ec899b..9f6bc34cd4db 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -637,7 +637,7 @@ repoll: } static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, - struct ib_wc *wc) + struct ib_wc *wc, bool is_fatal_err) { struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); struct mlx5_ib_wc *soft_wc, *next; @@ -650,6 +650,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", cq->mcq.cqn); + if (unlikely(is_fatal_err)) { + soft_wc->wc.status = IB_WC_WR_FLUSH_ERR; + soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + } wc[npolled++] = soft_wc->wc; list_del(&soft_wc->list); kfree(soft_wc); @@ -670,12 +674,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
spin_lock_irqsave(&cq->lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + /* make sure no soft wqe's are waiting */ + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, true); + + mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled, + wc + soft_polled, &npolled); goto out; } if (unlikely(!list_empty(&cq->wc_list))) - soft_polled = poll_soft_wc(cq, num_entries, wc); + soft_polled = poll_soft_wc(cq, num_entries, wc, false); for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) -- cgit v1.2.3 From 25e62655c79395c596601a35805c3c7376d097b6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 22 May 2018 20:33:45 +0300 Subject: IB/core: Reduce the places that use zgid Instead of open coding memcmp() to check whether a given GID is zero or not, use a helper function to do so, and replace instances of memcpy(z,&zgid) with memset. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 19 ++++++++++++++----- drivers/infiniband/hw/mlx4/main.c | 5 +++-- drivers/infiniband/hw/mlx4/qp.c | 2 +- include/rdma/ib_cache.h | 1 + 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 5f1a8333a45a..82699f70e9b6 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -125,6 +125,16 @@ const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) } EXPORT_SYMBOL(ib_cache_gid_type_str); +/** rdma_is_zero_gid - Check if given GID is zero or not. + * @gid: GID to check + * Returns true if given GID is zero, returns false otherwise. 
+ */ +bool rdma_is_zero_gid(const union ib_gid *gid) +{ + return !memcmp(gid, &zgid, sizeof(*gid)); +} +EXPORT_SYMBOL(rdma_is_zero_gid); + int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; @@ -231,7 +241,7 @@ static int add_modify_gid(struct ib_gid_table *table, * So ignore such behavior for IB link layer and don't * fail the call, but don't add such entry to GID cache. */ - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return 0; } @@ -264,7 +274,7 @@ static void del_gid(struct ib_device *ib_dev, u8 port, if (rdma_protocol_roce(ib_dev, port)) del_roce_gid(ib_dev, port, table, ix); - memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid)); + memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid)); memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); table->data_vec[ix].context = NULL; } @@ -363,7 +373,7 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return -EINVAL; table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; @@ -724,8 +734,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { - if (memcmp(&table->data_vec[i].gid, &zgid, - sizeof(table->data_vec[i].gid))) { + if (!rdma_is_zero_gid(&table->data_vec[i].gid)) { del_gid(ib_dev, port, table, i); deleted = true; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5b70744f414a..bf12394c13c1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -276,7 +276,7 @@ static int mlx4_ib_add_gid(const union ib_gid *gid, found = i; break; } - if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid))) + if (free < 0 && rdma_is_zero_gid(&port_gid_table->gids[i].gid)) free = i; /* HW has space */ } 
@@ -345,7 +345,8 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) if (!ctx->refcount) { unsigned int real_index = ctx->real_index; - memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid)); + memset(&port_gid_table->gids[real_index].gid, 0, + sizeof(port_gid_table->gids[real_index].gid)); kfree(port_gid_table->gids[real_index].ctx); port_gid_table->gids[real_index].ctx = NULL; hw_update = 1; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 199648adac74..cd2c08c45334 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3078,7 +3078,7 @@ static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num, memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid)); *gid_type = port_gid_table->gids[index].gid_type; spin_unlock_irqrestore(&iboe->lock, flags); - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return -ENOENT; return 0; diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index eb49cc8d1f95..a5f249828115 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -149,4 +149,5 @@ int ib_get_cached_port_state(struct ib_device *device, u8 port_num, enum ib_port_state *port_active); +bool rdma_is_zero_gid(const union ib_gid *gid); #endif /* _IB_CACHE_H */ -- cgit v1.2.3 From 5b6eb54f586ba0a6385f1523bce4c96cbdb79afd Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Wed, 23 May 2018 18:16:27 +0800 Subject: RDMA/hns: Modify uar allocation algorithm to avoid bitmap exhaust This patch modified uar allocation algorithm in hns_roce_uar_alloc function to avoid bitmap exhaust. 
Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_pd.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 53c2f1b8d068..412297d4b86c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -214,6 +214,7 @@ enum { struct hns_roce_uar { u64 pfn; unsigned long index; + unsigned long logic_idx; }; struct hns_roce_ucontext { diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 4b41e041799c..b9f2c871ff9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -107,13 +107,15 @@ int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) int ret = 0; /* Using bitmap to manager UAR index */ - ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->index); + ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx); if (ret == -1) return -ENOMEM; - if (uar->index > 0) - uar->index = (uar->index - 1) % + if (uar->logic_idx > 0 && hr_dev->caps.phy_num_uars > 1) + uar->index = (uar->logic_idx - 1) % (hr_dev->caps.phy_num_uars - 1) + 1; + else + uar->index = 0; if (!dev_is_pci(hr_dev->dev)) { res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); @@ -132,7 +134,7 @@ int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) void hns_roce_uar_free(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) { - hns_roce_bitmap_free(&hr_dev->uar_table.bitmap, uar->index, + hns_roce_bitmap_free(&hr_dev->uar_table.bitmap, uar->logic_idx, BITMAP_NO_RR); } -- cgit v1.2.3 From d59fcacc4b089c9920ff4a148e33a3f3f7275ef6 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Wed, 23 May 2018 18:16:28 
+0800 Subject: RDMA/hns: Increase checking CMQ status timeout value This patch increases checking CMQ status timeout value and uses the same value with NIC driver to avoid deficiency of time. Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 983c0be2afd0..2caeb4cdad5c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -76,7 +76,7 @@ #define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000 #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 #define HNS_ROCE_INVALID_LKEY 0x100 -#define HNS_ROCE_CMQ_TX_TIMEOUT 200 +#define HNS_ROCE_CMQ_TX_TIMEOUT 30000 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 -- cgit v1.2.3 From d8f9cc328c8888369880e2527e9186d745f2bbf6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 23 May 2018 15:30:31 +0300 Subject: IB/mlx4: Mark user MR as writable if actual virtual memory is writable To allow rereg_user_mr to modify the MR from read-only to writable without using get_user_pages again, we needed to define the initial MR as writable. However, this was originally done unconditionally, without taking into account the writability of the underlying virtual memory. As a result, any attempt to register a read-only MR over read-only virtual memory failed. To fix this, do not add the writable flag bit when the user virtual memory is not writable (e.g. const memory). However, when the underlying memory is NOT writable (and we therefore do not define the initial MR as writable), the IB core adds a "force writable" flag to its user-pages request. If this succeeds, the reg_user_mr caller gets a writable copy of the original pages. 
If the user-space caller then does a rereg_user_mr operation to enable writability, this will succeed. This should not be allowed, since the original virtual memory was not writable. Cc: Fixes: 9376932d0c26 ("IB/mlx4_ib: Add support for user MR re-registration") Signed-off-by: Jason Gunthorpe Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/mr.c | 50 ++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 61d8b06375bb..ed1f253faf97 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,6 +367,40 @@ end: return block_shift; } +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, + u64 length, u64 virt_addr, + int access_flags) +{ + /* + * Force registering the memory as writable if the underlying pages + * are writable. This is so rereg can change the access permissions + * from readable to writable without having to run through ib_umem_get + * again + */ + if (!ib_access_writable(access_flags)) { + struct vm_area_struct *vma; + + down_read(&current->mm->mmap_sem); + /* + * FIXME: Ideally this would iterate over all the vmas that + * cover the memory, but for now it requires a single vma to + * entirely cover the MR to support RO mappings.
+ */ + vma = find_vma(current->mm, start); + if (vma && vma->vm_end >= start + length && + vma->vm_start <= start) { + if (vma->vm_flags & VM_WRITE) + access_flags |= IB_ACCESS_LOCAL_WRITE; + } else { + access_flags |= IB_ACCESS_LOCAL_WRITE; + } + + up_read(&current->mm->mmap_sem); + } + + return ib_umem_get(context, start, length, access_flags, 0); +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -381,10 +415,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - /* Force registering the memory as writable. */ - /* Used for memory re-registeration. HCA protects the access */ - mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags | IB_ACCESS_LOCAL_WRITE, 0); + mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length, + virt_addr, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -454,6 +486,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, } if (flags & IB_MR_REREG_ACCESS) { + if (ib_access_writable(mr_access_flags) && !mmr->umem->writable) + return -EPERM; + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, convert_access(mr_access_flags)); @@ -467,10 +502,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = ib_umem_get(mr->uobject->context, start, length, - mr_access_flags | - IB_ACCESS_LOCAL_WRITE, - 0); + mmr->umem = + mlx4_get_umem_mr(mr->uobject->context, start, length, + virt_addr, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ -- cgit v1.2.3 From 572f46bf947c3eeca8d16518e0fb70f9250b4416 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Sun, 27 May 2018 13:42:33 +0300 Subject: IB/mlx5: Refactor CQE compression response Refactor CQE compression response to be fully
set only when it's really supported. There is no change from user perspective because anyway resp.cqe_comp_caps.max_num was set to zero. Reviewed-by: Yishai Hadas Signed-off-by: Yonatan Cohen W Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index daa919e5a442..029c310a0dd1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -983,13 +983,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { - resp.cqe_comp_caps.max_num = - MLX5_CAP_GEN(dev->mdev, cqe_compression) ? - MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0; - resp.cqe_comp_caps.supported_format = - MLX5_IB_CQE_RES_FORMAT_HASH | - MLX5_IB_CQE_RES_FORMAT_CSUM; resp.response_length += sizeof(resp.cqe_comp_caps); + + if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + resp.cqe_comp_caps.max_num = + MLX5_CAP_GEN(dev->mdev, + cqe_compression_max_num); + + resp.cqe_comp_caps.supported_format = + MLX5_IB_CQE_RES_FORMAT_HASH | + MLX5_IB_CQE_RES_FORMAT_CSUM; + } } if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && -- cgit v1.2.3 From 6f1006a43869ff82745eea3b88204d0a3bcc0158 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Sun, 27 May 2018 13:42:34 +0300 Subject: IB/mlx5: Introduce a new mini-CQE format The new mini-CQE format includes the stride index, byte count and packet checksum. Stride index is needed for striding WQ feature. This patch exposes this capability and enables its setting via mlx5 UHW data as part of query device and cq creation.
Reviewed-by: Yishai Hadas Reviewed-by: Guy Levi Signed-off-by: Yonatan Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 42 +++++++++++++++++++++++++++++---------- drivers/infiniband/hw/mlx5/main.c | 4 ++++ include/uapi/rdma/mlx5-abi.h | 2 +- 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 6d52ea03574e..68775e12d721 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -742,6 +742,28 @@ static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev, return 0; } +enum { + MLX5_CQE_RES_FORMAT_HASH = 0, + MLX5_CQE_RES_FORMAT_CSUM = 1, + MLX5_CQE_RES_FORMAT_CSUM_STRIDX = 3, +}; + +static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) +{ + switch (format) { + case MLX5_IB_CQE_RES_FORMAT_HASH: + return MLX5_CQE_RES_FORMAT_HASH; + case MLX5_IB_CQE_RES_FORMAT_CSUM: + return MLX5_CQE_RES_FORMAT_CSUM; + case MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX: + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + return MLX5_CQE_RES_FORMAT_CSUM_STRIDX; + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, int entries, u32 **cqb, @@ -807,6 +829,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *index = to_mucontext(context)->bfregi.sys_pages[0]; if (ucmd.cqe_comp_en == 1) { + int mini_cqe_format; + if (!((*cqe_size == 128 && MLX5_CAP_GEN(dev->mdev, cqe_compression_128)) || (*cqe_size == 64 && @@ -817,20 +841,18 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, goto err_cqb; } - if (unlikely(!ucmd.cqe_comp_res_format || - !(ucmd.cqe_comp_res_format < - MLX5_IB_CQE_RES_RESERVED) || - (ucmd.cqe_comp_res_format & - (ucmd.cqe_comp_res_format - 1)))) { - err = -EOPNOTSUPP; - 
mlx5_ib_warn(dev, "CQE compression res format %d is not supported!\n", - ucmd.cqe_comp_res_format); + mini_cqe_format = + mini_cqe_res_format_to_hw(dev, + ucmd.cqe_comp_res_format); + if (mini_cqe_format < 0) { + err = mini_cqe_format; + mlx5_ib_dbg(dev, "CQE compression res format %d error: %d\n", + ucmd.cqe_comp_res_format, err); goto err_cqb; } MLX5_SET(cqc, cqc, cqe_comp_en, 1); - MLX5_SET(cqc, cqc, mini_cqe_res_format, - ilog2(ucmd.cqe_comp_res_format)); + MLX5_SET(cqc, cqc, mini_cqe_res_format, mini_cqe_format); } if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 029c310a0dd1..e0894b203d59 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -993,6 +993,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.cqe_comp_caps.supported_format = MLX5_IB_CQE_RES_FORMAT_HASH | MLX5_IB_CQE_RES_FORMAT_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + resp.cqe_comp_caps.supported_format |= + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX; } } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index cb4a02c4a1ce..9783648d5511 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -163,7 +163,7 @@ struct mlx5_ib_rss_caps { enum mlx5_ib_cqe_comp_res_format { MLX5_IB_CQE_RES_FORMAT_HASH = 1 << 0, MLX5_IB_CQE_RES_FORMAT_CSUM = 1 << 1, - MLX5_IB_CQE_RES_RESERVED = 1 << 2, + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX = 1 << 2, }; struct mlx5_ib_cqe_comp_caps { -- cgit v1.2.3 From cb7a94c9c808d291d813f90bdb53e2005324a332 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:24 +0800 Subject: RDMA/hns: Add reset process for RoCE in hip08 This patch added reset process for RoCE in hip08. 
Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 3 ++ drivers/infiniband/hw/hns/hns_roce_device.h | 2 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 76 +++++++++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 7 +++ 4 files changed, 88 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 9ebe839d8b24..a0ba19d4a10e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -176,6 +176,9 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, u8 op_modifier, u16 op, unsigned long timeout) { + if (hr_dev->is_reset) + return 0; + if (hr_dev->cmd.use_events) return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, op_modifier, op, diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 412297d4b86c..da8512b40252 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -774,6 +774,8 @@ struct hns_roce_dev { const char *irq_names[HNS_ROCE_MAX_IRQ_NUM]; spinlock_t sm_lock; spinlock_t bt_cmd_lock; + bool active; + bool is_reset; struct hns_roce_ib_iboe iboe; struct list_head pgdir_list; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dd716ed60661..166f0469b5f5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -775,6 +775,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, int ret = 0; int ntc; + if (hr_dev->is_reset) + return 0; + spin_lock_bh(&csq->lock); if (num > hns_roce_cmq_space(csq)) { @@ -4804,14 +4807,87 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, { struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + if 
(!hr_dev) + return; + hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } +static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct ib_event event; + + if (!hr_dev) { + dev_err(&handle->pdev->dev, + "Input parameter handle->priv is NULL!\n"); + return -EINVAL; + } + + hr_dev->active = false; + hr_dev->is_reset = true; + + event.event = IB_EVENT_DEVICE_FATAL; + event.device = &hr_dev->ib_dev; + event.element.port_num = 1; + ib_dispatch_event(&event); + + return 0; +} + +static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) +{ + int ret; + + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) { + /* when reset notify type is HNAE3_INIT_CLIENT In reset notify + * callback function, RoCE Engine reinitialize. If RoCE reinit + * failed, we should inform NIC driver. + */ + handle->priv = NULL; + dev_err(&handle->pdev->dev, + "In reset process RoCE reinit failed %d.\n", ret); + } + + return ret; +} + +static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) +{ + msleep(100); + hns_roce_hw_v2_uninit_instance(handle, false); + return 0; +} + +static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, + enum hnae3_reset_notify_type type) +{ + int ret = 0; + + switch (type) { + case HNAE3_DOWN_CLIENT: + ret = hns_roce_hw_v2_reset_notify_down(handle); + break; + case HNAE3_INIT_CLIENT: + ret = hns_roce_hw_v2_reset_notify_init(handle); + break; + case HNAE3_UNINIT_CLIENT: + ret = hns_roce_hw_v2_reset_notify_uninit(handle); + break; + default: + break; + } + + return ret; +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .reset_notify = hns_roce_hw_v2_reset_notify, }; static struct hnae3_client hns_roce_hw_v2_client = { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
b/drivers/infiniband/hw/hns/hns_roce_main.c index c614f9182b1a..fbb0c0a857b8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -332,6 +332,9 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + if (!hr_dev->active) + return ERR_PTR(-EAGAIN); + resp.qp_tab_size = hr_dev->caps.num_qps; context = kmalloc(sizeof(*context), GFP_KERNEL); @@ -425,6 +428,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); ib_unregister_device(&hr_dev->ib_dev); } @@ -536,6 +540,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) goto error_failed_setup_mtu_mac; } + hr_dev->active = true; return 0; error_failed_setup_mtu_mac: @@ -728,6 +733,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) return ret; } } + hr_dev->is_reset = false; if (hr_dev->hw->cmq_init) { ret = hr_dev->hw->cmq_init(hr_dev); @@ -827,6 +833,7 @@ EXPORT_SYMBOL_GPL(hns_roce_init); void hns_roce_exit(struct hns_roce_dev *hr_dev) { hns_roce_unregister_device(hr_dev); + if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); hns_roce_cleanup_bitmap(hr_dev); -- cgit v1.2.3 From 0b25c9cc53b5c0f87fab5e3cab0ff64e8d4ccc0b Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:25 +0800 Subject: RDMA/hns: Fix the illegal memory operation when cross page This patch fixed the potential illegal operation when using the extend sge buffer cross page in post send operation. The bug will cause the calltrace as below. 
[ 3302.922107] Unable to handle kernel paging request at virtual address ffff00003b3a0004 [ 3302.930009] Mem abort info: [ 3302.932790] Exception class = DABT (current EL), IL = 32 bits [ 3302.938695] SET = 0, FnV = 0 [ 3302.941735] EA = 0, S1PTW = 0 [ 3302.944863] Data abort info: [ 3302.947729] ISV = 0, ISS = 0x00000047 [ 3302.951551] CM = 0, WnR = 1 [ 3302.954506] swapper pgtable: 4k pages, 48-bit VAs, pgd = ffff000009ea5000 [ 3302.961279] [ffff00003b3a0004] *pgd=00000023dfffe003, *pud=00000023dfffd003, *pmd=00000022dc84c003, *pte=0000000000000000 [ 3302.972224] Internal error: Oops: 96000047 [#1] SMP [ 3302.999509] CPU: 9 PID: 19628 Comm: roce_test_main Tainted: G OE 4.14.10 #1 [ 3303.007498] task: ffff80234df78000 task.stack: ffff00000f640000 [ 3303.013412] PC is at hns_roce_v2_post_send+0x690/0xe20 [hns_roce_pci] [ 3303.019843] LR is at hns_roce_v2_post_send+0x658/0xe20 [hns_roce_pci] [ 3303.026269] pc : [] lr : [] pstate: 804001c9 [ 3303.033649] sp : ffff00000f643870 [ 3303.036951] x29: ffff00000f643870 x28: ffff80232bfa9c00 [ 3303.042250] x27: ffff80234d909380 x26: ffff00003b37f0c0 [ 3303.047549] x25: 0000000000000000 x24: 0000000000000003 [ 3303.052848] x23: 0000000000000000 x22: 0000000000000000 [ 3303.058148] x21: 0000000000000101 x20: 0000000000000001 [ 3303.063447] x19: ffff80236163f800 x18: 0000000000000000 [ 3303.068746] x17: 0000ffff86b76fc8 x16: ffff000008301600 [ 3303.074045] x15: 000020a51c000000 x14: 3128726464615f65 [ 3303.079344] x13: 746f6d6572202c29 x12: 303035312879656b [ 3303.084643] x11: 723a6f666e692072 x10: 573a6f666e693a5d [ 3303.089943] x9 : 0000000000000004 x8 : ffff8023ce38b000 [ 3303.095242] x7 : ffff8023ce38b320 x6 : 0000000000000418 [ 3303.100541] x5 : ffff80232bfa9cc8 x4 : 0000000000000030 [ 3303.105839] x3 : 0000000000000100 x2 : 0000000000000200 [ 3303.111138] x1 : 0000000000000320 x0 : ffff00003b3a0000 [ 3303.116438] Process roce_test_main (pid: 19628, stack limit = 0xffff00000f640000) [ 3303.123906] Call trace: [ 
3303.126339] Exception stack(0xffff00000f643730 to 0xffff00000f643870) [ 3303.215790] [] hns_roce_v2_post_send+0x690/0xe20 [hns_roce_pci] [ 3303.223293] [] rt_ktest_post_send+0x5d0/0x8b8 [rdma_test] [ 3303.230261] [] exec_send_cmd+0x664/0x1350 [rdma_test] [ 3303.236881] [] rt_ktest_dispatch_cmd_3+0x1510/0x3790 [rdma_test] [ 3303.244455] [] rt_ktest_dispatch_cmd_2+0xa4/0x118 [rdma_test] [ 3303.251770] [] rt_ktest_dispatch_cmd+0x124/0xaa8 [rdma_test] [ 3303.258997] [] rt_ktest_dev_write+0x2cc/0x568 [rdma_test] [ 3303.265947] [] __vfs_write+0x60/0x18c [ 3303.271158] [] vfs_write+0xa8/0x198 [ 3303.276196] [] SyS_write+0x6c/0xd4 [ 3303.281147] Exception stack(0xffff00000f643ec0 to 0xffff00000f644000) [ 3303.287573] 3ec0: 0000000000000003 0000fffffc85faa8 0000000000004e60 0000000000000000 [ 3303.295388] 3ee0: 0000000021fb2000 000000000000ffff eff0e3efe4e58080 0000fffffcc724fe [ 3303.303204] 3f00: 0000000000000040 1999999999999999 0101010101010101 0000000000000038 [ 3303.311019] 3f20: 0000000000000005 ffffffffffffffff 0d73757461747320 ffffffffffffffff [ 3303.318835] 3f40: 0000000000000000 0000000000459b00 0000fffffc85e360 000000000043d788 [ 3303.326650] 3f60: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 3303.334465] 3f80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 3303.342281] 3fa0: 0000000000000000 0000fffffc85e570 0000000000438804 0000fffffc85e570 [ 3303.350096] 3fc0: 0000ffff8553f618 0000000080000000 0000000000000003 0000000000000040 [ 3303.357911] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 3303.365729] [] __sys_trace_return+0x0/0x4 [ 3303.371288] Code: b94008e9 34000129 b9400ce2 110006b5 (b9000402) [ 3303.377377] ---[ end trace fd5ab98b3325cf9a ]--- Reported-by: Jie Chen Reported-by: Xiping Zhang (Francis) Fixes: b1c158350968("RDMA/hns: Get rid of virt_to_page and vmap calls after dma_alloc_coherent") Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- 
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 72 +++++++++++++++++++++--------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + 2 files changed, 53 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 166f0469b5f5..0e8dad68910a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,53 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static void set_extend_sge(struct hns_roce_qp *qp, struct ib_send_wr *wr, + unsigned int *sge_ind) +{ + struct hns_roce_v2_wqe_data_seg *dseg; + struct ib_sge *sg; + int num_in_wqe = 0; + int extend_sge_num; + int fi_sge_num; + int se_sge_num; + int shift; + int i; + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + num_in_wqe = HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; + extend_sge_num = wr->num_sge - num_in_wqe; + sg = wr->sg_list + num_in_wqe; + shift = qp->hr_buf.page_shift; + + /* + * Check whether wr->num_sge sges are in the same page. If not, we + * should calculate how many sges in the first page and the second + * page. 
+ */ + dseg = get_send_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1)); + fi_sge_num = (round_up((uintptr_t)dseg, 1 << shift) - + (uintptr_t)dseg) / + sizeof(struct hns_roce_v2_wqe_data_seg); + if (extend_sge_num > fi_sge_num) { + se_sge_num = extend_sge_num - fi_sge_num; + for (i = 0; i < fi_sge_num; i++) { + set_data_seg_v2(dseg++, sg + i); + (*sge_ind)++; + } + dseg = get_send_extend_sge(qp, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + for (i = 0; i < se_sge_num; i++) { + set_data_seg_v2(dseg++, sg + fi_sge_num + i); + (*sge_ind)++; + } + } else { + for (i = 0; i < extend_sge_num; i++) { + set_data_seg_v2(dseg++, sg + i); + (*sge_ind)++; + } + } +} + static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, void *wqe, unsigned int *sge_ind, @@ -85,7 +133,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1); } else { - if (wr->num_sge <= 2) { + if (wr->num_sge <= HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) { for (i = 0; i < wr->num_sge; i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); @@ -98,24 +146,14 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, (*sge_ind) & (qp->sge.sge_cnt - 1)); - for (i = 0; i < 2; i++) { + for (i = 0; i < HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); dseg++; } } - dseg = get_send_extend_sge(qp, - (*sge_ind) & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < wr->num_sge - 2; i++) { - if (likely(wr->sg_list[i + 2].length)) { - set_data_seg_v2(dseg, - wr->sg_list + 2 + i); - dseg++; - (*sge_ind)++; - } - } + set_extend_sge(qp, wr, sge_ind); } roce_set_field(rc_sq_wqe->byte_16, @@ -319,13 +357,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN_V2); - dseg 
= get_send_extend_sge(qp, - sge_ind & (qp->sge.sge_cnt - 1)); - for (i = 0; i < wr->num_sge; i++) { - set_data_seg_v2(dseg + i, wr->sg_list + i); - sge_ind++; - } - + set_extend_sge(qp, wr, &sge_ind); ind++; } else if (ibqp->qp_type == IB_QPT_RC) { rc_sq_wqe = wqe; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 2caeb4cdad5c..d47675f365c7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -77,6 +77,7 @@ #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 #define HNS_ROCE_INVALID_LKEY 0x100 #define HNS_ROCE_CMQ_TX_TIMEOUT 30000 +#define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 -- cgit v1.2.3 From a0976f418daf6f93c3c572767f0cf1e770df4717 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:26 +0800 Subject: RDMA/uverbs: Hoist the common process of disassociate_ucontext into ib core This patch hoisted the common process of disassociate_ucontext callback function into ib core code, and this code is common to every ib_device driver.
Signed-off-by: Wei Hu (Xavier) Acked-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 42 ++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx4/main.c | 34 ---------------------------- drivers/infiniband/hw/mlx5/main.c | 34 ---------------------------- 3 files changed, 41 insertions(+), 69 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 4445d8ee9314..3ae2339dd27a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include #include @@ -1090,6 +1092,44 @@ err: return; } +static void ib_uverbs_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct ib_device *ib_dev = ibcontext->device; + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. 
+ */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + down_write(&owning_mm->mmap_sem); + ib_dev->disassociate_ucontext(ibcontext); + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { @@ -1130,7 +1170,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, * (e.g mmput). */ ib_uverbs_event_handler(&file->event_handler, &event); - ib_dev->disassociate_ucontext(ucontext); + ib_uverbs_disassociate_ucontext(ucontext); mutex_lock(&file->cleanup_mutex); ib_uverbs_cleanup_ucontext(file, ucontext, true); mutex_unlock(&file->cleanup_mutex); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf12394c13c1..59aed458d3be 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1189,40 +1189,10 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) int ret = 0; struct vm_area_struct *vma; struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - /* make sure that task is dead before returning, it may - * prevent a rare case of module down in parallel to a - * call to mlx4_ib_vma_close. 
- */ - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the task struct */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } /* need to protect from a race on closing the vma as part of * mlx4_ib_vma_close(). */ - down_write(&owning_mm->mmap_sem); for (i = 0; i < HW_BAR_COUNT; i++) { vma = context->hw_bar_info[i].vma; if (!vma) @@ -1241,10 +1211,6 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) /* context going to be destroyed, should not access ops any more */ context->hw_bar_info[i].vma->vm_ops = NULL; } - - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); } static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 92879d2d3026..a182d19c557e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1973,38 +1973,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) struct vm_area_struct *vma; struct mlx5_ib_vma_private_data *vma_private, *n; struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - 
/* in case task was dead need to release the - * task struct. - */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - /* need to protect from a race on closing the vma as part of - * mlx5_ib_vma_close. - */ - down_write(&owning_mm->mmap_sem); mutex_lock(&context->vma_private_list_mutex); list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { @@ -2021,9 +1990,6 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) kfree(vma_private); } mutex_unlock(&context->vma_private_list_mutex); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) -- cgit v1.2.3 From fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:27 +0800 Subject: RDMA/hns: Implement the disassociate_ucontext API This patch implemented the IB core disassociate_ucontext API. Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_device.h | 8 ++++ drivers/infiniband/hw/hns/hns_roce_main.c | 70 ++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index da8512b40252..31221d506d9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -217,11 +217,19 @@ struct hns_roce_uar { unsigned long logic_idx; }; +struct hns_roce_vma_data { + struct list_head list; + struct vm_area_struct *vma; + struct mutex *vma_list_mutex; +}; + struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; struct list_head page_list; struct mutex page_mutex; + struct list_head vma_list; + struct mutex vma_list_mutex; }; struct hns_roce_pd { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
b/drivers/infiniband/hw/hns/hns_roce_main.c index fbb0c0a857b8..08c795e11cdd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -345,6 +345,8 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_uar_alloc; + INIT_LIST_HEAD(&context->vma_list); + mutex_init(&context->vma_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { INIT_LIST_HEAD(&context->page_list); mutex_init(&context->page_mutex); @@ -375,6 +377,50 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } +static void hns_roce_vma_open(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + +static void hns_roce_vma_close(struct vm_area_struct *vma) +{ + struct hns_roce_vma_data *vma_data; + + vma_data = (struct hns_roce_vma_data *)vma->vm_private_data; + vma_data->vma = NULL; + mutex_lock(vma_data->vma_list_mutex); + list_del(&vma_data->list); + mutex_unlock(vma_data->vma_list_mutex); + kfree(vma_data); +} + +static const struct vm_operations_struct hns_roce_vm_ops = { + .open = hns_roce_vma_open, + .close = hns_roce_vma_close, +}; + +static int hns_roce_set_vma_data(struct vm_area_struct *vma, + struct hns_roce_ucontext *context) +{ + struct list_head *vma_head = &context->vma_list; + struct hns_roce_vma_data *vma_data; + + vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL); + if (!vma_data) + return -ENOMEM; + + vma_data->vma = vma; + vma_data->vma_list_mutex = &context->vma_list_mutex; + vma->vm_private_data = vma_data; + vma->vm_ops = &hns_roce_vm_ops; + + mutex_lock(&context->vma_list_mutex); + list_add(&vma_data->list, vma_head); + mutex_unlock(&context->vma_list_mutex); + + return 0; +} + static int hns_roce_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { @@ -400,7 +446,7 @@ static int hns_roce_mmap(struct ib_ucontext *context, } else return -EINVAL; - return 0; + return hns_roce_set_vma_data(vma, to_hr_ucontext(context)); } static 
int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, @@ -424,6 +470,27 @@ static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, return 0; } +static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); + struct hns_roce_vma_data *vma_data, *n; + struct vm_area_struct *vma; + int ret; + + mutex_lock(&context->vma_list_mutex); + list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { + vma = vma_data->vma; + ret = zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + vma->vm_ops = NULL; + list_del(&vma_data->list); + kfree(vma_data); + } + mutex_unlock(&context->vma_list_mutex); +} + static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; @@ -519,6 +586,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) /* OTHERS */ ib_dev->get_port_immutable = hns_roce_port_immutable; + ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; ret = ib_register_device(ib_dev, NULL); -- cgit v1.2.3 From 367d2f0787e8363f30cbac4d5270a772b69828c1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 30 May 2018 10:40:29 +0100 Subject: RDMA/qedr: fix spelling mistake: "adrresses" -> "addresses" Trivial fix to spelling mistake in DP_ERR error message Signed-off-by: Colin Ian King Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 988aace89430..614a954d0757 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -414,7 +414,7 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if ((vma->vm_start & 
(PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) { DP_ERR(dev, - "failed mmap, adrresses must be page aligned: start=0x%pK, end=0x%pK\n", + "failed mmap, addresses must be page aligned: start=0x%pK, end=0x%pK\n", (void *)vma->vm_start, (void *)vma->vm_end); return -EINVAL; } -- cgit v1.2.3 From f77f3036264b2e0d9abd2938b8999dc9d33819ed Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:13 +0300 Subject: RDMA/mlx4: Catch FW<->SW misalignment without machine crash Any steering QP is supposed to be above steer_qpn_base, see function mlx4_ib_steer_qp_alloc() for it; however, in case of misalignment between SW and FW, this qp_base can be wrong. Use WARN() to catch such a situation without killing the machine. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 59aed458d3be..1fea1497263b 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3017,7 +3017,10 @@ void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) return; - BUG_ON(qpn < dev->steer_qpn_base); + if (WARN(qpn < dev->steer_qpn_base, "qpn = %u, steer_qpn_base = %u\n", + qpn, dev->steer_qpn_base)) + /* not supposed to be here */ + return; bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, -- cgit v1.2.3 From 6b1ca7ece15e94251d1d0d919f813943e4a58059 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:14 +0300 Subject: RDMA/mlx4: Discard unknown SQP work requests There is no need to crash the machine if an unknown work request was received in SQP MAD.
Cc: # 3.6 Fixes: 37bfc7c1e83f ("IB/mlx4: SR-IOV multiplex and demultiplex MADs") Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mad.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 0793a21d76f4..d604b3d5aa3e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1934,7 +1934,6 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) "buf:%lld\n", wc.wr_id); break; default: - BUG_ON(1); break; } } else { -- cgit v1.2.3 From cd13a399e66c1b9b039064e8aa2f959eb90d6947 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:15 +0300 Subject: RDMA/cxgb3: Don't crash kernel just because IDR is full cxgb3 driver properly handles errors returned by IDR, so there is no need to have special case (kernel crash) just because IDR is full. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb3/iwch.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h index 837862287a29..c69bc4f52049 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -162,7 +162,6 @@ static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, spin_unlock_irq(&rhp->lock); idr_preload_end(); - BUG_ON(ret == -ENOSPC); return ret < 0 ? ret : 0; } -- cgit v1.2.3 From 2cb4079188a1421520372f5e57ddaceab39435aa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 15:14:05 +0300 Subject: RDMA/mlx5: Don't check return value of zap_vma_ptes() There is no need to check return value of zap_vma_ptes() because there is nothing to do with this knowledge. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a182d19c557e..0541581c5d84 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1969,7 +1969,6 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { - int ret; struct vm_area_struct *vma; struct mlx5_ib_vma_private_data *vma_private, *n; struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); @@ -1978,9 +1977,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { vma = vma_private->vma; - ret = zap_vma_ptes(vma, vma->vm_start, - PAGE_SIZE); - WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); /* context going to be destroyed, should * not access ops any more. */ -- cgit v1.2.3 From 7fc8ff267d8a94964626b847a0440b6feef9dd68 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 15:14:06 +0300 Subject: RDMA/mlx4: Don't crash machine if zap_vma_ptes() fails The failure reported by zap_vma_ptes() means that wrong VMA pages were supplied, however it is impossible for this type of address. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1fea1497263b..722c825e3e71 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1186,7 +1186,6 @@ static const struct vm_operations_struct mlx4_ib_vm_ops = { static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { int i; - int ret = 0; struct vm_area_struct *vma; struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); @@ -1198,13 +1197,8 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) if (!vma) continue; - ret = zap_vma_ptes(context->hw_bar_info[i].vma, - context->hw_bar_info[i].vma->vm_start, - PAGE_SIZE); - if (ret) { - pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret); - BUG_ON(1); - } + zap_vma_ptes(context->hw_bar_info[i].vma, + context->hw_bar_info[i].vma->vm_start, PAGE_SIZE); context->hw_bar_info[i].vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); -- cgit v1.2.3 From 008fba465d7c010dc14c9d7fd57a7a743d50bf8e Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 1 Jun 2018 11:19:19 -0400 Subject: RDMA/hns_roce: Don't check return value of zap_vma_ptes() There is no need to check return value of zap_vma_ptes() because there is nothing to do with this knowledge. 
Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 08c795e11cdd..21b901cfa2d6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -475,13 +475,11 @@ static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_vma_data *vma_data, *n; struct vm_area_struct *vma; - int ret; mutex_lock(&context->vma_list_mutex); list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { vma = vma_data->vma; - ret = zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); - WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); vma->vm_ops = NULL; -- cgit v1.2.3 From 59082a327d0145c69b419a0f5bed96b13c5e9ed4 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 May 2018 16:43:35 +0300 Subject: IB/core: Support passing uhw for create_flow This is required when user-space drivers need to pass extra information regarding how to handle this flow steering specification. 
Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Boris Pismenny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 7 ++++++- drivers/infiniband/core/verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 6 +++++- drivers/infiniband/hw/mlx5/main.c | 7 ++++++- include/rdma/ib_verbs.h | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 13cb5e4deb86..c735c13e53b0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3513,11 +3513,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = -EINVAL; goto err_free; } - flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + + flow_id = qp->device->create_flow(qp, flow_attr, + IB_FLOW_DOMAIN_USER, uhw); + if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); goto err_free; } + atomic_inc(&qp->usecnt); + flow_id->qp = qp; flow_id->uobject = uobj; uobj->object = flow_id; uflow = container_of(uobj, typeof(*uflow), uobject); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 7eff3aeffe01..9a3e886c12fd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1982,7 +1982,7 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp, if (!qp->device->create_flow) return ERR_PTR(-EOPNOTSUPP); - flow_id = qp->device->create_flow(qp, flow_attr, domain); + flow_id = qp->device->create_flow(qp, flow_attr, domain, NULL); if (!IS_ERR(flow_id)) { atomic_inc(&qp->usecnt); flow_id->qp = qp; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5b70744f414a..5b88bdd1ecef 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1847,7 +1847,7 @@ static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, static struct ib_flow *mlx4_ib_create_flow(struct ib_qp 
*qp, struct ib_flow_attr *flow_attr, - int domain) + int domain, struct ib_udata *udata) { int err = 0, i = 0, j = 0; struct mlx4_ib_flow *mflow; @@ -1865,6 +1865,10 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); + if (udata && + udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index daa919e5a442..e94df85ddf08 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3245,7 +3245,8 @@ err: static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, - int domain) + int domain, + struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); @@ -3257,6 +3258,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, int err; int underlay_qpn; + if (udata && + udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3769a1cc99b0..ea97b91dd88c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2427,7 +2427,8 @@ struct ib_device { struct ib_flow * (*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, - int domain); + int domain, + struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); -- cgit v1.2.3 From b29e2a1309e38cd1afa598a54f3ccb4e4d2ee01c Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:38 +0300 Subject: IB/mlx5: Add counters create and destroy support This patch implements the device counters create and destroy APIs and 
introduces some internal management structures. Downstream patches in this series will add the functionality to support flow counters binding and reading. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 23 +++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 ++++++++++ 2 files changed, 33 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e94df85ddf08..81471013b776 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5006,6 +5006,27 @@ static void depopulate_specs_root(struct mlx5_ib_dev *dev) uverbs_free_spec_tree(dev->ib_dev.specs_root); } +static int mlx5_ib_destroy_counters(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + kfree(mcounters); + + return 0; +} + +static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters; + + mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL); + if (!mcounters) + return ERR_PTR(-ENOMEM); + + return &mcounters->ibcntrs; +} + void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); @@ -5249,6 +5270,8 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp; dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; + dev->ib_dev.create_counters = mlx5_ib_create_counters; + dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters; err = init_node_data(dev); if (err) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 49a1aa0ff429..fd27ec1aed08 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -813,6 +813,16 @@
struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +struct mlx5_ib_mcounters { + struct ib_counters ibcntrs; +}; + +static inline struct mlx5_ib_mcounters * +to_mcounters(struct ib_counters *ibcntrs) +{ + return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); +} + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; -- cgit v1.2.3 From 3b3233fbf02ee4c5de4d635ca6c4f2566d9716df Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:39 +0300 Subject: IB/mlx5: Add flow counters binding support Associates a counters object with a flow when IB_FLOW_SPEC_ACTION_COUNT is part of the flow specifications. The user-space placement of the counters, given as (index, description) pairs that hold each counter's location and description, is passed as private data of the counters flow specification. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 222 +++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 15 +++ include/linux/mlx5/fs.h | 1 + include/uapi/rdma/mlx5-abi.h | 24 ++++ 4 files changed, 249 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81471013b776..c52841bad4e7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2449,6 +2449,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) #define LAST_TUNNEL_FIELD tunnel_id #define LAST_FLOW_TAG_FIELD tag_id #define LAST_DROP_FIELD size +#define LAST_COUNTERS_FIELD counters /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ @@ -2721,6 +2722,18 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, if (ret) return ret; break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count, + LAST_COUNTERS_FIELD)) + return -EOPNOTSUPP; + + /* for
now support only one counters spec per flow */ + if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + return -EINVAL; + + action->counters = ib_spec->flow_count.counters; + action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + break; default: return -EINVAL; } @@ -2868,6 +2881,17 @@ static void put_flow_table(struct mlx5_ib_dev *dev, } } +static void counters_clear_description(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + mutex_lock(&mcounters->mcntrs_mutex); + kfree(mcounters->counters_data); + mcounters->counters_data = NULL; + mcounters->cntrs_max_index = 0; + mutex_unlock(&mcounters->mcntrs_mutex); +} + static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); @@ -2887,8 +2911,11 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) mlx5_del_flow_rules(handler->rule); put_flow_table(dev, handler->prio, true); - mutex_unlock(&dev->flow_db->lock); + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); + mutex_unlock(&dev->flow_db->lock); kfree(handler); return 0; @@ -3008,21 +3035,127 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev, } } +static int counters_set_description(struct ib_counters *counters, + enum mlx5_ib_counters_type counters_type, + struct mlx5_ib_flow_counters_desc *desc_data, + u32 ncounters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + u32 cntrs_max_index = 0; + int i; + + if (counters_type != MLX5_IB_COUNTERS_FLOW) + return -EINVAL; + + /* init the fields for the object */ + mcounters->type = counters_type; + mcounters->ncounters = ncounters; + /* each counter entry have both description and index pair */ + for (i = 0; i < ncounters; i++) { + if (desc_data[i].description > IB_COUNTER_BYTES) + return -EINVAL; + + if (cntrs_max_index <= desc_data[i].index) + cntrs_max_index = desc_data[i].index + 1; + } + + 
mutex_lock(&mcounters->mcntrs_mutex); + mcounters->counters_data = desc_data; + mcounters->cntrs_max_index = cntrs_max_index; + mutex_unlock(&mcounters->mcntrs_mutex); + + return 0; +} + +#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2)) +static int flow_counters_set_data(struct ib_counters *ibcounters, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters); + struct mlx5_ib_flow_counters_data *cntrs_data = NULL; + struct mlx5_ib_flow_counters_desc *desc_data = NULL; + bool hw_hndl = false; + int ret = 0; + + if (ucmd && ucmd->ncounters_data != 0) { + cntrs_data = ucmd->data; + if (cntrs_data->ncounters > MAX_COUNTERS_NUM) + return -EINVAL; + + desc_data = kcalloc(cntrs_data->ncounters, + sizeof(*desc_data), + GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + if (copy_from_user(desc_data, + u64_to_user_ptr(cntrs_data->counters_data), + sizeof(*desc_data) * cntrs_data->ncounters)) { + ret = -EFAULT; + goto free; + } + } + + if (!mcounters->hw_cntrs_hndl) { + mcounters->hw_cntrs_hndl = mlx5_fc_create( + to_mdev(ibcounters->device)->mdev, false); + if (!mcounters->hw_cntrs_hndl) { + ret = -ENOMEM; + goto free; + } + hw_hndl = true; + } + + if (desc_data) { + /* counters already bound to at least one flow */ + if (mcounters->cntrs_max_index) { + ret = -EINVAL; + goto free_hndl; + } + + ret = counters_set_description(ibcounters, + MLX5_IB_COUNTERS_FLOW, + desc_data, + cntrs_data->ncounters); + if (ret) + goto free_hndl; + + } else if (!mcounters->cntrs_max_index) { + /* counters not bound yet, must have udata passed */ + ret = -EINVAL; + goto free_hndl; + } + + return 0; + +free_hndl: + if (hw_hndl) { + mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev, + mcounters->hw_cntrs_hndl); + mcounters->hw_cntrs_hndl = NULL; + } +free: + kfree(desc_data); + return ret; +} + static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr 
*flow_attr, struct mlx5_flow_destination *dst, - u32 underlay_qpn) + u32 underlay_qpn, + struct mlx5_ib_create_flow *ucmd) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_flow_spec *spec; - struct mlx5_flow_destination *rule_dst = dst; + struct mlx5_flow_destination dest_arr[2] = {}; + struct mlx5_flow_destination *rule_dst = dest_arr; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; int err = 0; - int dest_num = 1; + int dest_num = 0; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; if (!is_valid_attr(dev->mdev, flow_attr)) @@ -3036,6 +3169,10 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, } INIT_LIST_HEAD(&handler->list); + if (dst) { + memcpy(&dest_arr[0], dst, sizeof(*dst)); + dest_num++; + } for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(dev->mdev, spec->match_criteria, @@ -3070,15 +3207,30 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, goto free; } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = flow_counters_set_data(flow_act.counters, ucmd); + if (err) + goto free; + + handler->ibcounters = flow_act.counters; + dest_arr[dest_num].type = + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest_arr[dest_num].counter = + to_mcounters(flow_act.counters)->hw_cntrs_hndl; + dest_num++; + } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { - rule_dst = NULL; - dest_num = 0; + if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) { + rule_dst = NULL; + dest_num = 0; + } } else { if (is_egress) flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; else flow_act.action |= - dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + dest_num ? 
MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; } @@ -3104,8 +3256,12 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, ft_prio->flow_table = ft; free: - if (err) + if (err && handler) { + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); kfree(handler); + } kvfree(spec); return err ? ERR_PTR(err) : handler; } @@ -3115,7 +3271,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { - return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); } static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, @@ -3255,12 +3411,43 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; + size_t min_ucmd_sz, required_ucmd_sz; int err; int underlay_qpn; - if (udata && - udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) - return ERR_PTR(-EOPNOTSUPP); + if (udata && udata->inlen) { + min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) + + sizeof(ucmd_hdr.reserved); + if (udata->inlen < min_ucmd_sz) + return ERR_PTR(-EOPNOTSUPP); + + err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); + if (err) + return ERR_PTR(err); + + /* currently supports only one counters data */ + if (ucmd_hdr.ncounters_data > 1) + return ERR_PTR(-EINVAL); + + required_ucmd_sz = min_ucmd_sz + + sizeof(struct mlx5_ib_flow_counters_data) * + ucmd_hdr.ncounters_data; + if (udata->inlen > required_ucmd_sz && + !ib_is_udata_cleared(udata, required_ucmd_sz, + udata->inlen - required_ucmd_sz)) + return ERR_PTR(-EOPNOTSUPP); + + ucmd = kzalloc(required_ucmd_sz, 
GFP_KERNEL); + if (!ucmd) + return ERR_PTR(-ENOMEM); + + err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); + if (err) { + kfree(ucmd); + return ERR_PTR(err); + } + } if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); @@ -3315,7 +3502,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? mqp->underlay_qpn : 0; handler = _create_flow_rule(dev, ft_prio, flow_attr, - dst, underlay_qpn); + dst, underlay_qpn, ucmd); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { @@ -3336,6 +3523,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, mutex_unlock(&dev->flow_db->lock); kfree(dst); + kfree(ucmd); return &handler->ibflow; @@ -3346,6 +3534,7 @@ destroy_ft: unlock: mutex_unlock(&dev->flow_db->lock); kfree(dst); + kfree(ucmd); kfree(handler); return ERR_PTR(err); } @@ -5010,6 +5199,11 @@ static int mlx5_ib_destroy_counters(struct ib_counters *counters) { struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + counters_clear_description(counters); + if (mcounters->hw_cntrs_hndl) + mlx5_fc_destroy(to_mdev(counters->device)->mdev, + mcounters->hw_cntrs_hndl); + kfree(mcounters); return 0; @@ -5024,6 +5218,8 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, if (!mcounters) return ERR_PTR(-ENOMEM); + mutex_init(&mcounters->mcntrs_mutex); + return &mcounters->ibcntrs; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index fd27ec1aed08..155bca627222 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -175,6 +175,7 @@ struct mlx5_ib_flow_handler { struct ib_flow ibflow; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_handle *rule; + struct ib_counters *ibcounters; }; struct mlx5_ib_flow_db { @@ -813,8 +814,22 @@ struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +enum 
mlx5_ib_counters_type { + MLX5_IB_COUNTERS_FLOW, +}; + struct mlx5_ib_mcounters { struct ib_counters ibcntrs; + enum mlx5_ib_counters_type type; + void *hw_cntrs_hndl; + /* max index set as part of create_flow */ + u32 cntrs_max_index; + /* number of counters data entries (<description,index> pair) */ + u32 ncounters; + /* counters data array for descriptions and indexes */ + struct mlx5_ib_flow_counters_desc *counters_data; + /* protects access to mcounters internal data */ + struct mutex mcntrs_mutex; }; static inline struct mlx5_ib_mcounters * diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 3b4c3298061c..757b4a30281e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -160,6 +160,7 @@ struct mlx5_flow_act { u32 modify_id; uintptr_t esp_id; struct mlx5_fs_vlan vlan; + struct ib_counters *counters; }; #define MLX5_DECLARE_FLOW_ACT(name) \ diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index cb4a02c4a1ce..ab71e939eb78 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -36,6 +36,7 @@ #include #include /* For ETH_ALEN.
*/ +#include <rdma/ib_user_ioctl_verbs.h> enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, @@ -441,4 +442,27 @@ enum { enum { MLX5_IB_CLOCK_INFO_V1 = 0, }; + +struct mlx5_ib_flow_counters_desc { + __u32 description; + __u32 index; +}; + +struct mlx5_ib_flow_counters_data { + RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data); + __u32 ncounters; + __u32 reserved; +}; + +struct mlx5_ib_create_flow { + __u32 ncounters_data; + __u32 reserved; + /* + * Following are counters data based on ncounters_data, each + * entry in the data[] should match a corresponding counter object + * that was pointed by a counters spec upon the flow creation + */ + struct mlx5_ib_flow_counters_data data[]; +}; + #endif /* MLX5_ABI_USER_H */ -- cgit v1.2.3 From 5e95af5f7b60796ccd890a39c0ed9c5df3537952 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:40 +0300 Subject: IB/mlx5: Add flow counters read support Implements the flow counters read wrapper. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 15 +++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 13 ++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c52841bad4e7..59e9d10e54b7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3035,6 +3035,19 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev, } } +static int read_flow_counters(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr) +{ + struct mlx5_fc *fc = read_attr->hw_cntrs_hndl; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_fc_query(dev->mdev, fc, + &read_attr->out[IB_COUNTER_PACKETS], + &read_attr->out[IB_COUNTER_BYTES]); +} + +/* flow counters currently expose two counters packets and bytes */ +#define FLOW_COUNTERS_NUM 2 static int counters_set_description(struct
ib_counters *counters, enum mlx5_ib_counters_type counters_type, struct mlx5_ib_flow_counters_desc *desc_data, @@ -3049,6 +3062,8 @@ static int counters_set_description(struct ib_counters *counters, /* init the fields for the object */ mcounters->type = counters_type; + mcounters->read_counters = read_flow_counters; + mcounters->counters_num = FLOW_COUNTERS_NUM; mcounters->ncounters = ncounters; /* each counter entry have both description and index pair */ for (i = 0; i < ncounters; i++) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 155bca627222..d89c8fe626f6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -814,6 +814,12 @@ struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +struct mlx5_read_counters_attr { + struct mlx5_fc *hw_cntrs_hndl; + u64 *out; + u32 flags; +}; + enum mlx5_ib_counters_type { MLX5_IB_COUNTERS_FLOW, }; @@ -821,7 +827,12 @@ enum mlx5_ib_counters_type { struct mlx5_ib_mcounters { struct ib_counters ibcntrs; enum mlx5_ib_counters_type type; - void *hw_cntrs_hndl; + /* number of counters supported for this counters type */ + u32 counters_num; + struct mlx5_fc *hw_cntrs_hndl; + /* read function for this counters type */ + int (*read_counters)(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr); /* max index set as part of create_flow */ u32 cntrs_max_index; /* number of counters data entries (<description,index> pair) */ -- cgit v1.2.3 From 1a1e03dc15cfa94b7e878a32a979705df614d9c4 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:41 +0300 Subject: IB/mlx5: Add counters read support This patch implements the uverbs counters read API; it uses the read counters function specific to the given counters type to accomplish its task.
Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 59e9d10e54b7..7a563478d0b2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5210,6 +5210,48 @@ static void depopulate_specs_root(struct mlx5_ib_dev *dev) uverbs_free_spec_tree(dev->ib_dev.specs_root); } +static int mlx5_ib_read_counters(struct ib_counters *counters, + struct ib_counters_read_attr *read_attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + struct mlx5_read_counters_attr mread_attr = {}; + struct mlx5_ib_flow_counters_desc *desc; + int ret, i; + + mutex_lock(&mcounters->mcntrs_mutex); + if (mcounters->cntrs_max_index > read_attr->ncounters) { + ret = -EINVAL; + goto err_bound; + } + + mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64), + GFP_KERNEL); + if (!mread_attr.out) { + ret = -ENOMEM; + goto err_bound; + } + + mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl; + mread_attr.flags = read_attr->flags; + ret = mcounters->read_counters(counters->device, &mread_attr); + if (ret) + goto err_read; + + /* do the pass over the counters data array to assign according to the + * descriptions and indexing pairs + */ + desc = mcounters->counters_data; + for (i = 0; i < mcounters->ncounters; i++) + read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description]; + +err_read: + kfree(mread_attr.out); +err_bound: + mutex_unlock(&mcounters->mcntrs_mutex); + return ret; +} + static int mlx5_ib_destroy_counters(struct ib_counters *counters) { struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); @@ -5483,6 +5525,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) 
dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; dev->ib_dev.create_counters = mlx5_ib_create_counters; dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters; + dev->ib_dev.read_counters = mlx5_ib_read_counters; err = init_node_data(dev); if (err) -- cgit v1.2.3 From e4b1672ac0a54c7740cbc4ff39dfdc56182236cb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 May 2018 23:58:18 +0200 Subject: iw_cxgb4: add INFINIBAND_ADDR_TRANS dependency The newly added fill_res_ep_entry function fails to link if CONFIG_INFINIBAND_ADDR_TRANS is not set: drivers/infiniband/hw/cxgb4/restrack.o: In function `fill_res_ep_entry': restrack.c:(.text+0x3cc): undefined reference to `rdma_res_to_id' restrack.c:(.text+0x3d0): undefined reference to `rdma_iw_cm_id' This adds a Kconfig dependency for the driver. Fixes: 116aeb887371 ("iw_cxgb4: provide detailed provider-specific CM_ID information") Signed-off-by: Arnd Bergmann Acked-by: Greg Thelen Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig index 0a671a61fc92..e0522a5d5a06 100644 --- a/drivers/infiniband/hw/cxgb4/Kconfig +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_CXGB4 tristate "Chelsio T4/T5 RDMA Driver" depends on CHELSIO_T4 && INET + depends on INFINIBAND_ADDR_TRANS select CHELSIO_LIB select GENERIC_ALLOCATOR ---help--- -- cgit v1.2.3 From 8c61b24585c44e1de337e45858129abce9c3a008 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 3 Jun 2018 17:32:22 +0800 Subject: IB/hns: Use zeroing memory allocator instead of allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. 
Signed-off-by: YueHaibing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index a40ec939ece5..46f65f9f59d0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -197,7 +197,8 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, buf->npages = 1 << order; buf->page_shift = page_shift; /* MTT PA must be recorded in 4k alignment, t is 4k aligned */ - buf->direct.buf = dma_alloc_coherent(dev, size, &t, GFP_KERNEL); + buf->direct.buf = dma_zalloc_coherent(dev, + size, &t, GFP_KERNEL); if (!buf->direct.buf) return -ENOMEM; @@ -207,8 +208,6 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, --buf->page_shift; buf->npages *= 2; } - - memset(buf->direct.buf, 0, size); } else { buf->nbufs = (size + page_size - 1) / page_size; buf->npages = buf->nbufs; @@ -220,7 +219,7 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { - buf->page_list[i].buf = dma_alloc_coherent(dev, + buf->page_list[i].buf = dma_zalloc_coherent(dev, page_size, &t, GFP_KERNEL); @@ -228,7 +227,6 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, goto err_free; buf->page_list[i].map = t; - memset(buf->page_list[i].buf, 0, page_size); } } -- cgit v1.2.3 From 1bc0299d976e000ececc6acd76e33b4582646cb7 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 31 May 2018 11:30:09 -0700 Subject: IB/hfi1: Fix user context tail allocation for DMA_RTAIL The following code fails to allocate a buffer for the tail address that the hardware DMAs into when the user context DMA_RTAIL is set. 
if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, gfp_flags); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } So the rcvhdrtail_kvaddr would then be NULL. The mmap logic fails to check for a NULL rcvhdrtail_kvaddr. The fix is to test for both user and kernel DMA_RTAIL options during the allocation as well as testing for a NULL rcvhdrtail_kvaddr during the mmap processing. Additionally, all downstream testing of the capmask for DMA_RTAIL has been eliminated in favor of testing rcvhdrtail_kvaddr. Cc: <stable@vger.kernel.org> # 4.9.x Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 8 ++++---- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/init.c | 9 ++++----- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 68580cb2ae1e..f75080d63142 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6841,7 +6841,7 @@ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) } rcvmask = HFI1_RCVCTRL_CTXT_ENB; /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ - rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + rcvmask |= rcd->rcvhdrtail_kvaddr ?
HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; hfi1_rcvctrl(dd, rcvmask, rcd); hfi1_rcd_put(rcd); @@ -8367,7 +8367,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) u32 tail; int present; - if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) + if (!rcd->rcvhdrtail_kvaddr) present = (rcd->seq_cnt == rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))); else /* is RDMA rtail */ @@ -11843,7 +11843,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, /* reset the tail and hdr addresses, and sequence count */ write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, rcd->rcvhdrq_dma); - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + if (rcd->rcvhdrtail_kvaddr) write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, rcd->rcvhdrqtailaddr_dma); rcd->seq_cnt = 1; @@ -11923,7 +11923,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; - if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_dma) + if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { /* See comment on RcvCtxtCtrl.TailUpd above */ diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index c9d23c37a371..0fc4aa9455c3 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -505,7 +505,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EINVAL; goto done; } - if (flags & VM_WRITE) { + if ((flags & VM_WRITE) || !uctxt->rcvhdrtail_kvaddr) { ret = -EPERM; goto done; } diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 3feecf926322..4a478ee0a79b 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1853,7 +1853,6 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) u64 reg; if (!rcd->rcvhdrq) { - 
dma_addr_t dma_hdrqtail; gfp_t gfp_flags; /* @@ -1878,13 +1877,13 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) goto bail; } - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( - &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, - gfp_flags); + &dd->pcidev->dev, PAGE_SIZE, + &rcd->rcvhdrqtailaddr_dma, gfp_flags); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; - rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } rcd->rcvhdrq_size = amt; -- cgit v1.2.3 From f9458bc2c1303bcbd02645de0d59e4b0210c669a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 31 May 2018 11:30:17 -0700 Subject: IB/hfi1: Ensure VL index is within bounds Improve the safety of the code and ensure the array cannot be indexed out of bounds when picking the CPU for a given SDMA engine. Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Kaike Wan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/sdma.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 1f203309cf24..298e0e3fc0c9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -923,9 +923,10 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, cpumask_var_t mask, new_mask; unsigned long cpu; int ret, vl, sz; + struct sdma_rht_node *rht_node; vl = sdma_engine_get_vl(sde); - if (unlikely(vl < 0)) + if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map))) return -EINVAL; ret = zalloc_cpumask_var(&mask, GFP_KERNEL); @@ -953,19 +954,12 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, mutex_lock(&process_to_sde_mutex); for_each_cpu(cpu, mask) { - struct sdma_rht_node *rht_node; - /* Check if we have this already mapped */ if (cpumask_test_cpu(cpu, &sde->cpu_mask)) { cpumask_set_cpu(cpu, new_mask); continue; } - if (vl >= ARRAY_SIZE(rht_node->map)) { - ret = -EINVAL; - goto out; - } - rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, sdma_rht_params); if (!rht_node) { -- cgit v1.2.3 From 5465f11083629e99cb34767790316ea076f7502f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 4 Jun 2018 11:43:12 -0700 Subject: IB/hfi1: Remove unused variable The variable extended_psn was not used any more. Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index a3d192424344..d2bc77f75253 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_USER_SDMA_H #define _HFI1_USER_SDMA_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -122,8 +122,6 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ ##__VA_ARGS__) -extern uint extended_psn; - struct hfi1_user_sdma_pkt_q { u16 ctxt; u16 subctxt; -- cgit v1.2.3 From dc2b2a917c3427223188ac476afc915831b1244c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 4 Jun 2018 11:43:21 -0700 Subject: IB/hfi1: Add bypass register defines and replace blind constants These registers were not added in the 16B work. Add them and replace blind constants with the correct defines. Fixes: 72c07e2b671e ("IB/hfi1: Add support to receive 16B bypass packets") Reviewed-by: Don Hiatt Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- drivers/infiniband/hw/hfi1/chip_registers.h | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index f75080d63142..6deb101cdd43 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14640,7 +14640,9 @@ static void init_rxe(struct hfi1_devdata *dd) /* Have 16 bytes (4DW) of bypass header available in header queue */ val = read_csr(dd, RCV_BYPASS); - val |= (4ull << 16); + val &= ~RCV_BYPASS_HDR_SIZE_SMASK; + val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) << + RCV_BYPASS_HDR_SIZE_SHIFT); write_csr(dd, RCV_BYPASS, val); } diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index da598b5fe8f6..ee6dca5e2a2f 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -638,6 +638,12 @@ #define RCV_BTH_QP_KDETH_QP_MASK 0xFFull #define RCV_BTH_QP_KDETH_QP_SHIFT 16 #define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_BYPASS_HDR_SIZE_SHIFT 16 +#define RCV_BYPASS_HDR_SIZE_MASK 0x1Full +#define 
RCV_BYPASS_HDR_SIZE_SMASK 0x1F0000ull +#define RCV_BYPASS_BYPASS_CONTEXT_SHIFT 0 +#define RCV_BYPASS_BYPASS_CONTEXT_MASK 0xFFull +#define RCV_BYPASS_BYPASS_CONTEXT_SMASK 0xFFull #define RCV_CONTEXTS (RXE + 0x000000000010) #define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) #define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) -- cgit v1.2.3 From ed71e86a8d66ec018d16047cdd507f95c89e257b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 4 Jun 2018 11:43:54 -0700 Subject: IB/hfi1: Rename exp_lock to exp_mutex The mutex exp_lock in struct hfi1_ctxtdata is used to protect all Expected TID data of a user context. This patch renames it to exp_mutex to better reflect its identity and prepare for upcoming patches. Reviewed-by: Ashutosh Dixit Reviewed-by: Mike Marciniszyn Signed-off-by: Harish Chegondi Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/hfi.h | 4 ++-- drivers/infiniband/hw/hfi1/init.c | 2 +- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 5eb3bf0849c7..4ab8b5bfbed1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -245,8 +245,8 @@ struct hfi1_ctxtdata { struct exp_tid_set tid_used_list; struct exp_tid_set tid_full_list; - /* lock protecting all Expected TID data */ - struct mutex exp_lock; + /* lock protecting all Expected TID data of user contexts */ + struct mutex exp_mutex; /* per-context configuration flags */ unsigned long flags; /* per-context event flags for fileops/intr communication */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4a478ee0a79b..561ad66d0ab3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -368,7 +368,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, 
rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; - mutex_init(&rcd->exp_lock); + mutex_init(&rcd->exp_mutex); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 6a4c5142515a..dbe7d14a5c76 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -375,7 +375,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, * From this point on, we are going to be using shared (between master * and subcontexts) context resources. We need to take the lock. */ - mutex_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_mutex); /* * The first step is to program the RcvArray entries which are complete * groups. 
@@ -461,7 +461,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, } } unlock: - mutex_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_mutex); nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); @@ -517,7 +517,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, if (IS_ERR(tidinfo)) return PTR_ERR(tidinfo); - mutex_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_mutex); for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); if (ret) { @@ -530,7 +530,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, fd->tid_used -= tididx; spin_unlock(&fd->tid_lock); tinfo->tidcnt = tididx; - mutex_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_mutex); kfree(tidinfo); return ret; -- cgit v1.2.3 From d9a6ce68a0117fd13785d8316bf24f61bb6e2e72 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 4 Jun 2018 11:44:11 -0700 Subject: IB/hfi1: Fix comment on default hdr entry size The comment for the default header queue entry size is incorrect. Correct the comment and fix the resulting S_IRUGO warning that shows up in the widened patch context. Reviewed-by: Michael J. 
Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 561ad66d0ab3..f110842b91f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -113,8 +113,8 @@ module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); static uint hfi1_hdrq_entsize = 32; -module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); -MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); unsigned int user_credit_return_threshold = 33; /* default is 33% */ module_param(user_credit_return_threshold, uint, S_IRUGO); -- cgit v1.2.3 From c1191a19fecad92b73c25770a7f47174280ca564 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 5 Jun 2018 07:53:53 +0300 Subject: RDMA/mlx5: Update SPDX tags to show proper license Mellanox code is supposed to be OpenIB compliant code, so let's update SPDX tags to show it. 
Fixes: fc385b7ac480 ("IB/mlx5: Add basic regiser/unregister representors code") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ib_rep.c | 2 +- drivers/infiniband/hw/mlx5/ib_rep.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/hw') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 0e04fdddf670..35a0e04c38f2 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2018 Mellanox Technologies. All rights reserved. */ diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h index 046fd942fd46..2ba73636a2fb 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.h +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2018 Mellanox Technologies. All rights reserved. */ -- cgit v1.2.3