From f0697bf078368d765b9e9ceef1dac0d5eb69b4b6 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Mon, 11 Mar 2024 19:38:19 +0800 Subject: RDMA/erdma: Allocate doorbell records from dma pool Currently, the 8 byte doorbell record is allocated along with the queue buffer, which may result in waste of dma space when the queue buffer is page aligned. To address this issue, we introduce a dma pool named db_pool and allocate doorbell record from it. Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://lore.kernel.org/r/20240311113821.22482-2-boshiyu@alibaba-inc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 7 +- drivers/infiniband/hw/erdma/erdma_cmdq.c | 102 ++++++++++++++++++------------ drivers/infiniband/hw/erdma/erdma_eq.c | 55 ++++++++++------ drivers/infiniband/hw/erdma/erdma_main.c | 15 ++++- drivers/infiniband/hw/erdma/erdma_verbs.c | 85 ++++++++++++++----------- drivers/infiniband/hw/erdma/erdma_verbs.h | 4 ++ 6 files changed, 167 insertions(+), 101 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 5df401a30cb9..e116263a608f 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -34,6 +34,7 @@ struct erdma_eq { void __iomem *db; u64 *db_record; + dma_addr_t db_record_dma_addr; }; struct erdma_cmdq_sq { @@ -49,6 +50,7 @@ struct erdma_cmdq_sq { u16 wqebb_cnt; u64 *db_record; + dma_addr_t db_record_dma_addr; }; struct erdma_cmdq_cq { @@ -62,6 +64,7 @@ struct erdma_cmdq_cq { u32 cmdsn; u64 *db_record; + dma_addr_t db_record_dma_addr; atomic64_t armed_num; }; @@ -177,9 +180,6 @@ enum { ERDMA_RES_CNT = 2, }; -#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE -#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) - struct erdma_dev { struct ib_device ibdev; struct net_device *netdev; @@ -213,6 +213,7 @@ struct erdma_dev { atomic_t num_ctx; struct list_head cep_list; + struct dma_pool *db_pool; struct dma_pool *resp_pool; }; diff --git 
a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index a151a7bdd504..c2c666040949 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -89,20 +89,19 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_sq *sq = &cmdq->sq; - u32 buf_size; sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; - buf_size = sq->depth << SQEBB_SHIFT; - - sq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &sq->qbuf_dma_addr, GFP_KERNEL); + sq->qbuf = dma_alloc_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + &sq->qbuf_dma_addr, GFP_KERNEL); if (!sq->qbuf) return -ENOMEM; - sq->db_record = (u64 *)(sq->qbuf + buf_size); + sq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &sq->db_record_dma_addr); + if (!sq->db_record) + goto err_out; spin_lock_init(&sq->lock); @@ -112,29 +111,35 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) lower_32_bits(sq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, - sq->qbuf_dma_addr + buf_size); + sq->db_record_dma_addr); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, sq->depth << SQEBB_SHIFT, + sq->qbuf, sq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_cq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_cmdq_cq *cq = &cmdq->cq; - u32 buf_size; cq->depth = cmdq->sq.depth; - buf_size = cq->depth << CQE_SHIFT; - - cq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + cq->qbuf = dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + &cq->qbuf_dma_addr, + GFP_KERNEL | __GFP_ZERO); if (!cq->qbuf) return -ENOMEM; spin_lock_init(&cq->lock); - cq->db_record = (u64 *)(cq->qbuf + buf_size); + cq->db_record = 
dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &cq->db_record_dma_addr); + if (!cq->db_record) + goto err_out; atomic64_set(&cq->armed_num, 0); @@ -143,23 +148,26 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, lower_32_bits(cq->qbuf_dma_addr)); erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, - cq->qbuf_dma_addr + buf_size); + cq->db_record_dma_addr); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->qbuf, + cq->qbuf_dma_addr); + + return -ENOMEM; } static int erdma_cmdq_eq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_eq *eq = &cmdq->eq; - u32 buf_size; eq->depth = cmdq->max_outstandings; - buf_size = eq->depth << EQE_SHIFT; - - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + &eq->qbuf_dma_addr, + GFP_KERNEL | __GFP_ZERO); if (!eq->qbuf) return -ENOMEM; @@ -167,7 +175,10 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) atomic64_set(&eq->event_num, 0); eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); + eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &eq->db_record_dma_addr); + if (!eq->db_record) + goto err_out; erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); @@ -175,9 +186,15 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + eq->db_record_dma_addr); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, + eq->qbuf_dma_addr); + + return -ENOMEM; } int erdma_cmdq_init(struct erdma_dev *dev) @@ -211,17 +228,19 @@ int erdma_cmdq_init(struct 
erdma_dev *dev) return 0; err_destroy_cq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->cq.db_record, + cmdq->cq.db_record_dma_addr); + err_destroy_sq: - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cmdq->sq.db_record, + cmdq->sq.db_record_dma_addr); + return err; } @@ -238,18 +257,23 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); - dma_free_coherent(&dev->pdev->dev, - (cmdq->eq.depth << EQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + dma_free_coherent(&dev->pdev->dev, cmdq->eq.depth << EQE_SHIFT, cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->sq.depth << SQEBB_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + + dma_pool_free(dev->db_pool, cmdq->eq.db_record, + cmdq->eq.db_record_dma_addr); + + dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); - dma_free_coherent(&dev->pdev->dev, - (cmdq->cq.depth << CQE_SHIFT) + - ERDMA_EXTRA_BUFFER_SIZE, + + dma_pool_free(dev->db_pool, cmdq->sq.db_record, + cmdq->sq.db_record_dma_addr); + + dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + + dma_pool_free(dev->db_pool, cmdq->cq.db_record, + cmdq->cq.db_record_dma_addr); } static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index ea47cb21fdb8..809c33628f38 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -83,14 +83,12 @@ void erdma_aeq_event_handler(struct erdma_dev 
*dev) int erdma_aeq_init(struct erdma_dev *dev) { struct erdma_eq *eq = &dev->aeq; - u32 buf_size; eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - buf_size = eq->depth << EQE_SHIFT; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + &eq->qbuf_dma_addr, + GFP_KERNEL | __GFP_ZERO); if (!eq->qbuf) return -ENOMEM; @@ -99,7 +97,10 @@ int erdma_aeq_init(struct erdma_dev *dev) atomic64_set(&eq->notify_num, 0); eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; - eq->db_record = (u64 *)(eq->qbuf + buf_size); + eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &eq->db_record_dma_addr); + if (!eq->db_record) + goto err_out; erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); @@ -107,18 +108,25 @@ int erdma_aeq_init(struct erdma_dev *dev) lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, - eq->qbuf_dma_addr + buf_size); + eq->db_record_dma_addr); return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, + eq->qbuf_dma_addr); + + return -ENOMEM; } void erdma_aeq_destroy(struct erdma_dev *dev) { struct erdma_eq *eq = &dev->aeq; - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); + + dma_pool_free(dev->db_pool, eq->db_record, eq->db_record_dma_addr); } void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) @@ -209,7 +217,6 @@ static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) { struct erdma_cmdq_create_eq_req req; - dma_addr_t db_info_dma_addr; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_CREATE_EQ); @@ -219,9 +226,8 @@ static 
int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) req.qtype = ERDMA_EQ_TYPE_CEQ; /* Vector index is the same as EQN. */ req.vector_idx = eqn; - db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); - req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); - req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); + req.db_dma_addr_l = lower_32_bits(eq->db_record_dma_addr); + req.db_dma_addr_h = upper_32_bits(eq->db_record_dma_addr); return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); } @@ -229,12 +235,12 @@ static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; int ret; - eq->qbuf = - dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), - &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + &eq->qbuf_dma_addr, + GFP_KERNEL | __GFP_ZERO); if (!eq->qbuf) return -ENOMEM; @@ -242,10 +248,17 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) atomic64_set(&eq->event_num, 0); atomic64_set(&eq->notify_num, 0); - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - eq->db_record = (u64 *)(eq->qbuf + buf_size); + + eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &eq->db_record_dma_addr); + if (!eq->db_record) { + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + eq->qbuf, eq->qbuf_dma_addr); + return -ENOMEM; + } + eq->ci = 0; dev->ceqs[ceqn].dev = dev; @@ -259,7 +272,6 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) { struct erdma_eq *eq = &dev->ceqs[ceqn].eq; - u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; struct erdma_cmdq_destroy_eq_req req; 
int err; @@ -276,8 +288,9 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) if (err) return; - dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, + dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); + dma_pool_free(dev->db_pool, eq->db_record, eq->db_record_dma_addr); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 472939172f0c..7080f8a71ec4 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -178,16 +178,26 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) if (!dev->resp_pool) return -ENOMEM; + dev->db_pool = dma_pool_create("erdma_db_pool", &pdev->dev, + ERDMA_DB_SIZE, ERDMA_DB_SIZE, 0); + if (!dev->db_pool) { + ret = -ENOMEM; + goto destroy_resp_pool; + } + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ERDMA_PCI_WIDTH)); if (ret) - goto destroy_pool; + goto destroy_db_pool; dma_set_max_seg_size(&pdev->dev, UINT_MAX); return 0; -destroy_pool: +destroy_db_pool: + dma_pool_destroy(dev->db_pool); + +destroy_resp_pool: dma_pool_destroy(dev->resp_pool); return ret; @@ -195,6 +205,7 @@ destroy_pool: static void erdma_device_uninit(struct erdma_dev *dev) { + dma_pool_destroy(dev->db_pool); dma_pool_destroy(dev->resp_pool); } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 23dfc01603f8..b78ddca1483e 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -76,10 +76,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; - req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + - (qp->attrs.sq_size << SQEBB_SHIFT); - req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + - 
(qp->attrs.rq_size << RQE_SHIFT); + req.sq_db_info_dma_addr = qp->kern_qp.sq_db_info_dma_addr; + req.rq_db_info_dma_addr = qp->kern_qp.rq_db_info_dma_addr; } else { user_qp = &qp->user_qp; req.sq_cqn_mtt_cfg = FIELD_PREP( @@ -209,8 +207,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) ERDMA_MR_MTT_0LEVEL); req.first_page_offset = 0; - req.cq_db_info_addr = - cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + req.cq_db_info_addr = cq->kern_cq.db_record_dma_addr; } else { mem = &cq->user_cq.qbuf_mem; req.cfg0 |= @@ -482,16 +479,24 @@ static void free_kernel_qp(struct erdma_qp *qp) vfree(qp->kern_qp.rwr_tbl); if (qp->kern_qp.sq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.sq_size << SQEBB_SHIFT, + qp->kern_qp.sq_buf, + qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.sq_db_info) + dma_pool_free(dev->db_pool, qp->kern_qp.sq_db_info, + qp->kern_qp.sq_db_info_dma_addr); if (qp->kern_qp.rq_buf) - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + qp->attrs.rq_size << RQE_SHIFT, + qp->kern_qp.rq_buf, + qp->kern_qp.rq_buf_dma_addr); + + if (qp->kern_qp.rq_db_info) + dma_pool_free(dev->db_pool, qp->kern_qp.rq_db_info, + qp->kern_qp.rq_db_info_dma_addr); } static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, @@ -516,20 +521,27 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, if (!kqp->swr_tbl || !kqp->rwr_tbl) goto err_out; - size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + size = qp->attrs.sq_size << SQEBB_SHIFT; kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->sq_buf_dma_addr, GFP_KERNEL); if (!kqp->sq_buf) goto err_out; - size = (qp->attrs.rq_size << RQE_SHIFT) + 
ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_db_info = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &kqp->sq_db_info_dma_addr); + if (!kqp->sq_db_info) + goto err_out; + + size = qp->attrs.rq_size << RQE_SHIFT; kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, &kqp->rq_buf_dma_addr, GFP_KERNEL); if (!kqp->rq_buf) goto err_out; - kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); - kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + kqp->rq_db_info = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &kqp->rq_db_info_dma_addr); + if (!kqp->rq_db_info) + goto err_out; return 0; @@ -1237,9 +1249,10 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) return err; if (rdma_is_kernel_res(&cq->ibcq.res)) { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.db_record, + cq->kern_cq.db_record_dma_addr); } else { erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1279,16 +1292,7 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) wait_for_completion(&qp->safe_free); if (rdma_is_kernel_res(&qp->ibqp.res)) { - vfree(qp->kern_qp.swr_tbl); - vfree(qp->kern_qp.rwr_tbl); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), - qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); - dma_free_coherent( - &dev->pdev->dev, - WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), - qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + free_kernel_qp(qp); } else { put_mtt_entries(dev, &qp->user_qp.sq_mem); put_mtt_entries(dev, &qp->user_qp.rq_mem); @@ -1600,19 +1604,27 @@ static int erdma_init_kernel_cq(struct erdma_cq *cq) struct erdma_dev *dev = to_edev(cq->ibcq.device); cq->kern_cq.qbuf = - dma_alloc_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + 
dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); if (!cq->kern_cq.qbuf) return -ENOMEM; - cq->kern_cq.db_record = - (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + cq->kern_cq.db_record = dma_pool_zalloc( + dev->db_pool, GFP_KERNEL, &cq->kern_cq.db_record_dma_addr); + if (!cq->kern_cq.db_record) + goto err_out; + spin_lock_init(&cq->kern_cq.lock); /* use default cqdb addr */ cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; return 0; + +err_out: + dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + + return -ENOMEM; } int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, @@ -1676,9 +1688,10 @@ err_free_res: erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); } else { - dma_free_coherent(&dev->pdev->dev, - WARPPED_BUFSIZE(depth << CQE_SHIFT), + dma_free_coherent(&dev->pdev->dev, depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.db_record, + cq->kern_cq.db_record_dma_addr); } err_out_xa: diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index db6018529ccc..b02ffdc8c811 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -170,6 +170,9 @@ struct erdma_kqp { void *sq_db_info; void *rq_db_info; + dma_addr_t sq_db_info_dma_addr; + dma_addr_t rq_db_info_dma_addr; + u8 sig_all; }; @@ -247,6 +250,7 @@ struct erdma_kcq_info { spinlock_t lock; u8 __iomem *db; u64 *db_record; + dma_addr_t db_record_dma_addr; }; struct erdma_ucq_info { -- cgit v1.2.3 From fdb09ed15f272adb7c0403f7a6f9b4db3959284d Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Mon, 11 Mar 2024 19:38:20 +0800 Subject: RDMA/erdma: Unify the names related to doorbell records There exist two different names for the doorbell records: db_info and db_record. 
We use dbrec for cpu address of the doorbell record and dbrec_dma for dma address of the doorbell record uniformly. Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://lore.kernel.org/r/20240311113821.22482-3-boshiyu@alibaba-inc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 12 +++--- drivers/infiniband/hw/erdma/erdma_cmdq.c | 43 ++++++++------------- drivers/infiniband/hw/erdma/erdma_cq.c | 2 +- drivers/infiniband/hw/erdma/erdma_eq.c | 23 +++++------ drivers/infiniband/hw/erdma/erdma_hw.h | 6 +-- drivers/infiniband/hw/erdma/erdma_qp.c | 4 +- drivers/infiniband/hw/erdma/erdma_verbs.c | 64 +++++++++++++++---------------- drivers/infiniband/hw/erdma/erdma_verbs.h | 18 ++++----- 8 files changed, 79 insertions(+), 93 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index e116263a608f..c8bd698e21b0 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -33,8 +33,8 @@ struct erdma_eq { atomic64_t notify_num; void __iomem *db; - u64 *db_record; - dma_addr_t db_record_dma_addr; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_sq { @@ -49,8 +49,8 @@ struct erdma_cmdq_sq { u16 wqebb_cnt; - u64 *db_record; - dma_addr_t db_record_dma_addr; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_cmdq_cq { @@ -63,8 +63,8 @@ struct erdma_cmdq_cq { u32 ci; u32 cmdsn; - u64 *db_record; - dma_addr_t db_record_dma_addr; + u64 *dbrec; + dma_addr_t dbrec_dma; atomic64_t armed_num; }; diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index c2c666040949..0ac2683cfccf 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -14,7 +14,7 @@ static void arm_cmdq_cq(struct erdma_cmdq *cmdq) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); - *cmdq->cq.db_record = db_data; + *cmdq->cq.dbrec = db_data; writeq(db_data,
dev->func_bar + ERDMA_CMDQ_CQDB_REG); atomic64_inc(&cmdq->cq.armed_num); @@ -25,7 +25,7 @@ static void kick_cmdq_db(struct erdma_cmdq *cmdq) struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); - *cmdq->sq.db_record = db_data; + *cmdq->sq.dbrec = db_data; writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); } @@ -98,9 +98,8 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) if (!sq->qbuf) return -ENOMEM; - sq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &sq->db_record_dma_addr); - if (!sq->db_record) + sq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &sq->dbrec_dma); + if (!sq->dbrec) goto err_out; spin_lock_init(&sq->lock); @@ -110,8 +109,7 @@ static int erdma_cmdq_sq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, lower_32_bits(sq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, - sq->db_record_dma_addr); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, sq->dbrec_dma); return 0; @@ -136,9 +134,8 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) spin_lock_init(&cq->lock); - cq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &cq->db_record_dma_addr); - if (!cq->db_record) + cq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &cq->dbrec_dma); + if (!cq->dbrec) goto err_out; atomic64_set(&cq->armed_num, 0); @@ -147,8 +144,7 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) upper_32_bits(cq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, lower_32_bits(cq->qbuf_dma_addr)); - erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, - cq->db_record_dma_addr); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, cq->dbrec_dma); return 0; @@ -175,9 +171,8 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) atomic64_set(&eq->event_num, 0); eq->db = dev->func_bar + 
ERDMA_REGS_CEQ_DB_BASE_REG; - eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &eq->db_record_dma_addr); - if (!eq->db_record) + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) goto err_out; erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, @@ -185,8 +180,7 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, - eq->db_record_dma_addr); + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; @@ -231,15 +225,13 @@ err_destroy_cq: dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cmdq->cq.db_record, - cmdq->cq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); err_destroy_sq: dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cmdq->sq.db_record, - cmdq->sq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); return err; } @@ -260,20 +252,17 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) dma_free_coherent(&dev->pdev->dev, cmdq->eq.depth << EQE_SHIFT, cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cmdq->eq.db_record, - cmdq->eq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cmdq->eq.dbrec, cmdq->eq.dbrec_dma); dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cmdq->sq.db_record, - cmdq->sq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cmdq->sq.dbrec, cmdq->sq.dbrec_dma); dma_free_coherent(&dev->pdev->dev, cmdq->cq.depth << CQE_SHIFT, cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cmdq->cq.db_record, - 
cmdq->cq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cmdq->cq.dbrec, cmdq->cq.dbrec_dma); } static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index c1cb5568eab2..70f89f0162aa 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -26,7 +26,7 @@ static void notify_cq(struct erdma_cq *cq, u8 solcitied) FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); - *cq->kern_cq.db_record = db_data; + *cq->kern_cq.dbrec = db_data; writeq(db_data, cq->kern_cq.db); } diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index 809c33628f38..0a4746e6d05c 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -13,7 +13,7 @@ void notify_eq(struct erdma_eq *eq) u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); - *eq->db_record = db_data; + *eq->dbrec = db_data; writeq(db_data, eq->db); atomic64_inc(&eq->notify_num); @@ -97,9 +97,8 @@ int erdma_aeq_init(struct erdma_dev *dev) atomic64_set(&eq->notify_num, 0); eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; - eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &eq->db_record_dma_addr); - if (!eq->db_record) + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) goto err_out; erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, @@ -107,8 +106,7 @@ int erdma_aeq_init(struct erdma_dev *dev) erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, lower_32_bits(eq->qbuf_dma_addr)); erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, - eq->db_record_dma_addr); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; @@ -126,7 +124,7 @@ void erdma_aeq_destroy(struct erdma_dev *dev) 
dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); - dma_pool_free(dev->db_pool, eq->db_record, eq->db_record_dma_addr); + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); } void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) @@ -226,8 +224,8 @@ static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) req.qtype = ERDMA_EQ_TYPE_CEQ; /* Vector index is the same as EQN. */ req.vector_idx = eqn; - req.db_dma_addr_l = lower_32_bits(eq->db_record_dma_addr); - req.db_dma_addr_h = upper_32_bits(eq->db_record_dma_addr); + req.db_dma_addr_l = lower_32_bits(eq->dbrec_dma); + req.db_dma_addr_h = upper_32_bits(eq->dbrec_dma); return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); } @@ -251,9 +249,8 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - eq->db_record = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &eq->db_record_dma_addr); - if (!eq->db_record) { + eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); + if (!eq->dbrec) { dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); return -ENOMEM; @@ -290,7 +287,7 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); - dma_pool_free(dev->db_pool, eq->db_record, eq->db_record_dma_addr); + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 3212a1222760..05978f3b1475 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -240,7 +240,7 @@ struct erdma_cmdq_create_cq_req { u32 qbuf_addr_l; u32 qbuf_addr_h; u32 cfg1; - u64 cq_db_info_addr; + u64 cq_dbrec_dma; u32 first_page_offset; u32 cfg2; }; @@ 
-335,8 +335,8 @@ struct erdma_cmdq_create_qp_req { u64 rq_buf_addr; u32 sq_mtt_cfg; u32 rq_mtt_cfg; - u64 sq_db_info_dma_addr; - u64 rq_db_info_dma_addr; + u64 sq_dbrec_dma; + u64 rq_dbrec_dma; u64 sq_mtt_entry[3]; u64 rq_mtt_entry[3]; diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 6d0330badd68..4d1f9114cd97 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -492,7 +492,7 @@ static void kick_sq_db(struct erdma_qp *qp, u16 pi) u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); - *(u64 *)qp->kern_qp.sq_db_info = db_data; + *(u64 *)qp->kern_qp.sq_dbrec = db_data; writeq(db_data, qp->kern_qp.hw_sq_db); } @@ -557,7 +557,7 @@ static int erdma_post_recv_one(struct erdma_qp *qp, return -EINVAL; } - *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + *(u64 *)qp->kern_qp.rq_dbrec = *(u64 *)rqe; writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index b78ddca1483e..40c9b6e46b82 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -76,8 +76,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; - req.sq_db_info_dma_addr = qp->kern_qp.sq_db_info_dma_addr; - req.rq_db_info_dma_addr = qp->kern_qp.rq_db_info_dma_addr; + req.sq_dbrec_dma = qp->kern_qp.sq_dbrec_dma; + req.rq_dbrec_dma = qp->kern_qp.rq_dbrec_dma; } else { user_qp = &qp->user_qp; req.sq_cqn_mtt_cfg = FIELD_PREP( @@ -105,8 +105,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg, &req.rq_buf_addr, req.rq_mtt_entry); - req.sq_db_info_dma_addr = 
user_qp->sq_db_info_dma_addr; - req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + req.sq_dbrec_dma = user_qp->sq_dbrec_dma; + req.rq_dbrec_dma = user_qp->rq_dbrec_dma; if (uctx->ext_db.enable) { req.sq_cqn_mtt_cfg |= @@ -207,7 +207,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) ERDMA_MR_MTT_0LEVEL); req.first_page_offset = 0; - req.cq_db_info_addr = cq->kern_cq.db_record_dma_addr; + req.cq_dbrec_dma = cq->kern_cq.dbrec_dma; } else { mem = &cq->user_cq.qbuf_mem; req.cfg0 |= @@ -230,7 +230,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) mem->mtt_nents); req.first_page_offset = mem->page_offset; - req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + req.cq_dbrec_dma = cq->user_cq.dbrec_dma; if (uctx->ext_db.enable) { req.cfg1 |= FIELD_PREP( @@ -484,9 +484,9 @@ static void free_kernel_qp(struct erdma_qp *qp) qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); - if (qp->kern_qp.sq_db_info) - dma_pool_free(dev->db_pool, qp->kern_qp.sq_db_info, - qp->kern_qp.sq_db_info_dma_addr); + if (qp->kern_qp.sq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.sq_dbrec, + qp->kern_qp.sq_dbrec_dma); if (qp->kern_qp.rq_buf) dma_free_coherent(&dev->pdev->dev, @@ -494,9 +494,9 @@ static void free_kernel_qp(struct erdma_qp *qp) qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); - if (qp->kern_qp.rq_db_info) - dma_pool_free(dev->db_pool, qp->kern_qp.rq_db_info, - qp->kern_qp.rq_db_info_dma_addr); + if (qp->kern_qp.rq_dbrec) + dma_pool_free(dev->db_pool, qp->kern_qp.rq_dbrec, + qp->kern_qp.rq_dbrec_dma); } static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, @@ -527,9 +527,9 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, if (!kqp->sq_buf) goto err_out; - kqp->sq_db_info = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &kqp->sq_db_info_dma_addr); - if (!kqp->sq_db_info) + kqp->sq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->sq_dbrec_dma); + if (!kqp->sq_dbrec) goto 
err_out; size = qp->attrs.rq_size << RQE_SHIFT; @@ -538,9 +538,9 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, if (!kqp->rq_buf) goto err_out; - kqp->rq_db_info = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, - &kqp->rq_db_info_dma_addr); - if (!kqp->rq_db_info) + kqp->rq_dbrec = + dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &kqp->rq_dbrec_dma); + if (!kqp->rq_dbrec) goto err_out; return 0; @@ -876,9 +876,9 @@ erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, } static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, - u64 va, u32 len, u64 db_info_va) + u64 va, u32 len, u64 dbrec_va) { - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; u32 rq_offset; int ret; @@ -901,14 +901,14 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, if (ret) goto put_sq_mtt; - ret = erdma_map_user_dbrecords(uctx, db_info_va, + ret = erdma_map_user_dbrecords(uctx, dbrec_va, &qp->user_qp.user_dbr_page, - &db_info_dma_addr); + &dbrec_dma); if (ret) goto put_rq_mtt; - qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; - qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + qp->user_qp.sq_dbrec_dma = dbrec_dma; + qp->user_qp.rq_dbrec_dma = dbrec_dma + ERDMA_DB_SIZE; return 0; @@ -1251,8 +1251,8 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) if (rdma_is_kernel_res(&cq->ibcq.res)) { dma_free_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cq->kern_cq.db_record, - cq->kern_cq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } else { erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1592,7 +1592,7 @@ static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, &cq->user_cq.user_dbr_page, - &cq->user_cq.db_info_dma_addr); + 
&cq->user_cq.dbrec_dma); if (ret) put_mtt_entries(dev, &cq->user_cq.qbuf_mem); @@ -1609,9 +1609,9 @@ static int erdma_init_kernel_cq(struct erdma_cq *cq) if (!cq->kern_cq.qbuf) return -ENOMEM; - cq->kern_cq.db_record = dma_pool_zalloc( - dev->db_pool, GFP_KERNEL, &cq->kern_cq.db_record_dma_addr); - if (!cq->kern_cq.db_record) + cq->kern_cq.dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, + &cq->kern_cq.dbrec_dma); + if (!cq->kern_cq.dbrec) goto err_out; spin_lock_init(&cq->kern_cq.lock); @@ -1690,8 +1690,8 @@ err_free_res: } else { dma_free_coherent(&dev->pdev->dev, depth << CQE_SHIFT, cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); - dma_pool_free(dev->db_pool, cq->kern_cq.db_record, - cq->kern_cq.db_record_dma_addr); + dma_pool_free(dev->db_pool, cq->kern_cq.dbrec, + cq->kern_cq.dbrec_dma); } err_out_xa: diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index b02ffdc8c811..4f02ba06b210 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -140,8 +140,8 @@ struct erdma_uqp { struct erdma_mem sq_mem; struct erdma_mem rq_mem; - dma_addr_t sq_db_info_dma_addr; - dma_addr_t rq_db_info_dma_addr; + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; struct erdma_user_dbrecords_page *user_dbr_page; @@ -167,11 +167,11 @@ struct erdma_kqp { void *rq_buf; dma_addr_t rq_buf_dma_addr; - void *sq_db_info; - void *rq_db_info; + void *sq_dbrec; + void *rq_dbrec; - dma_addr_t sq_db_info_dma_addr; - dma_addr_t rq_db_info_dma_addr; + dma_addr_t sq_dbrec_dma; + dma_addr_t rq_dbrec_dma; u8 sig_all; }; @@ -249,14 +249,14 @@ struct erdma_kcq_info { spinlock_t lock; u8 __iomem *db; - u64 *db_record; - dma_addr_t db_record_dma_addr; + u64 *dbrec; + dma_addr_t dbrec_dma; }; struct erdma_ucq_info { struct erdma_mem qbuf_mem; struct erdma_user_dbrecords_page *user_dbr_page; - dma_addr_t db_info_dma_addr; + dma_addr_t dbrec_dma; }; struct erdma_cq { -- cgit v1.2.3 From 
df0e16bab5c7f13d083484e0ab7488cc7ca510f1 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Mon, 11 Mar 2024 19:38:21 +0800 Subject: RDMA/erdma: Remove unnecessary __GFP_ZERO flag The dma_alloc_coherent() interface automatically zeroes the memory it returns. Thus, we do not need to specify the __GFP_ZERO flag explicitly when we call dma_alloc_coherent(). Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://lore.kernel.org/r/20240311113821.22482-4-boshiyu@alibaba-inc.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cmdq.c | 6 ++---- drivers/infiniband/hw/erdma/erdma_eq.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index 0ac2683cfccf..43ff40b5a09d 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -127,8 +127,7 @@ static int erdma_cmdq_cq_init(struct erdma_dev *dev) cq->depth = cmdq->sq.depth; cq->qbuf = dma_alloc_coherent(&dev->pdev->dev, cq->depth << CQE_SHIFT, - &cq->qbuf_dma_addr, - GFP_KERNEL | __GFP_ZERO); + &cq->qbuf_dma_addr, GFP_KERNEL); if (!cq->qbuf) return -ENOMEM; @@ -162,8 +161,7 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) eq->depth = cmdq->max_outstandings; eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, - GFP_KERNEL | __GFP_ZERO); + &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index 0a4746e6d05c..84ccdd8144c9 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -87,8 +87,7 @@ int erdma_aeq_init(struct erdma_dev *dev) eq->depth = ERDMA_DEFAULT_EQ_DEPTH; eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, - GFP_KERNEL | __GFP_ZERO); + &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; @@ -237,8 +236,7 @@
static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) eq->depth = ERDMA_DEFAULT_EQ_DEPTH; eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, - GFP_KERNEL | __GFP_ZERO); + &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; -- cgit v1.2.3 From ca537a34775c103f7b14d7bbd976403f1d1525d8 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Mon, 18 Mar 2024 17:23:20 +0800 Subject: RDMA/restrack: Fix potential invalid address access struct rdma_restrack_entry's kern_name was set to KBUILD_MODNAME in ib_create_cq(), so if the module exited but forgot to delete this rdma_restrack_entry, it would cause an invalid address access in rdma_restrack_clean() when printing the owner of this rdma_restrack_entry. This code is used to help find one forgotten PD release in one of the ULPs. But it is not needed anymore, so delete it. Signed-off-by: Wenchao Hao Link: https://lore.kernel.org/r/20240318092320.1215235-1-haowenchao2@huawei.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/restrack.c | 51 +------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 01a499a8b88d..438ed3588175 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -37,22 +37,6 @@ int rdma_restrack_init(struct ib_device *dev) return 0; } -static const char *type2str(enum rdma_restrack_type type) -{ - static const char * const names[RDMA_RESTRACK_MAX] = { - [RDMA_RESTRACK_PD] = "PD", - [RDMA_RESTRACK_CQ] = "CQ", - [RDMA_RESTRACK_QP] = "QP", - [RDMA_RESTRACK_CM_ID] = "CM_ID", - [RDMA_RESTRACK_MR] = "MR", - [RDMA_RESTRACK_CTX] = "CTX", - [RDMA_RESTRACK_COUNTER] = "COUNTER", - [RDMA_RESTRACK_SRQ] = "SRQ", - }; - - return names[type]; -}; - /** * rdma_restrack_clean() - clean resource tracking * @dev: IB device @@ -60,47 +44,14 @@ static const char *type2str(enum rdma_restrack_type type) void
rdma_restrack_clean(struct ib_device *dev) { struct rdma_restrack_root *rt = dev->res; - struct rdma_restrack_entry *e; - char buf[TASK_COMM_LEN]; - bool found = false; - const char *owner; int i; for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { struct xarray *xa = &dev->res[i].xa; - if (!xa_empty(xa)) { - unsigned long index; - - if (!found) { - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - } - xa_for_each(xa, index, e) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). - */ - get_task_comm(buf, e->task); - owner = buf; - } - - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? "Kernel" : - "User", - type2str(e->type), owner); - } - found = true; - } + WARN_ON(!xa_empty(xa)); xa_destroy(xa); } - if (found) - pr_err("restrack: %s", CUT_HERE); - kfree(rt); } -- cgit v1.2.3 From 46f5be7cd4bceb3a503c544b3dab7b75fe4bb96b Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 26 Mar 2024 13:08:05 -0700 Subject: RDMA/mana_ib: Introduce helpers to create and destroy mana queues Introduce helpers to work with mana ib queues (struct mana_ib_queue). A queue always consists of umem, gdma_region, and id. A queue can become a WQ or a CQ.
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1711483688-24358-2-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 43 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 10 +++++++++ 2 files changed, 53 insertions(+) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 71e33feee61b..4524c6b80748 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -237,6 +237,49 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); } +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue) +{ + struct ib_umem *umem; + int err; + + queue->umem = NULL; + queue->id = INVALID_QUEUE_ID; + queue->gdma_region = GDMA_INVALID_DMA_REGION; + + umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %d\n", err); + return err; + } + + err = mana_ib_create_zero_offset_dma_region(mdev, umem, &queue->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to create dma region, %d\n", err); + goto free_umem; + } + queue->umem = umem; + + ibdev_dbg(&mdev->ib_dev, + "create_dma_region ret %d gdma_region 0x%llx\n", + err, queue->gdma_region); + + return 0; +free_umem: + ib_umem_release(umem); + return err; +} + +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue) +{ + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. 
+ */ + mana_ib_gd_destroy_dma_region(mdev, queue->gdma_region); + ib_umem_release(queue->umem); +} + static int mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index f83390eebb7d..859fd3bfc764 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -45,6 +45,12 @@ struct mana_ib_adapter_caps { u32 max_inline_data_size; }; +struct mana_ib_queue { + struct ib_umem *umem; + u64 gdma_region; + u64 id; +}; + struct mana_ib_dev { struct ib_device ib_dev; struct gdma_dev *gdma_dev; @@ -169,6 +175,10 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, mana_handle_t gdma_region); +int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, + struct mana_ib_queue *queue); +void mana_ib_destroy_queue(struct mana_ib_dev *mdev, struct mana_ib_queue *queue); + struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); -- cgit v1.2.3 From 60a7ac0b8bec5df9764b7460ffee91fc981e8a31 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 26 Mar 2024 13:08:06 -0700 Subject: RDMA/mana_ib: Use struct mana_ib_queue for CQs Use struct mana_ib_queue and its helpers for CQs Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1711483688-24358-3-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 52 +++++++----------------------------- drivers/infiniband/hw/mana/mana_ib.h | 4 +-- drivers/infiniband/hw/mana/qp.c | 26 +++++++++--------- 3 files changed, 24 insertions(+), 58 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 4a71e678d09c..c9129218f1be 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -39,37 
+39,13 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, } cq->cqe = attr->cqe; - cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(cq->umem)) { - err = PTR_ERR(cq->umem); - ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", - err); - return err; - } - - err = mana_ib_create_zero_offset_dma_region(mdev, cq->umem, &cq->gdma_region); + err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, &cq->queue); if (err) { - ibdev_dbg(ibdev, - "Failed to create dma region for create cq, %d\n", - err); - goto err_release_umem; + ibdev_dbg(ibdev, "Failed to create queue for create cq, %d\n", err); + return err; } - ibdev_dbg(ibdev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, cq->gdma_region); - - /* - * The CQ ID is not known at this time. The ID is generated at create_qp - */ - cq->id = INVALID_QUEUE_ID; - return 0; - -err_release_umem: - ib_umem_release(cq->umem); - return err; } int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) @@ -78,24 +54,16 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) struct ib_device *ibdev = ibcq->device; struct mana_ib_dev *mdev; struct gdma_context *gc; - int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); gc = mdev_to_gc(mdev); - err = mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); - if (err) { - ibdev_dbg(ibdev, - "Failed to destroy dma region, %d\n", err); - return err; - } - - if (cq->id != INVALID_QUEUE_ID) { - kfree(gc->cq_table[cq->id]); - gc->cq_table[cq->id] = NULL; + if (cq->queue.id != INVALID_QUEUE_ID) { + kfree(gc->cq_table[cq->queue.id]); + gc->cq_table[cq->queue.id] = NULL; } - ib_umem_release(cq->umem); + mana_ib_destroy_queue(mdev, &cq->queue); return 0; } @@ -114,7 +82,7 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) struct gdma_queue *gdma_cq; /* Create CQ table entry */ - WARN_ON(gc->cq_table[cq->id]); + 
WARN_ON(gc->cq_table[cq->queue.id]); gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); if (!gdma_cq) return -ENOMEM; @@ -122,7 +90,7 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) gdma_cq->cq.context = cq; gdma_cq->type = GDMA_CQ; gdma_cq->cq.callback = mana_ib_cq_handler; - gdma_cq->id = cq->id; - gc->cq_table[cq->id] = gdma_cq; + gdma_cq->id = cq->queue.id; + gc->cq_table[cq->queue.id] = gdma_cq; return 0; } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 859fd3bfc764..6acb5c281c36 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -88,10 +88,8 @@ struct mana_ib_mr { struct mana_ib_cq { struct ib_cq ibcq; - struct ib_umem *umem; + struct mana_ib_queue queue; int cqe; - u64 gdma_region; - u64 id; u32 comp_vector; }; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 6e7627745c95..d7485ee6a685 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -197,7 +197,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, wq_spec.gdma_region = wq->gdma_region; wq_spec.queue_size = wq->wq_buf_size; - cq_spec.gdma_region = cq->gdma_region; + cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; eq = &mpc->ac->eqs[cq->comp_vector % gc->max_num_queues]; @@ -213,16 +213,16 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, /* The GDMA regions are now owned by the WQ object */ wq->gdma_region = GDMA_INVALID_DMA_REGION; - cq->gdma_region = GDMA_INVALID_DMA_REGION; + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; wq->id = wq_spec.queue_index; - cq->id = cq_spec.queue_index; + cq->queue.id = cq_spec.queue_index; ibdev_dbg(&mdev->ib_dev, "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", - ret, wq->rx_object, wq->id, cq->id); + ret, wq->rx_object, wq->id, cq->queue.id); - resp.entries[i].cqid 
= cq->id; + resp.entries[i].cqid = cq->queue.id; resp.entries[i].wqid = wq->id; mana_ind_table[i] = wq->rx_object; @@ -232,7 +232,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, if (ret) goto fail; - gdma_cq_allocated[i] = gc->cq_table[cq->id]; + gdma_cq_allocated[i] = gc->cq_table[cq->queue.id]; } resp.num_entries = i; @@ -264,7 +264,7 @@ fail: wq = container_of(ibwq, struct mana_ib_wq, ibwq); cq = container_of(ibcq, struct mana_ib_cq, ibcq); - gc->cq_table[cq->id] = NULL; + gc->cq_table[cq->queue.id] = NULL; kfree(gdma_cq_allocated[i]); mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); @@ -374,7 +374,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, wq_spec.gdma_region = qp->sq_gdma_region; wq_spec.queue_size = ucmd.sq_buf_size; - cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.gdma_region = send_cq->queue.gdma_region; cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; eq_vec = send_cq->comp_vector % gc->max_num_queues; @@ -392,10 +392,10 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, /* The GDMA regions are now owned by the WQ object */ qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; - send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; qp->sq_id = wq_spec.queue_index; - send_cq->id = cq_spec.queue_index; + send_cq->queue.id = cq_spec.queue_index; /* Create CQ table entry */ err = mana_ib_install_cq_cb(mdev, send_cq); @@ -404,10 +404,10 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, - qp->tx_object, qp->sq_id, send_cq->id); + qp->tx_object, qp->sq_id, send_cq->queue.id); resp.sqid = qp->sq_id; - resp.cqid = send_cq->id; + resp.cqid = send_cq->queue.id; resp.tx_vp_offset = pd->tx_vp_offset; err = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -422,7 +422,7 @@ static int 
mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, err_release_gdma_cq: kfree(gdma_cq); - gc->cq_table[send_cq->id] = NULL; + gc->cq_table[send_cq->queue.id] = NULL; err_destroy_wq_obj: mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); -- cgit v1.2.3 From 688bac28e3dc9eb795ae8ea5aa40cb637e289faa Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 26 Mar 2024 13:08:07 -0700 Subject: RDMA/mana_ib: Use struct mana_ib_queue for WQs Use struct mana_ib_queue and its helpers for WQs Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1711483688-24358-4-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mana_ib.h | 4 +--- drivers/infiniband/hw/mana/qp.c | 10 +++++----- drivers/infiniband/hw/mana/wq.c | 31 ++++--------------------------- 3 files changed, 10 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 6acb5c281c36..a8953ee808d9 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -59,11 +59,9 @@ struct mana_ib_dev { struct mana_ib_wq { struct ib_wq ibwq; - struct ib_umem *umem; + struct mana_ib_queue queue; int wqe; u32 wq_buf_size; - u64 gdma_region; - u64 id; mana_handle_t rx_object; }; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index d7485ee6a685..f606caa75839 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -194,7 +194,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ibcq = ibwq->cq; cq = container_of(ibcq, struct mana_ib_cq, ibcq); - wq_spec.gdma_region = wq->gdma_region; + wq_spec.gdma_region = wq->queue.gdma_region; wq_spec.queue_size = wq->wq_buf_size; cq_spec.gdma_region = cq->queue.gdma_region; @@ -212,18 +212,18 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, } /* The GDMA regions are now owned by the WQ 
object */ - wq->gdma_region = GDMA_INVALID_DMA_REGION; + wq->queue.gdma_region = GDMA_INVALID_DMA_REGION; cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - wq->id = wq_spec.queue_index; + wq->queue.id = wq_spec.queue_index; cq->queue.id = cq_spec.queue_index; ibdev_dbg(&mdev->ib_dev, "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", - ret, wq->rx_object, wq->id, cq->queue.id); + ret, wq->rx_object, wq->queue.id, cq->queue.id); resp.entries[i].cqid = cq->queue.id; - resp.entries[i].wqid = wq->id; + resp.entries[i].wqid = wq->queue.id; mana_ind_table[i] = wq->rx_object; diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c index 7c9c69962573..f959f4b9244f 100644 --- a/drivers/infiniband/hw/mana/wq.c +++ b/drivers/infiniband/hw/mana/wq.c @@ -13,7 +13,6 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, container_of(pd->device, struct mana_ib_dev, ib_dev); struct mana_ib_create_wq ucmd = {}; struct mana_ib_wq *wq; - struct ib_umem *umem; int err; if (udata->inlen < sizeof(ucmd)) @@ -32,39 +31,18 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); - umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); + err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size, &wq->queue); + if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create wq, err %d\n", err); + "Failed to create queue for create wq, %d\n", err); goto err_free_wq; } - wq->umem = umem; wq->wqe = init_attr->max_wr; wq->wq_buf_size = ucmd.wq_buf_size; wq->rx_object = INVALID_MANA_HANDLE; - - err = mana_ib_create_zero_offset_dma_region(mdev, wq->umem, &wq->gdma_region); - if (err) { - ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create wq, %d\n", - err); - goto err_release_umem; - } - - ibdev_dbg(&mdev->ib_dev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, wq->gdma_region); - - /* 
WQ ID is returned at wq_create time, doesn't know the value yet */ - return &wq->ibwq; -err_release_umem: - ib_umem_release(umem); - err_free_wq: kfree(wq); @@ -86,8 +64,7 @@ int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); - mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); - ib_umem_release(wq->umem); + mana_ib_destroy_queue(mdev, &wq->queue); kfree(wq); -- cgit v1.2.3 From f10242b3da908dc9d4bfa040e6511a5b86522499 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 26 Mar 2024 13:08:08 -0700 Subject: RDMA/mana_ib: Use struct mana_ib_queue for RAW QPs Use struct mana_ib_queue and its helpers for RAW QPs Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1711483688-24358-5-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mana_ib.h | 8 ++---- drivers/infiniband/hw/mana/qp.c | 56 +++++++++++------------------------- 2 files changed, 18 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index a8953ee808d9..ceca21cef72a 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -94,12 +94,8 @@ struct mana_ib_cq { struct mana_ib_qp { struct ib_qp ibqp; - /* Work queue info */ - struct ib_umem *sq_umem; - int sqe; - u64 sq_gdma_region; - u64 sq_id; - mana_handle_t tx_object; + mana_handle_t qp_handle; + struct mana_ib_queue raw_sq; /* The port on the IB device, starting with 1 */ u32 port; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index f606caa75839..ef0a6dc664d0 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -297,7 +297,6 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, struct mana_obj_spec cq_spec = {}; struct mana_port_context *mpc; struct net_device *ndev; - struct ib_umem *umem; 
struct mana_eq *eq; int eq_vec; u32 port; @@ -346,32 +345,15 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", ucmd.sq_buf_addr, ucmd.port); - umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(umem)) { - err = PTR_ERR(umem); - ibdev_dbg(&mdev->ib_dev, - "Failed to get umem for create qp-raw, err %d\n", - err); - goto err_free_vport; - } - qp->sq_umem = umem; - - err = mana_ib_create_zero_offset_dma_region(mdev, qp->sq_umem, - &qp->sq_gdma_region); + err = mana_ib_create_queue(mdev, ucmd.sq_buf_addr, ucmd.sq_buf_size, &qp->raw_sq); if (err) { ibdev_dbg(&mdev->ib_dev, - "Failed to create dma region for create qp-raw, %d\n", - err); - goto err_release_umem; + "Failed to create queue for create qp-raw, err %d\n", err); + goto err_free_vport; } - ibdev_dbg(&mdev->ib_dev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, qp->sq_gdma_region); - /* Create a WQ on the same port handle used by the Ethernet */ - wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.gdma_region = qp->raw_sq.gdma_region; wq_spec.queue_size = ucmd.sq_buf_size; cq_spec.gdma_region = send_cq->queue.gdma_region; @@ -382,19 +364,19 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, cq_spec.attached_eq = eq->eq->id; err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, - &cq_spec, &qp->tx_object); + &cq_spec, &qp->qp_handle); if (err) { ibdev_dbg(&mdev->ib_dev, "Failed to create wq for create raw-qp, err %d\n", err); - goto err_destroy_dma_region; + goto err_destroy_queue; } /* The GDMA regions are now owned by the WQ object */ - qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; + qp->raw_sq.gdma_region = GDMA_INVALID_DMA_REGION; send_cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; - qp->sq_id = wq_spec.queue_index; + qp->raw_sq.id = wq_spec.queue_index; send_cq->queue.id = cq_spec.queue_index; /* Create CQ table 
entry */ @@ -403,10 +385,10 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, goto err_destroy_wq_obj; ibdev_dbg(&mdev->ib_dev, - "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, - qp->tx_object, qp->sq_id, send_cq->queue.id); + "ret %d qp->qp_handle 0x%llx sq id %llu cq id %llu\n", err, + qp->qp_handle, qp->raw_sq.id, send_cq->queue.id); - resp.sqid = qp->sq_id; + resp.sqid = qp->raw_sq.id; resp.cqid = send_cq->queue.id; resp.tx_vp_offset = pd->tx_vp_offset; @@ -425,13 +407,10 @@ err_release_gdma_cq: gc->cq_table[send_cq->queue.id] = NULL; err_destroy_wq_obj: - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); -err_destroy_dma_region: - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); - -err_release_umem: - ib_umem_release(umem); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &qp->raw_sq); err_free_vport: mana_ib_uncfg_vport(mdev, pd, port); @@ -505,12 +484,9 @@ static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) mpc = netdev_priv(ndev); pd = container_of(ibpd, struct mana_ib_pd, ibpd); - mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); - if (qp->sq_umem) { - mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); - ib_umem_release(qp->sq_umem); - } + mana_ib_destroy_queue(mdev, &qp->raw_sq); mana_ib_uncfg_vport(mdev, pd, qp->port); -- cgit v1.2.3 From 0611a8e8b475fc5230b9a24d29c8397aaab20b63 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Wed, 3 Apr 2024 13:35:59 +0300 Subject: RDMA/mlx5: Uncacheable mkey has neither rb_key or cache_ent As some mkeys can't be modified with UMR due to some UMR limitations, like the size of translation that can be updated, not all user mkeys can be cached. 
Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow") Signed-off-by: Or Har-Toov Link: https://lore.kernel.org/r/f2742dd934ed73b2d32c66afb8e91b823063880c.1712140377.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a8de35c07c9e..e74f04865062 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -643,7 +643,7 @@ struct mlx5_ib_mkey { unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; - /* User Mkey must hold either a rb_key or a cache_ent. */ + /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */ struct mlx5r_cache_rb_key rb_key; struct mlx5_cache_ent *cache_ent; }; -- cgit v1.2.3 From 8c1185fef68cc603b954fece2a434c9f851d6a86 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Wed, 3 Apr 2024 13:36:00 +0300 Subject: RDMA/mlx5: Change check for cacheable mkeys umem can be NULL for user application mkeys in some cases. Therefore umem can't be used for checking if the mkey is cacheable and it is changed for checking a flag that indicates it. Also make sure that all mkeys which are not returned to the cache will be destroyed. 
Fixes: dd1b913fb0d0 ("RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow") Signed-off-by: Or Har-Toov Link: https://lore.kernel.org/r/2690bc5c6896bcb937f89af16a1ff0343a7ab3d0.1712140377.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/mr.c | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e74f04865062..f255a12e26a0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -646,6 +646,7 @@ struct mlx5_ib_mkey { /* Cacheable user Mkey must hold either a rb_key or a cache_ent. */ struct mlx5r_cache_rb_key rb_key; struct mlx5_cache_ent *cache_ent; + u8 cacheable : 1; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a8ee2ca1f4a1..7f7b1f59b5f0 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1158,6 +1158,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (IS_ERR(mr)) return mr; mr->mmkey.rb_key = rb_key; + mr->mmkey.cacheable = true; return mr; } @@ -1168,6 +1169,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, mr->ibmr.pd = pd; mr->umem = umem; mr->page_shift = order_base_2(page_size); + mr->mmkey.cacheable = true; set_mr_fields(dev, mr, umem->length, access_flags, iova); return mr; @@ -1835,6 +1837,23 @@ end: return ret; } +static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; + + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) + return 0; + + if (ent) { + spin_lock_irq(&ent->mkeys_queue.lock); + ent->in_use--; + mr->mmkey.cache_ent = NULL; + spin_unlock_irq(&ent->mkeys_queue.lock); + } + 
return destroy_mkey(dev, mr); +} + int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mlx5_ib_mr *mr = to_mmr(ibmr); @@ -1880,16 +1899,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) - if (mlx5r_umr_revoke_mr(mr) || - cache_ent_find_and_store(dev, mr)) - mr->mmkey.cache_ent = NULL; - - if (!mr->mmkey.cache_ent) { - rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); - if (rc) - return rc; - } + rc = mlx5_revoke_mr(mr); + if (rc) + return rc; if (mr->umem) { bool is_odp = is_odp_mr(mr); -- cgit v1.2.3 From 2ca7e93bc963d9ec2f5c24d117176851454967af Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Wed, 3 Apr 2024 13:36:01 +0300 Subject: RDMA/mlx5: Adding remote atomic access flag to updatable flags Currently IB_ACCESS_REMOTE_ATOMIC is blocked from being updated via UMR although in some cases it should be possible. These cases are checked in mlx5r_umr_can_reconfig function. 
Fixes: ef3642c4f54d ("RDMA/mlx5: Fix error unwinds for rereg_mr") Signed-off-by: Or Har-Toov Link: https://lore.kernel.org/r/24dac73e2fa48cb806f33a932d97f3e402a5ea2c.1712140377.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 7f7b1f59b5f0..ecc111ed5d86 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1572,7 +1572,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, unsigned int diffs = current_access_flags ^ target_access_flags; if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) + IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING | + IB_ACCESS_REMOTE_ATOMIC)) return false; return mlx5r_umr_can_reconfig(dev, current_access_flags, target_access_flags); -- cgit v1.2.3 From ee20cc17e9d8fd85225e18351637460f3482be2f Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Fri, 15 Mar 2024 17:35:51 +0800 Subject: RDMA/hns: Support DSCP Add support for DSCP configuration. For DSCP, get dscp-prio mapping via hns3 nic driver api .get_dscp_prio() and fill the SL (in WQE for UD or in QPC for RC) with the priority value. The prio-tc mapping is configured to HW by hns3 nic driver. HW will select a corresponding TC according to SL and the prio-tc mapping. 
Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240315093551.1650088-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_ah.c | 33 +++++++---- drivers/infiniband/hw/hns/hns_roce_device.h | 6 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 85 +++++++++++++++++++++++------ drivers/infiniband/hw/hns/hns_roce_qp.c | 13 +++++ include/uapi/rdma/hns-abi.h | 9 ++- 5 files changed, 116 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index b4209b6aed8d..3e02c474f59f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -59,8 +59,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device); struct hns_roce_ib_create_ah_resp resp = {}; struct hns_roce_ah *ah = to_hr_ah(ibah); - int ret = 0; - u32 max_sl; + u8 tclass = get_tclass(grh); + u8 priority = 0; + u8 tc_mode = 0; + int ret; if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) return -EOPNOTSUPP; @@ -74,16 +76,23 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ah->av.hop_limit = grh->hop_limit; ah->av.flowlabel = grh->flow_label; ah->av.udp_sport = get_ah_udp_sport(ah_attr); - ah->av.tclass = get_tclass(grh); - - ah->av.sl = rdma_ah_get_sl(ah_attr); - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(ah->av.sl > max_sl)) { - ibdev_err_ratelimited(&hr_dev->ib_dev, - "failed to set sl, sl (%u) shouldn't be larger than %u.\n", - ah->av.sl, max_sl); + ah->av.tclass = tclass; + + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); + if (ret == -EOPNOTSUPP) + ret = 0; + + if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + return ret; + + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + ah->av.sl = 
priority; + else + ah->av.sl = rdma_ah_get_sl(ah_attr); + + if (!check_sl_valid(hr_dev, ah->av.sl)) return -EINVAL; - } memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); @@ -99,6 +108,8 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, } if (udata) { + resp.priority = ah->av.sl; + resp.tc_mode = tc_mode; memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); ret = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index c3cbd0a494bf..78b4d19ff848 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -645,6 +645,8 @@ struct hns_roce_qp { struct hns_user_mmap_entry *dwqe_mmap_entry; u32 config; enum hns_roce_cong_type cong_type; + u8 tc_mode; + u8 priority; }; struct hns_roce_ib_iboe { @@ -950,6 +952,8 @@ struct hns_roce_hw { int (*query_sccc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer); int (*query_hw_counter)(struct hns_roce_dev *hr_dev, u64 *stats, u32 port, int *hw_counters); + int (*get_dscp)(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority); const struct ib_device_ops *hns_roce_dev_ops; const struct ib_device_ops *hns_roce_dev_srq_ops; }; @@ -1292,4 +1296,6 @@ struct hns_user_mmap_entry * hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, size_t length, enum hns_roce_mmap_type mmap_type); +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl); + #endif /* _HNS_ROCE_DEVICE_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba7ae792d279..423ab66c5856 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -443,10 +443,6 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, hr_reg_write(ud_sq_wqe, UD_SEND_WQE_HOPLIMIT, ah->av.hop_limit); 
hr_reg_write(ud_sq_wqe, UD_SEND_WQE_TCLASS, ah->av.tclass); hr_reg_write(ud_sq_wqe, UD_SEND_WQE_FLOW_LABEL, ah->av.flowlabel); - - if (WARN_ON(ah->av.sl > MAX_SERVICE_LEVEL)) - return -EINVAL; - hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SL, ah->av.sl); ud_sq_wqe->sgid_index = ah->av.gid_index; @@ -4828,6 +4824,69 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr, return 0; } +static int hns_roce_hw_v2_get_dscp(struct hns_roce_dev *hr_dev, u8 dscp, + u8 *tc_mode, u8 *priority) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (!ops->get_dscp_prio) + return -EOPNOTSUPP; + + return ops->get_dscp_prio(handle, dscp, tc_mode, priority); +} + +bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl) +{ + u32 max_sl; + + max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); + if (unlikely(sl > max_sl)) { + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to set SL(%u). 
Shouldn't be larger than %u.\n", + sl, max_sl); + return false; + } + + return true; +} + +static int hns_roce_set_sl(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct ib_device *ibdev = &hr_dev->ib_dev; + int ret; + + ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), + &hr_qp->tc_mode, &hr_qp->priority); + if (ret && ret != -EOPNOTSUPP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ibdev_err_ratelimited(ibdev, + "failed to get dscp, ret = %d.\n", ret); + return ret; + } + + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && + grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + hr_qp->sl = hr_qp->priority; + else + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + + if (!check_sl_valid(hr_dev, hr_qp->sl)) + return -EINVAL; + + hr_reg_write(context, QPC_SL, hr_qp->sl); + hr_reg_clear(qpc_mask, QPC_SL); + + return 0; +} + static int hns_roce_v2_set_path(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -4843,25 +4902,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, int is_roce_protocol; u16 vlan_id = 0xffff; bool is_udp = false; - u32 max_sl; u8 ib_port; u8 hr_port; int ret; - max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1); - if (unlikely(sl > max_sl)) { - ibdev_err_ratelimited(ibdev, - "failed to fill QPC, sl (%u) shouldn't be larger than %u.\n", - sl, max_sl); - return -EINVAL; - } - /* * If free_mr_en of qp is set, it means that this qp comes from * free mr. This qp will perform the loopback operation. * In the loopback scenario, only sl needs to be set. 
*/ if (hr_qp->free_mr_en) { + if (!check_sl_valid(hr_dev, sl)) + return -EINVAL; hr_reg_write(context, QPC_SL, sl); hr_reg_clear(qpc_mask, QPC_SL); hr_qp->sl = sl; @@ -4931,11 +4983,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); - hr_qp->sl = sl; - hr_reg_write(context, QPC_SL, hr_qp->sl); - hr_reg_clear(qpc_mask, QPC_SL); - - return 0; + return hns_roce_set_sl(ibqp, attr, context, qpc_mask); } static bool check_qp_state(enum ib_qp_state cur_state, @@ -6735,6 +6783,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .query_srqc = hns_roce_v2_query_srqc, .query_sccc = hns_roce_v2_query_sccc, .query_hw_counter = hns_roce_hw_v2_query_counter, + .get_dscp = hns_roce_hw_v2_get_dscp, .hns_roce_dev_ops = &hns_roce_v2_dev_ops, .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops, }; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f35a66325d9a..697230f964b1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1386,6 +1386,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ib_modify_qp_resp resp = {}; struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); enum ib_qp_state cur_state, new_state; int ret = -EINVAL; @@ -1427,6 +1428,18 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, new_state, udata); + if (ret) + goto out; + + if (udata && udata->outlen) { + resp.tc_mode = hr_qp->tc_mode; + resp.priority = hr_qp->sl; + ret = ib_copy_to_udata(udata, &resp, + min(udata->outlen, sizeof(resp))); + if (ret) + ibdev_err_ratelimited(&hr_dev->ib_dev, + "failed to copy modify qp resp.\n"); + } out: mutex_unlock(&hr_qp->mutex); diff --git 
a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 158670da2b2a..94e861870e27 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -109,6 +109,12 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; }; +struct hns_roce_ib_modify_qp_resp { + __u8 tc_mode; + __u8 priority; + __u8 reserved[6]; +}; + enum { HNS_ROCE_EXSGE_FLAGS = 1 << 0, HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1, @@ -143,7 +149,8 @@ struct hns_roce_ib_alloc_pd_resp { struct hns_roce_ib_create_ah_resp { __u8 dmac[6]; - __u8 reserved[2]; + __u8 priority; + __u8 tc_mode; }; #endif /* HNS_ABI_USER_H */ -- cgit v1.2.3 From c8fc935f4b198dc6e9871b29f4f3360631d90c8e Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 9 Apr 2024 07:21:05 -0700 Subject: RDMA/mana_ib: remove useless return values from dbg prints Remove printing ret value on success as it was always 0. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712672465-29960-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 4 +--- drivers/infiniband/hw/mana/mr.c | 2 +- drivers/infiniband/hw/mana/qp.c | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 4524c6b80748..b31dcff32699 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -261,9 +261,7 @@ int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, } queue->umem = umem; - ibdev_dbg(&mdev->ib_dev, - "create_dma_region ret %d gdma_region 0x%llx\n", - err, queue->gdma_region); + ibdev_dbg(&mdev->ib_dev, "created dma region 0x%llx\n", queue->gdma_region); return 0; free_umem: diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index b70b13484f09..4f13423ecdbd 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -135,7 +135,7 @@ struct ib_mr 
*mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, } ibdev_dbg(ibdev, - "create_dma_region ret %d gdma_region %llx\n", err, + "created dma region for user-mr 0x%llx\n", dma_region_handle); mr_params.pd_handle = pd->pd_handle; diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 4cd8f8afe80d..8fedf6e01925 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -217,8 +217,8 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, cq->queue.id = cq_spec.queue_index; ibdev_dbg(&mdev->ib_dev, - "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", - ret, wq->rx_object, wq->queue.id, cq->queue.id); + "rx_object 0x%llx wq id %llu cq id %llu\n", + wq->rx_object, wq->queue.id, cq->queue.id); resp.entries[i].cqid = cq->queue.id; resp.entries[i].wqid = wq->queue.id; @@ -383,7 +383,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, goto err_destroy_wq_obj; ibdev_dbg(&mdev->ib_dev, - "ret %d qp->qp_handle 0x%llx sq id %llu cq id %llu\n", err, + "qp->qp_handle 0x%llx sq id %llu cq id %llu\n", qp->qp_handle, qp->raw_sq.id, send_cq->queue.id); resp.sqid = qp->raw_sq.id; -- cgit v1.2.3 From dfcdb38b21e4fb92a49acdbdf6afa82c07c8eba0 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 8 Apr 2024 16:21:42 +0200 Subject: RDMA/rxe: Return the correct errno In the function __rxe_add_to_pool, the function xa_alloc_cyclic is called. The return value of the function xa_alloc_cyclic is as below: " Return: 0 if the allocation succeeded without wrapping. 1 if the allocation succeeded after wrapping, -ENOMEM if memory could not be allocated or -EBUSY if there are no free entries in @limit. " But now the function __rxe_add_to_pool only returns -EINVAL. All the returned error value should be returned to the caller. 
Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240408142142.792413-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 6215c6de3a84..67567d62195e 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -119,7 +119,7 @@ void rxe_pool_cleanup(struct rxe_pool *pool) int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable) { - int err; + int err = -EINVAL; gfp_t gfp_flags; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) @@ -147,7 +147,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, err_cnt: atomic_dec(&pool->num_elem); - return -EINVAL; + return err; } void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) -- cgit v1.2.3 From 23f59f4e837bba9db8d25ae85b8455d53b23665b Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 12 Apr 2024 01:47:36 -0700 Subject: RDMA/mana_ib: Use num_comp_vectors of ib_device Use num_comp_vectors of struct ib_device instead of max_num_queues from gdma_context. 
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712911656-17352-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 7 +------ drivers/infiniband/hw/mana/device.c | 2 +- drivers/infiniband/hw/mana/qp.c | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index c9129218f1be..dc931b9c3491 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -12,19 +12,14 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; - struct gdma_context *gc; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev_to_gc(mdev); if (udata->inlen < sizeof(ucmd)) return -EINVAL; - if (attr->comp_vector > gc->max_num_queues) - return -EINVAL; - - cq->comp_vector = attr->comp_vector; + cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); if (err) { diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 6fa902ee80a6..07e97de31886 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -74,7 +74,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, * num_comp_vectors needs to set to the max MSIX index * when interrupts and event queues are implemented */ - dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; ret = mana_gd_register_device(&mdev->gdma_context->mana_ib); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 8fedf6e01925..280e85a83f7e 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -198,7 +198,7 
@@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, cq_spec.gdma_region = cq->queue.gdma_region; cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq = &mpc->ac->eqs[cq->comp_vector % gc->max_num_queues]; + eq = &mpc->ac->eqs[cq->comp_vector]; cq_spec.attached_eq = eq->eq->id; ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, @@ -357,7 +357,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, cq_spec.gdma_region = send_cq->queue.gdma_region; cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; cq_spec.modr_ctx_id = 0; - eq_vec = send_cq->comp_vector % gc->max_num_queues; + eq_vec = send_cq->comp_vector; eq = &mpc->ac->eqs[eq_vec]; cq_spec.attached_eq = eq->eq->id; -- cgit v1.2.3 From 98b889c43935c43ad15783dbfb1e59b4ee7f4a56 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:26 -0700 Subject: RDMA/mana_ib: Add EQ creation for rnic adapter Create an error EQ for the RNIC adapter. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-2-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 13 ++++++++++--- drivers/infiniband/hw/mana/main.c | 26 ++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 5 +++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 07e97de31886..08fdd917d075 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -92,15 +92,23 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto deregister_device; } + ret = mana_ib_create_eqs(dev); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret); + goto deregister_device; + } + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) - goto deregister_device; + goto destroy_eqs; dev_set_drvdata(&adev->dev, 
dev); return 0; +destroy_eqs: + mana_ib_destroy_eqs(dev); deregister_device: mana_gd_deregister_device(dev->gdma_dev); free_ib_device: @@ -113,9 +121,8 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); - + mana_ib_destroy_eqs(dev); mana_gd_deregister_device(dev->gdma_dev); - ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index b31dcff32699..bcf6b282f9d7 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -611,3 +611,29 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev) return 0; } + +int mana_ib_create_eqs(struct mana_ib_dev *mdev) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct gdma_queue_spec spec = {}; + int err; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = EQ_SIZE; + spec.eq.callback = NULL; + spec.eq.context = mdev; + spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.msix_index = 0; + + err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq); + if (err) + return err; + + return 0; +} + +void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) +{ + mana_gd_destroy_queue(mdev_to_gc(mdev), mdev->fatal_err_eq); +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index ceca21cef72a..7c55204125de 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -54,6 +54,7 @@ struct mana_ib_queue { struct mana_ib_dev { struct ib_device ib_dev; struct gdma_dev *gdma_dev; + struct gdma_queue *fatal_err_eq; struct mana_ib_adapter_caps adapter_caps; }; @@ -233,4 +234,8 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev); + +int mana_ib_create_eqs(struct mana_ib_dev *mdev); + +void 
mana_ib_destroy_eqs(struct mana_ib_dev *mdev); #endif -- cgit v1.2.3 From 1a79c2b9d4a08788cf1554981f10d23fbad77d11 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:27 -0700 Subject: RDMA/mana_ib: Create and destroy rnic adapter Add functions for RNIC creation and destruction. If creation fails, the ib_probe fails as well. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-3-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 9 +++++++- drivers/infiniband/hw/mana/main.c | 43 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 28 +++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 08fdd917d075..721e2ab8388f 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -98,15 +98,21 @@ static int mana_ib_probe(struct auxiliary_device *adev, goto deregister_device; } + ret = mana_ib_gd_create_rnic_adapter(dev); + if (ret) + goto destroy_eqs; + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) - goto destroy_eqs; + goto destroy_rnic; dev_set_drvdata(&adev->dev, dev); return 0; +destroy_rnic: + mana_ib_gd_destroy_rnic_adapter(dev); destroy_eqs: mana_ib_destroy_eqs(dev); deregister_device: @@ -121,6 +127,7 @@ static void mana_ib_remove(struct auxiliary_device *adev) struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); ib_unregister_device(&dev->ib_dev); + mana_ib_gd_destroy_rnic_adapter(dev); mana_ib_destroy_eqs(dev); mana_gd_deregister_device(dev->gdma_dev); ib_dealloc_device(&dev->ib_dev); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index bcf6b282f9d7..344e85f4940d 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -637,3 +637,46 @@ void mana_ib_destroy_eqs(struct 
mana_ib_dev *mdev) { mana_gd_destroy_queue(mdev_to_gc(mdev), mdev->fatal_err_eq); } + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_create_adapter_resp resp = {}; + struct mana_rnic_create_adapter_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.req.msg_version = GDMA_MESSAGE_V2; + req.hdr.dev_id = gc->mana_ib.dev_id; + req.notify_eq_id = mdev->fatal_err_eq->id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create RNIC adapter err %d", err); + return err; + } + mdev->adapter_handle = resp.adapter; + + return 0; +} + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev) +{ + struct mana_rnic_destroy_adapter_resp resp = {}; + struct mana_rnic_destroy_adapter_req req = {}; + struct gdma_context *gc; + int err; + + gc = mdev_to_gc(mdev); + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy RNIC adapter err %d", err); + return err; + } + + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 7c55204125de..842f9c63a495 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -54,6 +54,7 @@ struct mana_ib_queue { struct mana_ib_dev { struct ib_device ib_dev; struct gdma_dev *gdma_dev; + mana_handle_t adapter_handle; struct gdma_queue *fatal_err_eq; struct mana_ib_adapter_caps adapter_caps; }; @@ -113,6 +114,8 @@ struct mana_ib_rwq_ind_table { enum mana_ib_command_code { MANA_IB_GET_ADAPTER_CAP = 0x30001, + MANA_IB_CREATE_ADAPTER = 0x30002, + MANA_IB_DESTROY_ADAPTER = 0x30003, }; struct 
mana_ib_query_adapter_caps_req { @@ -141,6 +144,27 @@ struct mana_ib_query_adapter_caps_resp { u32 max_inline_data_size; }; /* HW Data */ +struct mana_rnic_create_adapter_req { + struct gdma_req_hdr hdr; + u32 notify_eq_id; + u32 reserved; + u64 feature_flags; +}; /*HW Data */ + +struct mana_rnic_create_adapter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_destroy_adapter_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /*HW Data */ + +struct mana_rnic_destroy_adapter_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -238,4 +262,8 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev); int mana_ib_create_eqs(struct mana_ib_dev *mdev); void mana_ib_destroy_eqs(struct mana_ib_dev *mdev); + +int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev); #endif -- cgit v1.2.3 From 4bda1d5332ec1b00262ad53f6a4cfa88190a048d Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:28 -0700 Subject: RDMA/mana_ib: Implement port parameters Implement port parameters for RNIC: 1) extend query_port() method 2) implement get_link_layer() 3) implement query_pkey() Only port 1 can store GIDs. 
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-4-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 2 ++ drivers/infiniband/hw/mana/main.c | 37 +++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mana/mana_ib.h | 4 ++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 721e2ab8388f..ef04cc48264c 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -29,12 +29,14 @@ static const struct ib_device_ops mana_ib_dev_ops = { .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, .destroy_wq = mana_ib_destroy_wq, .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_link_layer = mana_ib_get_link_layer, .get_port_immutable = mana_ib_get_port_immutable, .mmap = mana_ib_mmap, .modify_qp = mana_ib_modify_qp, .modify_wq = mana_ib_modify_wq, .query_device = mana_ib_query_device, .query_gid = mana_ib_query_gid, + .query_pkey = mana_ib_query_pkey, .query_port = mana_ib_query_port, .reg_user_mr = mana_ib_reg_user_mr, diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 344e85f4940d..b2817c92f1c0 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -555,7 +555,42 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, int mana_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props) { - /* This version doesn't return port properties */ + struct net_device *ndev = mana_ib_get_netdev(ibdev, port); + + if (!ndev) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + props->max_mtu = IB_MTU_4096; + props->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + + if (netif_carrier_ok(ndev) && netif_running(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else { + props->state = 
IB_PORT_DOWN; + props->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_EDR; + props->pkey_tbl_len = 1; + if (port == 1) + props->gid_tbl_len = 16; + + return 0; +} + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + if (index != 0) + return -EINVAL; + *pkey = IB_DEFAULT_PKEY_FULL; return 0; } diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 842f9c63a495..b9117cbc7629 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -266,4 +266,8 @@ void mana_ib_destroy_eqs(struct mana_ib_dev *mdev); int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev); int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev); + +int mana_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); + +enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num); #endif -- cgit v1.2.3 From 8b184e4f1c328d9b37994f66224550befdefe49b Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:29 -0700 Subject: RDMA/mana_ib: Enable RoCE on port 1 Set netdev and RoCEv2 flag to enable GID population on port 1. Use GIDs of the master netdev. As mc->ports[] stores slave devices, use a helper to get the master netdev. 
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-5-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 15 +++++++++++++++ drivers/infiniband/hw/mana/main.c | 15 +++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index ef04cc48264c..77994d32f87b 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -53,6 +53,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, { struct mana_adev *madev = container_of(adev, struct mana_adev, adev); struct gdma_dev *mdev = madev->mdev; + struct net_device *upper_ndev; struct mana_context *mc; struct mana_ib_dev *dev; int ret; @@ -79,6 +80,20 @@ static int mana_ib_probe(struct auxiliary_device *adev, dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues; dev->ib_dev.dev.parent = mdev->gdma_context->dev; + rcu_read_lock(); /* required to get upper dev */ + upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]); + if (!upper_ndev) { + rcu_read_unlock(); + ibdev_err(&dev->ib_dev, "Failed to get master netdev"); + goto free_ib_device; + } + ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1); + rcu_read_unlock(); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret); + goto free_ib_device; + } + ret = mana_gd_register_device(&mdev->gdma_context->mana_ib); if (ret) { ibdev_err(&dev->ib_dev, "Failed to register device, ret %d", diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index b2817c92f1c0..c020183385d4 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -525,11 +525,18 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { - /* - * This version 
only support RAW_PACKET - * other values need to be filled for other types - */ + struct ib_port_attr attr; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + if (port_num == 1) + immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; return 0; } -- cgit v1.2.3 From faafb8b126ad6043663a77e6b234bca932f60694 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:30 -0700 Subject: RDMA/mana_ib: Adding and deleting GIDs Implement add_gid and del_gid for RNIC. IPv4 and IPv6 addresses are supported. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-6-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 2 ++ drivers/infiniband/hw/mana/main.c | 60 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 35 +++++++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 77994d32f87b..5d9fd59b1ff2 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -15,6 +15,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .driver_id = RDMA_DRIVER_MANA, .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + .add_gid = mana_ib_gd_add_gid, .alloc_pd = mana_ib_alloc_pd, .alloc_ucontext = mana_ib_alloc_ucontext, .create_cq = mana_ib_create_cq, @@ -23,6 +24,7 @@ static const struct ib_device_ops mana_ib_dev_ops = { .create_wq = mana_ib_create_wq, .dealloc_pd = mana_ib_dealloc_pd, .dealloc_ucontext = mana_ib_dealloc_ucontext, + .del_gid = mana_ib_gd_del_gid, .dereg_mr = mana_ib_dereg_mr, .destroy_cq = mana_ib_destroy_cq, .destroy_qp = mana_ib_destroy_qp, diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 
c020183385d4..e404762ba029 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -722,3 +722,63 @@ int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev) return 0; } + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = ADDR_OP_ADD; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context) +{ + struct mana_ib_dev *mdev = container_of(attr->device, struct mana_ib_dev, ib_dev); + enum rdma_network_type ntype = rdma_gid_attr_network_type(attr); + struct mana_rnic_config_addr_resp resp = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_config_addr_req req = {}; + int err; + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) { + ibdev_dbg(&mdev->ib_dev, "Unsupported rdma network type %d", ntype); + return -EINVAL; + } + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = 
ADDR_OP_REMOVE; + req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4; + copy_in_reverse(req.ip_addr, attr->gid.raw, sizeof(union ib_gid)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config IP addr err %d\n", err); + return err; + } + + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index b9117cbc7629..89ac5b39dbce 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -116,6 +116,7 @@ enum mana_ib_command_code { MANA_IB_GET_ADAPTER_CAP = 0x30001, MANA_IB_CREATE_ADAPTER = 0x30002, MANA_IB_DESTROY_ADAPTER = 0x30003, + MANA_IB_CONFIG_IP_ADDR = 0x30004, }; struct mana_ib_query_adapter_caps_req { @@ -165,6 +166,28 @@ struct mana_rnic_destroy_adapter_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +enum mana_ib_addr_op { + ADDR_OP_ADD = 1, + ADDR_OP_REMOVE = 2, +}; + +enum sgid_entry_type { + SGID_TYPE_IPV4 = 1, + SGID_TYPE_IPV6 = 2, +}; + +struct mana_rnic_config_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + enum sgid_entry_type sgid_type; + u8 ip_addr[16]; +}; /* HW Data */ + +struct mana_rnic_config_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -181,6 +204,14 @@ static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 return mc->ports[port - 1]; } +static inline void copy_in_reverse(u8 *dst, const u8 *src, u32 size) +{ + u32 i; + + for (i = 0; i < size; i++) + dst[size - 1 - i] = src[i]; +} + int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, @@ -270,4 +301,8 @@ int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev); int mana_ib_query_pkey(struct 
ib_device *ibdev, u32 port, u16 index, u16 *pkey); enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 port_num); + +int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context); #endif -- cgit v1.2.3 From 8859f009ace237ffc165c95edcc113d3824b9bf3 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 10 Apr 2024 01:42:31 -0700 Subject: RDMA/mana_ib: Configure mac address in RNIC Set local mac address in RNIC, which is required by the HW. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1712738551-22075-7-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/device.c | 9 +++++++++ drivers/infiniband/hw/mana/main.c | 22 ++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 15 +++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 5d9fd59b1ff2..fca4d0d85c64 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -58,6 +58,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, struct net_device *upper_ndev; struct mana_context *mc; struct mana_ib_dev *dev; + u8 mac_addr[ETH_ALEN]; int ret; mc = mdev->driver_data; @@ -89,6 +90,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, ibdev_err(&dev->ib_dev, "Failed to get master netdev"); goto free_ib_device; } + ether_addr_copy(mac_addr, upper_ndev->dev_addr); ret = ib_device_set_netdev(&dev->ib_dev, upper_ndev, 1); rcu_read_unlock(); if (ret) { @@ -121,6 +123,13 @@ static int mana_ib_probe(struct auxiliary_device *adev, if (ret) goto destroy_eqs; + ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr); + if (ret) { + ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", + ret); + goto destroy_rnic; + } + ret = ib_register_device(&dev->ib_dev, "mana_%d", mdev->gdma_context->dev); if (ret) 
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index e404762ba029..f5401471bffe 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -782,3 +782,25 @@ int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context) return 0; } + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac) +{ + struct mana_rnic_config_mac_addr_resp resp = {}; + struct mana_rnic_config_mac_addr_req req = {}; + struct gdma_context *gc = mdev_to_gc(mdev); + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.op = op; + copy_in_reverse(req.mac_addr, mac, ETH_ALEN); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to config Mac addr err %d", err); + return err; + } + + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 89ac5b39dbce..4c1240da0c5f 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -117,6 +117,7 @@ enum mana_ib_command_code { MANA_IB_CREATE_ADAPTER = 0x30002, MANA_IB_DESTROY_ADAPTER = 0x30003, MANA_IB_CONFIG_IP_ADDR = 0x30004, + MANA_IB_CONFIG_MAC_ADDR = 0x30005, }; struct mana_ib_query_adapter_caps_req { @@ -188,6 +189,18 @@ struct mana_rnic_config_addr_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_rnic_config_mac_addr_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + enum mana_ib_addr_op op; + u8 mac_addr[ETH_ALEN]; + u8 reserved[6]; +}; /* HW Data */ + +struct mana_rnic_config_mac_addr_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -305,4 +318,6 @@ enum rdma_link_layer mana_ib_get_link_layer(struct ib_device *device, u32 
port_n int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context); int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context); + +int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac); #endif -- cgit v1.2.3 From 203b70fda63425a4eb29f03f9074859afe821a39 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 11 Apr 2024 11:38:51 +0800 Subject: RDMA/hns: Fix return value in hns_roce_map_mr_sg As described in the ib_map_mr_sg function comment, it returns the number of sg elements that were mapped to the memory region. However, hns_roce_map_mr_sg returns the number of pages required for mapping the DMA area. Fix it. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20240411033851.2884771-1-shaozhengchao@huawei.com Reviewed-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_mr.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 9e05b57a2d67..80c050d7d0ea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -441,18 +441,18 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); struct hns_roce_mtr *mtr = &mr->pbl_mtr; - int ret = 0; + int ret, sg_num = 0; mr->npages = 0; mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count, sizeof(dma_addr_t), GFP_KERNEL); if (!mr->page_list) - return ret; + return sg_num; - ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); - if (ret < 1) { + sg_num = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); + if (sg_num < 1) { ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n", - mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, ret); + 
mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, sg_num); goto err_page_list; } @@ -463,17 +463,16 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, ret = hns_roce_mtr_map(hr_dev, mtr, mr->page_list, mr->npages); if (ret) { ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret); - ret = 0; + sg_num = 0; } else { mr->pbl_mtr.hem_cfg.buf_pg_shift = (u32)ilog2(ibmr->page_size); - ret = mr->npages; } err_page_list: kvfree(mr->page_list); mr->page_list = NULL; - return ret; + return sg_num; } static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, -- cgit v1.2.3 From bfb6be401470206ac02cbfdaf7b76ee040c1ae3d Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Fri, 12 Apr 2024 17:16:07 +0800 Subject: RDMA/hns: Use macro instead of magic number Use macro instead of magic number. Signed-off-by: Yangyang Li Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-2-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 +++++++++++++++++------------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 13 ++++++++++++ drivers/infiniband/hw/hns/hns_roce_qp.c | 3 ++- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 423ab66c5856..30ac5fb5ab16 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3204,13 +3204,14 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, /* Aligned to the hardware address access unit */ for (i = 0; i < ARRAY_SIZE(pages); i++) - pages[i] >>= 6; + pages[i] >>= MPT_PBL_BUF_ADDR_S; pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> MPT_PBL_BA_ADDR_S); + 
hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); hr_reg_write(mpt_entry, MPT_PA0_H, upper_32_bits(pages[0])); @@ -3331,8 +3332,10 @@ static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3)); - hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> + MPT_PBL_BA_ADDR_S)); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, + upper_32_bits(pbl_ba >> MPT_PBL_BA_ADDR_S)); return 0; } @@ -3578,14 +3581,14 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift)); hr_reg_write(cq_context, CQC_CQE_BUF_PG_SZ, to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift)); - hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> 3); - hr_reg_write(cq_context, CQC_CQE_BA_H, (dma_handle >> (32 + 3))); + hr_reg_write(cq_context, CQC_CQE_BA_L, dma_handle >> CQC_CQE_BA_L_S); + hr_reg_write(cq_context, CQC_CQE_BA_H, dma_handle >> CQC_CQE_BA_H_S); hr_reg_write_bool(cq_context, CQC_DB_RECORD_EN, hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_L, ((u32)hr_cq->db.dma) >> 1); hr_reg_write(cq_context, CQC_CQE_DB_RECORD_ADDR_H, - hr_cq->db.dma >> 32); + hr_cq->db.dma >> CQC_CQE_DB_RECORD_ADDR_H_S); hr_reg_write(cq_context, CQC_CQ_MAX_CNT, HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); hr_reg_write(cq_context, CQC_CQ_PERIOD, @@ -4517,16 +4520,16 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return -EINVAL; } - hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> 4); + hr_reg_write(context, QPC_TRRL_BA_L, trrl_ba >> QPC_TRRL_BA_L_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_L); - context->trrl_ba = cpu_to_le32(trrl_ba >> (16 + 4)); + context->trrl_ba = cpu_to_le32(trrl_ba >> QPC_TRRL_BA_M_S); qpc_mask->trrl_ba = 0; - hr_reg_write(context, 
QPC_TRRL_BA_H, trrl_ba >> (32 + 16 + 4)); + hr_reg_write(context, QPC_TRRL_BA_H, trrl_ba >> QPC_TRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_TRRL_BA_H); - context->irrl_ba = cpu_to_le32(irrl_ba >> 6); + context->irrl_ba = cpu_to_le32(irrl_ba >> QPC_IRRL_BA_L_S); qpc_mask->irrl_ba = 0; - hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> (32 + 6)); + hr_reg_write(context, QPC_IRRL_BA_H, irrl_ba >> QPC_IRRL_BA_H_S); hr_reg_clear(qpc_mask, QPC_IRRL_BA_H); hr_reg_enable(context, QPC_RMT_E2E); @@ -4588,8 +4591,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_clear(qpc_mask, QPC_TRRL_HEAD_MAX); hr_reg_clear(qpc_mask, QPC_TRRL_TAIL_MAX); +#define MAX_LP_SGEN 3 /* rocee send 2^lp_sgen_ini segs every time */ - hr_reg_write(context, QPC_LP_SGEN_INI, 3); + hr_reg_write(context, QPC_LP_SGEN_INI, MAX_LP_SGEN); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI); if (udata && ibqp->qp_type == IB_QPT_RC && @@ -4681,7 +4685,7 @@ static int get_dip_ctx_idx(struct ib_qp *ibqp, const struct ib_qp_attr *attr, *tail = (*tail == hr_dev->caps.num_qps - 1) ? 
0 : (*tail + 1); list_for_each_entry(hr_dip, &hr_dev->dip_list, node) { - if (!memcmp(grh->dgid.raw, hr_dip->dgid, 16)) { + if (!memcmp(grh->dgid.raw, hr_dip->dgid, GID_LEN_V2)) { *dip_idx = hr_dip->dip_idx; goto out; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index df04bc8ede57..4bac34f6bbe8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -276,6 +276,10 @@ struct hns_roce_v2_cq_context { __le32 byte_64_se_cqe_idx; }; +#define CQC_CQE_BA_L_S 3 +#define CQC_CQE_BA_H_S (32 + CQC_CQE_BA_L_S) +#define CQC_CQE_DB_RECORD_ADDR_H_S 32 + #define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 @@ -447,6 +451,12 @@ struct hns_roce_v2_qp_context { struct hns_roce_v2_qp_context_ex ext; }; +#define QPC_TRRL_BA_L_S 4 +#define QPC_TRRL_BA_M_S (16 + QPC_TRRL_BA_L_S) +#define QPC_TRRL_BA_H_S (32 + QPC_TRRL_BA_M_S) +#define QPC_IRRL_BA_L_S 6 +#define QPC_IRRL_BA_H_S (32 + QPC_IRRL_BA_L_S) + #define QPC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_qp_context, h, l) #define QPC_TST QPC_FIELD_LOC(2, 0) @@ -716,6 +726,9 @@ struct hns_roce_v2_mpt_entry { __le32 byte_64_buf_pa1; }; +#define MPT_PBL_BUF_ADDR_S 6 +#define MPT_PBL_BA_ADDR_S 3 + #define MPT_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_mpt_entry, h, l) #define MPT_ST MPT_FIELD_LOC(1, 0) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 697230f964b1..cac3fe588672 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -410,7 +410,8 @@ static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) bankid = get_qp_bankid(hr_qp->qpn); - ida_free(&hr_dev->qp_table.bank[bankid].ida, hr_qp->qpn >> 3); + ida_free(&hr_dev->qp_table.bank[bankid].ida, + hr_qp->qpn / HNS_ROCE_QP_BANK_NUM); mutex_lock(&hr_dev->qp_table.bank_mutex); hr_dev->qp_table.bank[bankid].inuse--; -- cgit 
v1.2.3 From f4caa864af84f801a5821ea2ba6c1cc46f8252c1 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:08 +0800 Subject: RDMA/hns: Remove unused parameters and variables Remove unused parameters and variables. Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-3-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_device.h | 5 ++--- drivers/infiniband/hw/hns/hns_roce_hem.c | 13 +++++-------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 20 +++++++------------- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 +--- drivers/infiniband/hw/hns/hns_roce_srq.c | 4 ++-- 7 files changed, 20 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 11a78ceae568..950c133d4220 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -153,8 +153,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, return total; } -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, - int buf_cnt, struct ib_umem *umem, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift) { struct ib_block_iter biter; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 78b4d19ff848..37888f78849d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -925,8 +925,7 @@ struct hns_roce_hw { int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int flags, void *mb_buf); - int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, - struct hns_roce_mr *mr); + int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); int 
(*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, @@ -1232,7 +1231,7 @@ struct hns_roce_buf *hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, struct hns_roce_buf *buf, unsigned int page_shift); -int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, +int hns_roce_get_umem_bufs(dma_addr_t *bufs, int buf_cnt, struct ib_umem *umem, unsigned int page_shift); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index a4b3f19161dc..a9ea55506779 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -986,15 +986,13 @@ static void hem_list_free_all(struct hns_roce_dev *hr_dev, } } -static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr, - u64 table_addr) +static void hem_list_link_bt(void *base_addr, u64 table_addr) { *(u64 *)(base_addr) = table_addr; } /* assign L0 table address to hem from root bt */ -static void hem_list_assign_bt(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_item *hem, void *cpu_addr, +static void hem_list_assign_bt(struct hns_roce_hem_item *hem, void *cpu_addr, u64 phy_addr) { hem->addr = cpu_addr; @@ -1163,8 +1161,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, if (level > 1) { pre = hem_ptrs[level - 1]; step = (cur->start - pre->start) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, pre->addr + step, - cur->dma_addr); + hem_list_link_bt(pre->addr + step, cur->dma_addr); } } @@ -1222,7 +1219,7 @@ static int alloc_fake_root_bt(struct hns_roce_dev *hr_dev, void *cpu_base, if (!hem) return -ENOMEM; - hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base); + hem_list_assign_bt(hem, cpu_base, phy_base); list_add(&hem->list, branch_head); list_add(&hem->sibling, leaf_head); @@ -1245,7 +1242,7 @@ static int 
setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, /* if exist mid bt, link L1 to L0 */ list_for_each_entry_safe(hem, temp_hem, branch_head, list) { offset = (hem->start - r->offset) / step * BA_BYTE_LEN; - hem_list_link_bt(hr_dev, cpu_base + offset, hem->dma_addr); + hem_list_link_bt(cpu_base + offset, hem->dma_addr); total++; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 30ac5fb5ab16..e3f87090bad0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3304,8 +3304,7 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return ret; } -static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, - void *mb_buf, struct hns_roce_mr *mr) +static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) { dma_addr_t pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr); struct hns_roce_v2_mpt_entry *mpt_entry; @@ -4216,8 +4215,7 @@ static void set_access_flags(struct hns_roce_qp *hr_qp, } static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *context) { hr_reg_write(context, QPC_SGE_SHIFT, to_hr_hem_entries_shift(hr_qp->sge.sge_cnt, @@ -4239,7 +4237,6 @@ static inline int get_pdn(struct ib_pd *ib_pd) } static void modify_qp_reset_to_init(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4258,7 +4255,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQWS, ilog2(hr_qp->rq.max_gs)); - set_qpc_wqe_cnt(hr_qp, context, qpc_mask); + set_qpc_wqe_cnt(hr_qp, context); /* No VLAN need to set 0xFFF */ hr_reg_write(context, QPC_VLAN_ID, 0xfff); @@ -4299,7 +4296,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, } static void modify_qp_init_to_init(struct ib_qp *ibqp, - const struct 
ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -4619,8 +4615,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, return 0; } -static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, int attr_mask, +static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, int attr_mask, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -5034,15 +5029,14 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, hr_dev->caps.qpc_sz); - modify_qp_reset_to_init(ibqp, attr, context, qpc_mask); + modify_qp_reset_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { - modify_qp_init_to_init(ibqp, attr, context, qpc_mask); + modify_qp_init_to_init(ibqp, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, qpc_mask, udata); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { - ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, - qpc_mask); + ret = modify_qp_rtr_to_rts(ibqp, attr_mask, context, qpc_mask); } return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 80c050d7d0ea..1a61dceb3319 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -162,7 +162,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); else - ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr); + ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); if (ret) { dev_err(dev, "failed to write mtpt, ret = %d.\n", ret); goto err_page; @@ -755,7 +755,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) return -ENOMEM; if (mtr->umem) 
- npage = hns_roce_get_umem_bufs(hr_dev, pages, page_count, + npage = hns_roce_get_umem_bufs(pages, page_count, mtr->umem, page_shift); else npage = hns_roce_get_kmem_bufs(hr_dev, pages, page_count, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index cac3fe588672..dc3cb26f434e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1118,7 +1118,6 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, } static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, - struct ib_pd *ib_pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct hns_roce_qp *hr_qp) @@ -1272,7 +1271,6 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, struct ib_device *ibdev = qp->device; struct hns_roce_dev *hr_dev = to_hr_dev(ibdev); struct hns_roce_qp *hr_qp = to_hr_qp(qp); - struct ib_pd *pd = qp->pd; int ret; ret = check_qp_type(hr_dev, init_attr->qp_type, !!udata); @@ -1287,7 +1285,7 @@ int hns_roce_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; } - ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp); + ret = hns_roce_create_qp_common(hr_dev, init_attr, udata, hr_qp); if (ret) ibdev_err(ibdev, "create QP type 0x%x failed(%d)\n", init_attr->qp_type, ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 4abae9477854..e4705ccdfa65 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -250,7 +250,7 @@ static void free_srq_wqe_buf(struct hns_roce_dev *hr_dev, hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr); } -static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +static int alloc_srq_wrid(struct hns_roce_srq *srq) { srq->wrid = kvmalloc_array(srq->wqe_cnt, sizeof(u64), GFP_KERNEL); if (!srq->wrid) @@ -366,7 
+366,7 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, goto err_idx; if (!udata) { - ret = alloc_srq_wrid(hr_dev, srq); + ret = alloc_srq_wrid(srq); if (ret) goto err_wqe_buf; } -- cgit v1.2.3 From 2ce384307f2ddf39dc662878e151722199afc9ae Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:09 +0800 Subject: RDMA/hns: Add max_ah and cq moderation capacities in query_device() Add max_ah and cq moderation capacities to hns_roce_query_device(). Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-4-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 7 +++++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 37888f78849d..ff0b3f68ee3a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -100,6 +100,9 @@ #define CQ_BANKID_SHIFT 2 #define CQ_BANKID_MASK GENMASK(1, 0) +#define HNS_ROCE_MAX_CQ_COUNT 0xFFFF +#define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF + enum { SERV_TYPE_RC, SERV_TYPE_UC, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e3f87090bad0..2a97a81ae19f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5848,7 +5848,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) dev_info(hr_dev->dev, "cq_period(%u) reached the upper limit, adjusted to 65.\n", cq_period); - cq_period = HNS_ROCE_MAX_CQ_PERIOD; + cq_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; } cq_period *= 
HNS_ROCE_CLOCK_ADJUST; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 4bac34f6bbe8..def1d15a03c7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1347,7 +1347,7 @@ struct fmea_ram_ecc { /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 -#define HNS_ROCE_MAX_CQ_PERIOD 65 +#define HNS_ROCE_MAX_CQ_PERIOD_HIP08 65 #define HNS_ROCE_MAX_EQ_PERIOD 65 #define HNS_ROCE_RNR_TIMER_10NS 1 #define HNS_ROCE_1US_CFG 999 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 1dc60c2b2b7a..4d94fcb8685a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -40,6 +40,7 @@ #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -192,6 +193,12 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = 1; props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; + props->max_ah = INT_MAX; + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD; + props->cq_caps.max_cq_moderation_count = HNS_ROCE_MAX_CQ_COUNT; + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + props->cq_caps.max_cq_moderation_period = HNS_ROCE_MAX_CQ_PERIOD_HIP08; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { props->max_srq = hr_dev->caps.num_srqs; props->max_srq_wr = hr_dev->caps.max_srq_wrs; -- cgit v1.2.3 From b46494b6f9c19f141114a57729e198698f40af37 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:10 +0800 Subject: RDMA/hns: Fix deadlock on SRQ async events. xa_lock for SRQ table may be required in AEQ. Use xa_store_irq()/ xa_erase_irq() to avoid deadlock. 
Fixes: 81fce6291d99 ("RDMA/hns: Add SRQ asynchronous event support") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-5-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/hns/hns_roce_srq.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 4d94fcb8685a..d202258368ed 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,6 +37,7 @@ #include #include #include +#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index e4705ccdfa65..7210e53a82f3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -123,7 +123,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return ret; } - ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret); goto err_put; @@ -136,7 +136,7 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return 0; err_xa: - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); err_put: hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn); @@ -154,7 +154,7 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", ret, srq->srqn); - xa_erase(&srq_table->xa, srq->srqn); + xa_erase_irq(&srq_table->xa, srq->srqn); if (refcount_dec_and_test(&srq->refcount)) complete(&srq->free); -- cgit v1.2.3 From 
a942ec2745ca864cd8512142100e4027dc306a42 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:11 +0800 Subject: RDMA/hns: Fix UAF for cq async event The refcount of CQ is not protected by locks. When CQ asynchronous events and CQ destruction are concurrent, CQ may have been released, which will cause UAF. Use the xa_lock() to protect the CQ refcount. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-6-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_cq.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 7250d0643b5c..68e22f368d43 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -149,7 +149,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) return ret; } - ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); + ret = xa_err(xa_store_irq(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); if (ret) { ibdev_err(ibdev, "failed to xa_store CQ, ret = %d.\n", ret); goto err_put; @@ -163,7 +163,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) return 0; err_xa: - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); err_put: hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); @@ -182,7 +182,7 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); - xa_erase(&cq_table->array, hr_cq->cqn); + xa_erase_irq(&cq_table->array, hr_cq->cqn); /* Waiting interrupt process procedure carried out */ synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); @@ -476,13 +476,6 @@ void 
hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) struct ib_event event; struct ib_cq *ibcq; - hr_cq = xa_load(&hr_dev->cq_table.array, - cqn & (hr_dev->caps.num_cqs - 1)); - if (!hr_cq) { - dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); - return; - } - if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID && event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR && event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) { @@ -491,7 +484,16 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) return; } - refcount_inc(&hr_cq->refcount); + xa_lock(&hr_dev->cq_table.array); + hr_cq = xa_load(&hr_dev->cq_table.array, + cqn & (hr_dev->caps.num_cqs - 1)); + if (hr_cq) + refcount_inc(&hr_cq->refcount); + xa_unlock(&hr_dev->cq_table.array); + if (!hr_cq) { + dev_warn(dev, "async event for bogus CQ 0x%06x\n", cqn); + return; + } ibcq = &hr_cq->ib_cq; if (ibcq->event_handler) { -- cgit v1.2.3 From dc3bda6e568e9310b7cd07769dd70a3f0cd696ca Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 12 Apr 2024 17:16:12 +0800 Subject: RDMA/hns: Fix mismatch exception rollback When dma_alloc_coherent() fails in hns_roce_alloc_hem(), just call kfree() to release hem instead of hns_roce_free_hem(). 
Fixes: c00743cbf2b8 ("RDMA/hns: Simplify 'struct hns_roce_hem' allocation") Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-7-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index a9ea55506779..1c2ec803e030 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -281,7 +281,7 @@ static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev, return hem; fail: - hns_roce_free_hem(hr_dev, hem); + kfree(hem); return NULL; } -- cgit v1.2.3 From ee045493283403969591087bd405fa280103282a Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:13 +0800 Subject: RDMA/hns: Fix GMV table pagesize GMV's BA table only supports 4K pages. Currently, PAGESIZE is used to calculate gmv_bt_num, which will cause an abnormal number of gmv_bt_num in a 64K OS. 
Fixes: d6d91e46210f ("RDMA/hns: Add support for configuring GMV table") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-8-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2a97a81ae19f..89d0f5b8be75 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2101,7 +2101,7 @@ static void apply_func_caps(struct hns_roce_dev *hr_dev) caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz)); - caps->gmv_entry_num = caps->gmv_bt_num * (PAGE_SIZE / + caps->gmv_entry_num = caps->gmv_bt_num * (HNS_HW_PAGE_SIZE / caps->gmv_entry_sz); } else { u32 func_num = max_t(u32, 1, hr_dev->func_num); -- cgit v1.2.3 From 9a84848dcee289966e8a2c21223bb0d7bc44f201 Mon Sep 17 00:00:00 2001 From: wenglianfa Date: Fri, 12 Apr 2024 17:16:14 +0800 Subject: RDMA/hns: Add mutex_destroy() Add mutex_destroy(). 
Signed-off-by: wenglianfa Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-9-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_cq.c | 1 + drivers/infiniband/hw/hns/hns_roce_hem.c | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++++- drivers/infiniband/hw/hns/hns_roce_main.c | 24 ++++++++++++++++++++++-- drivers/infiniband/hw/hns/hns_roce_qp.c | 9 +++++++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 ++ 6 files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 68e22f368d43..56dc3908da2f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -536,4 +536,5 @@ void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) ida_destroy(&hr_dev->cq_table.bank[i].ida); + mutex_destroy(&hr_dev->cq_table.bank_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 1c2ec803e030..02baa853a76c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -877,6 +877,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); + mutex_destroy(&table->mutex); return; } @@ -891,6 +892,7 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, hns_roce_free_hem(hr_dev, table->hem[i]); } + mutex_destroy(&table->mutex); kfree(table->hem); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 89d0f5b8be75..5d526b5c4b81 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2667,6 +2667,8 @@ static void free_mr_exit(struct hns_roce_dev *hr_dev) kfree(free_mr->rsv_pd); free_mr->rsv_pd = NULL; } + + 
mutex_destroy(&free_mr->mutex); } static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) @@ -2817,8 +2819,10 @@ static int free_mr_init(struct hns_roce_dev *hr_dev) mutex_init(&free_mr->mutex); ret = free_mr_alloc_res(hr_dev); - if (ret) + if (ret) { + mutex_destroy(&free_mr->mutex); return ret; + } ret = free_mr_modify_qp(hr_dev); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d202258368ed..4cb0af733587 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -429,6 +429,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, return 0; error_fail_copy_to_udata: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); hns_roce_dealloc_uar_entry(context); error_fail_uar_entry: @@ -445,6 +448,10 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&context->page_mutex); + hns_roce_dealloc_uar_entry(context); ida_free(&hr_dev->uar_ida.ida, (int)context->uar.logic_idx); @@ -933,6 +940,15 @@ err_unmap_dmpt: return ret; } +static void hns_roce_teardown_hca(struct hns_roce_dev *hr_dev) +{ + hns_roce_cleanup_bitmap(hr_dev); + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); +} + /** * hns_roce_setup_hca - setup host channel adapter * @hr_dev: pointer to hns roce device @@ -981,6 +997,10 @@ static int hns_roce_setup_hca(struct hns_roce_dev *hr_dev) err_uar_table_free: ida_destroy(&hr_dev->uar_ida.ida); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || + 
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) + mutex_destroy(&hr_dev->pgdir_mutex); + return ret; } @@ -1126,7 +1146,7 @@ error_failed_register_device: hr_dev->hw->hw_exit(hr_dev); error_failed_engine_init: - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); error_failed_setup_hca: hns_roce_cleanup_hem(hr_dev); @@ -1156,7 +1176,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); - hns_roce_cleanup_bitmap(hr_dev); + hns_roce_teardown_hca(hr_dev); hns_roce_cleanup_hem(hr_dev); if (hr_dev->cmd_mod) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index dc3cb26f434e..db34665d1dfb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1140,7 +1140,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd); if (ret) { ibdev_err(ibdev, "failed to set QP param, ret = %d.\n", ret); - return ret; + goto err_out; } if (!udata) { @@ -1148,7 +1148,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (ret) { ibdev_err(ibdev, "failed to alloc wrid, ret = %d.\n", ret); - return ret; + goto err_out; } } @@ -1219,6 +1219,8 @@ err_qpn: free_qp_buf(hr_dev, hr_qp); err_buf: free_kernel_wrid(hr_qp); +err_out: + mutex_destroy(&hr_qp->mutex); return ret; } @@ -1234,6 +1236,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, free_qp_buf(hr_dev, hr_qp); free_kernel_wrid(hr_qp); free_qp_db(hr_dev, hr_qp, udata); + mutex_destroy(&hr_qp->mutex); } static int check_qp_type(struct hns_roce_dev *hr_dev, enum ib_qp_type type, @@ -1573,5 +1576,7 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); + mutex_destroy(&hr_dev->qp_table.bank_mutex); + mutex_destroy(&hr_dev->qp_table.scc_mutex); 
kfree(hr_dev->qp_table.idx_table.spare_idx); } diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 7210e53a82f3..f1997abc97ca 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -518,6 +518,7 @@ err_srq_db: err_srq_buf: free_srq_buf(hr_dev, srq); err_out: + mutex_destroy(&srq->mutex); atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_SRQ_CREATE_ERR_CNT]); return ret; @@ -532,6 +533,7 @@ int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) free_srqn(hr_dev, srq); free_srq_db(hr_dev, srq, udata); free_srq_buf(hr_dev, srq); + mutex_destroy(&srq->mutex); return 0; } -- cgit v1.2.3 From 4125269bb9b22e1d8cdf4412c81be8074dbc61ca Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:15 +0800 Subject: RDMA/hns: Use complete parentheses in macros Use complete parentheses to ensure that macro expansion does not produce unexpected results. Fixes: a25d13cbe816 ("RDMA/hns: Add the interfaces to support multi hop addressing for the contexts in hip08") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-10-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hem.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 6fb51db9682b..9c415b2541af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -57,16 +57,16 @@ enum { }; #define check_whether_bt_num_3(type, hop_num) \ - (type < HEM_TYPE_MTT && hop_num == 2) + ((type) < HEM_TYPE_MTT && (hop_num) == 2) #define check_whether_bt_num_2(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == 2)) + (((type) < HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 2)) 
#define check_whether_bt_num_1(type, hop_num) \ - ((type < HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0) || \ - (type >= HEM_TYPE_MTT && hop_num == 1) || \ - (type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0)) + (((type) < HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == 1) || \ + ((type) >= HEM_TYPE_MTT && (hop_num) == HNS_ROCE_HOP_NUM_0)) struct hns_roce_hem { void *buf; -- cgit v1.2.3 From 349e859952285ab9689779fb46de163f13f18f43 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 Apr 2024 17:16:16 +0800 Subject: RDMA/hns: Modify the print level of CQE error Excessive printing may lead to a kernel panic. Change ibdev_err() to ibdev_err_ratelimited(), and change the printing level of cqe dump to debug level. Fixes: 7c044adca272 ("RDMA/hns: Simplify the cqe code of poll cq") Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Link: https://lore.kernel.org/r/20240412091616.370789-11-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5d526b5c4b81..4287818a737f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3713,8 +3713,9 @@ static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, wc->status == IB_WC_WR_FLUSH_ERR)) return; - ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_NONE, 16, 4, cqe, + ibdev_err_ratelimited(&hr_dev->ib_dev, "error cqe status 0x%x:\n", + cqe_status); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 4, cqe, cq->cqe_size, false); wc->vendor_err = hr_reg_read(cqe, CQE_SUB_STATUS); -- cgit v1.2.3 From ca0b44e20a6f3032224599f02e7c8fb49525c894 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 16 Apr 2024
15:01:44 +0300 Subject: IB/core: Implement a limit on UMAD receive List The existing behavior of ib_umad, which maintains received MAD packets in an unbounded list, poses a risk of uncontrolled growth. As user-space applications extract packets from this list, the rate of extraction may not match the rate of incoming packets, leading to potential list overflow. To address this, we introduce a limit to the size of the list. After considering typical scenarios, such as OpenSM processing, which can handle approximately 100k packets per second, and the 1-second retry timeout for most packets, we set the list size limit to 200k. Packets received beyond this limit are dropped, assuming they are likely timed out by the time they are handled by user-space. Notably, packets queued on the receive list due to reasons like timed-out sends are preserved even when the list is full. Signed-off-by: Michael Guralnik Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/7197cb58a7d9e78399008f25036205ceab07fbd5.1713268818.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/user_mad.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f5feca7fa9b9..2ed749f50a29 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -63,6 +63,8 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); +#define MAX_UMAD_RECV_LIST_SIZE 200000 + enum { IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, @@ -113,6 +115,7 @@ struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct list_head recv_list; + atomic_t recv_list_size; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; @@ -180,24 +183,28 @@ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) return file->agents_dead ? 
NULL : file->agent[id]; } -static int queue_packet(struct ib_umad_file *file, - struct ib_mad_agent *agent, - struct ib_umad_packet *packet) +static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent, + struct ib_umad_packet *packet, bool is_recv_mad) { int ret = 1; mutex_lock(&file->mutex); + if (is_recv_mad && + atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE) + goto unlock; + for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); wake_up_interruptible(&file->recv_wait); ret = 0; break; } - +unlock: mutex_unlock(&file->mutex); return ret; @@ -224,7 +231,7 @@ static void send_handler(struct ib_mad_agent *agent, if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { packet->length = IB_MGMT_MAD_HDR; packet->mad.hdr.status = ETIMEDOUT; - if (!queue_packet(file, agent, packet)) + if (!queue_packet(file, agent, packet, false)) return; } kfree(packet); @@ -284,7 +291,7 @@ static void recv_handler(struct ib_mad_agent *agent, rdma_destroy_ah_attr(&ah_attr); } - if (queue_packet(file, agent, packet)) + if (queue_packet(file, agent, packet, true)) goto err2; return; @@ -409,6 +416,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); + atomic_dec(&file->recv_list_size); mutex_unlock(&file->mutex); @@ -421,6 +429,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, /* Requeue packet */ mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); mutex_unlock(&file->mutex); } else { if (packet->recv_wc) -- cgit v1.2.3 From 2b23b6097303ed0ba5f4bc036a1c07b6027af5c6 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:04 -0500 Subject: RDMA/rxe: Fix seg fault in rxe_comp_queue_pkt In 
rxe_comp_queue_pkt() an incoming response packet skb is enqueued to the resp_pkts queue and then a decision is made whether to run the completer task inline or schedule it. Finally the skb is dereferenced to bump a 'hw' performance counter. This is wrong because if the completer task is already running in a separate thread it may have already processed the skb and freed it which can cause a seg fault. This has been observed infrequently in testing at high scale. This patch fixes this by changing the order of enqueuing the packet until after the counter is accessed. Link: https://lore.kernel.org/r/20240329145513.35381-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats") Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index b78b8c0856ab..c997b7cbf2a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -131,12 +131,12 @@ void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { int must_sched; - skb_queue_tail(&qp->resp_pkts, skb); - - must_sched = skb_queue_len(&qp->resp_pkts) > 1; + must_sched = skb_queue_len(&qp->resp_pkts) > 0; if (must_sched != 0) rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); + skb_queue_tail(&qp->resp_pkts, skb); + if (must_sched) rxe_sched_task(&qp->comp.task); else -- cgit v1.2.3 From b703374837a8f8422fa3f1edcf65505421a65a6a Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:05 -0500 Subject: RDMA/rxe: Allow good work requests to be executed A previous commit incorrectly added an 'if(!err)' before scheduling the requester task in rxe_post_send_kernel(). But if there were send wrs successfully added to the send queue before a bad wr they might never get executed. 
This commit fixes this by scheduling the requester task if any wqes were successfully posted in rxe_post_send_kernel() in rxe_verbs.c. Link: https://lore.kernel.org/r/20240329145513.35381-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Fixes: 5bf944f24129 ("RDMA/rxe: Add error messages") Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 614581989b38..a49784e5156c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -888,6 +888,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, { int err = 0; unsigned long flags; + int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { @@ -895,12 +896,15 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (err) { *bad_wr = ibwr; break; + } else { + good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); - if (!err) + /* kickoff processing of any posted wqes */ + if (good) rxe_sched_task(&qp->req.task); spin_lock_irqsave(&qp->state_lock, flags); -- cgit v1.2.3 From ff30e45376d2ea68e032e6430babc0df15c4fc39 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:06 -0500 Subject: RDMA/rxe: Remove redundant scheduling of rxe_completer In rxe_post_send_kernel() if the qp is in the error state after posting the work requests the rxe_completer() task is scheduled. But, the only way to move the qp into the error state is to call rxe_qp_error() which also schedules the rxe_completer() task to drain the queues. Calling it a second time has no effect. This commit removes the redundant call. 
Link: https://lore.kernel.org/r/20240329145513.35381-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index a49784e5156c..71b0f834030f 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -907,11 +907,6 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (good) rxe_sched_task(&qp->req.task); - spin_lock_irqsave(&qp->state_lock, flags); - if (qp_state(qp) == IB_QPS_ERR) - rxe_sched_task(&qp->comp.task); - spin_unlock_irqrestore(&qp->state_lock, flags); - return err; } -- cgit v1.2.3 From 67f57892f9b2c93a3a020109d2285232fbde8b81 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:07 -0500 Subject: RDMA/rxe: Merge request and complete tasks Currently the rxe driver has three work queue tasks per qp. These are the req.task, comp.task and resp.task which call rxe_requester(), rxe_completer() and rxe_responder() respectively directly or on work queues. Each of these subroutines checks to see if there is work to be performed on the send queue or on the response packet queue or the request packet queue and will run until there is no work remaining or yield the cpu and reschedule itself until there is no work remaining. This commit combines the req.task and comp.task into a single send.task and renames the resp.task to the recv.task. The combined send.task calls rxe_requester() and rxe_completer() serially and continues until all work on both the send queue and the response packet queue are done. In various benchmarks the performance is either improved or left the same. At high scale there is a significant reduction in the load on the cpu. This is the first step in combining these two tasks. 
Once they are serialized cross rescheduling of req.task and comp.task can be more efficiently handled by just letting the send.task continue to run. This will be done in the next several patches. Link: https://lore.kernel.org/r/20240329145513.35381-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 20 ++++++------- drivers/infiniband/sw/rxe/rxe_hw_counters.c | 2 +- drivers/infiniband/sw/rxe/rxe_hw_counters.h | 2 +- drivers/infiniband/sw/rxe/rxe_loc.h | 3 +- drivers/infiniband/sw/rxe/rxe_net.c | 4 +-- drivers/infiniband/sw/rxe/rxe_qp.c | 44 +++++++++++------------------ drivers/infiniband/sw/rxe/rxe_req.c | 25 +++++++++++++--- drivers/infiniband/sw/rxe/rxe_resp.c | 6 ++-- drivers/infiniband/sw/rxe/rxe_verbs.c | 6 ++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 ++-- 10 files changed, 63 insertions(+), 55 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index c997b7cbf2a9..ea64a25fe876 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -122,7 +122,7 @@ void retransmit_timer(struct timer_list *t) spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -133,14 +133,14 @@ void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) must_sched = skb_queue_len(&qp->resp_pkts) > 0; if (must_sched != 0) - rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED); skb_queue_tail(&qp->resp_pkts, skb); if (must_sched) - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); else - rxe_run_task(&qp->comp.task); + rxe_run_task(&qp->send_task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, @@ -325,7 +325,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, 
qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } } return COMPST_ERROR_RETRY; @@ -476,7 +476,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } } @@ -515,7 +515,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } } @@ -541,7 +541,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } } @@ -737,7 +737,7 @@ int rxe_completer(struct rxe_qp *qp) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } state = COMPST_DONE; @@ -792,7 +792,7 @@ int rxe_completer(struct rxe_qp *qp) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } goto done; diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c index a012522b577a..437917a7d8f2 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -14,7 +14,7 @@ static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", - [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred", + [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h 
b/drivers/infiniband/sw/rxe/rxe_hw_counters.h index 71f4d4fa9dc8..051f9e1c3852 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -18,7 +18,7 @@ enum rxe_counters { RXE_CNT_RCV_RNR, RXE_CNT_SND_RNR, RXE_CNT_RCV_SEQ_ERR, - RXE_CNT_COMPLETER_SCHED, + RXE_CNT_SENDER_SCHED, RXE_CNT_RETRY_EXCEEDED, RXE_CNT_RNR_RETRY_EXCEEDED, RXE_CNT_COMP_RETRY, diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 746110898a0e..ded46119151b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -164,7 +164,8 @@ void rxe_dealloc(struct ib_device *ib_dev); int rxe_completer(struct rxe_qp *qp); int rxe_requester(struct rxe_qp *qp); -int rxe_responder(struct rxe_qp *qp); +int rxe_sender(struct rxe_qp *qp); +int rxe_receiver(struct rxe_qp *qp); /* rxe_icrc.c */ int rxe_icrc_init(struct rxe_dev *rxe); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index cd59666158b1..928508558df4 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -351,7 +351,7 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) if (unlikely(qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); rxe_put(qp); } @@ -443,7 +443,7 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, if ((qp_type(qp) != IB_QPT_RC) && (pkt->mask & RXE_END_MASK)) { pkt->wqe->state = wqe_state_done; - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index e3589c02013e..c7d99063594b 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -265,8 +265,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - 
rxe_init_task(&qp->req.task, qp, rxe_requester); - rxe_init_task(&qp->comp.task, qp, rxe_completer); + rxe_init_task(&qp->send_task, qp, rxe_sender); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { @@ -337,7 +336,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, return err; } - rxe_init_task(&qp->resp.task, qp, rxe_responder); + rxe_init_task(&qp->recv_task, qp, rxe_receiver); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; @@ -514,14 +513,12 @@ err1: static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ - rxe_disable_task(&qp->resp.task); - rxe_disable_task(&qp->comp.task); - rxe_disable_task(&qp->req.task); + rxe_disable_task(&qp->recv_task); + rxe_disable_task(&qp->send_task); /* drain work and packet queuesc */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->rq.queue) rxe_queue_reset(qp->rq.queue); @@ -548,9 +545,8 @@ static void rxe_qp_reset(struct rxe_qp *qp) cleanup_rd_atomic_resources(qp); /* reenable tasks */ - rxe_enable_task(&qp->resp.task); - rxe_enable_task(&qp->comp.task); - rxe_enable_task(&qp->req.task); + rxe_enable_task(&qp->recv_task); + rxe_enable_task(&qp->send_task); } /* move the qp to the error state */ @@ -562,9 +558,8 @@ void rxe_qp_error(struct rxe_qp *qp) qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ - rxe_sched_task(&qp->resp.task); - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->recv_task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -575,8 +570,7 @@ static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -821,19 +815,15 @@ static void 
rxe_qp_do_cleanup(struct work_struct *work) del_timer_sync(&qp->rnr_nak_timer); } - if (qp->resp.task.func) - rxe_cleanup_task(&qp->resp.task); + if (qp->recv_task.func) + rxe_cleanup_task(&qp->recv_task); - if (qp->req.task.func) - rxe_cleanup_task(&qp->req.task); - - if (qp->comp.task.func) - rxe_cleanup_task(&qp->comp.task); + if (qp->send_task.func) + rxe_cleanup_task(&qp->send_task); /* flush out any receive wr's or pending requests */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index d8c41fd626a9..31a611ced3c5 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -108,7 +108,7 @@ void rnr_nak_timer(struct timer_list *t) /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -659,7 +659,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) * which can lead to a deadlock. So go ahead and complete * it now. 
*/ - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); return 0; } @@ -786,7 +786,7 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); goto done; } payload = mtu; @@ -855,7 +855,7 @@ int rxe_requester(struct rxe_qp *qp) */ qp->need_req_skb = 1; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); goto exit; } @@ -878,3 +878,20 @@ exit: out: return ret; } + +int rxe_sender(struct rxe_qp *qp) +{ + int req_ret; + int comp_ret; + + /* process the send queue */ + req_ret = rxe_requester(qp); + + /* process the response queue */ + comp_ret = rxe_completer(qp); + + /* exit the task loop if both requester and completer + * are ready + */ + return (req_ret && comp_ret) ? -EAGAIN : 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 963382f625d7..3ce7a32b5dcf 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -58,9 +58,9 @@ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) (skb_queue_len(&qp->req_pkts) > 1); if (must_sched) - rxe_sched_task(&qp->resp.task); + rxe_sched_task(&qp->recv_task); else - rxe_run_task(&qp->resp.task); + rxe_run_task(&qp->recv_task); } static inline enum resp_states get_req(struct rxe_qp *qp, @@ -1485,7 +1485,7 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) qp->resp.wqe = NULL; } -int rxe_responder(struct rxe_qp *qp) +int rxe_receiver(struct rxe_qp *qp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); enum resp_states state; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 71b0f834030f..d07f7bd3b2ae 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -905,7 +905,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, /* kickoff processing of any posted wqes */ if (good) - 
rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); return err; } @@ -935,7 +935,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->req.task); + rxe_run_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) @@ -1045,7 +1045,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) - rxe_sched_task(&qp->resp.task); + rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index ccb9d19ffe8a..af8939b8c7a1 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -113,7 +113,6 @@ struct rxe_req_info { int need_retry; int wait_for_rnr_timer; int noack_pkts; - struct rxe_task task; }; struct rxe_comp_info { @@ -124,7 +123,6 @@ struct rxe_comp_info { int started_retry; u32 retry_cnt; u32 rnr_retry; - struct rxe_task task; }; enum rdatm_res_state { @@ -196,7 +194,6 @@ struct rxe_resp_info { unsigned int res_head; unsigned int res_tail; struct resp_res *res; - struct rxe_task task; }; struct rxe_qp { @@ -229,6 +226,9 @@ struct rxe_qp { struct sk_buff_head req_pkts; struct sk_buff_head resp_pkts; + struct rxe_task send_task; + struct rxe_task recv_task; + struct rxe_req_info req; struct rxe_comp_info comp; struct rxe_resp_info resp; -- cgit v1.2.3 From cd8aaddf0d6dbd4798d2de2f4e1cd62a91ac62f0 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:08 -0500 Subject: RDMA/rxe: Remove save/rollback_state in rxe_requester Now that req.task and comp.task are merged it is no longer necessary to call save_state() before calling rxe_xmit_pkt() and rollback_state() if rxe_xmit_pkt() fails. 
This was done originally to prevent races between rxe_completer() and rxe_requester() which now cannot happen. Link: https://lore.kernel.org/r/20240329145513.35381-8-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 40 ++----------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 31a611ced3c5..e20462c3040d 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -573,30 +573,6 @@ static void update_wqe_psn(struct rxe_qp *qp, qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; } -static void save_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 *rollback_psn) -{ - rollback_wqe->state = wqe->state; - rollback_wqe->first_psn = wqe->first_psn; - rollback_wqe->last_psn = wqe->last_psn; - rollback_wqe->dma = wqe->dma; - *rollback_psn = qp->req.psn; -} - -static void rollback_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 rollback_psn) -{ - wqe->state = rollback_wqe->state; - wqe->first_psn = rollback_wqe->first_psn; - wqe->last_psn = rollback_wqe->last_psn; - wqe->dma = rollback_wqe->dma; - qp->req.psn = rollback_psn; -} - static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { qp->req.opcode = pkt->opcode; @@ -676,8 +652,6 @@ int rxe_requester(struct rxe_qp *qp) int opcode; int err; int ret; - struct rxe_send_wqe rollback_wqe; - u32 rollback_psn; struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; @@ -799,9 +773,6 @@ int rxe_requester(struct rxe_qp *qp) pkt.mask = rxe_opcode[opcode].mask; pkt.wqe = wqe; - /* save wqe state before we build and send packet */ - save_state(wqe, qp, &rollback_wqe, &rollback_psn); - av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { rxe_dbg_qp(qp, "Failed no address vector\n"); @@ -834,10 
+805,6 @@ int rxe_requester(struct rxe_qp *qp) if (ah) rxe_put(ah); - /* update wqe state as though we had sent it */ - update_wqe_state(qp, wqe, &pkt); - update_wqe_psn(qp, wqe, &pkt, payload); - err = rxe_xmit_packet(qp, &pkt, skb); if (err) { if (err != -EAGAIN) { @@ -845,11 +812,6 @@ int rxe_requester(struct rxe_qp *qp) goto err; } - /* the packet was dropped so reset wqe to the state - * before we sent it so we can try to resend - */ - rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - /* force a delay until the dropped packet is freed and * the send queue is drained below the low water mark */ @@ -859,6 +821,8 @@ int rxe_requester(struct rxe_qp *qp) goto exit; } + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); update_state(qp, &pkt); /* A non-zero return value will cause rxe_do_task to -- cgit v1.2.3 From 4891f4fed04718a642ff4a4563128699c47d8918 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:09 -0500 Subject: RDMA/rxe: Don't schedule rxe_completer from rxe_requester Now that rxe_completer() is always called serially after rxe_requester() there is no reason to schedule rxe_completer() from rxe_requester(). 
Link: https://lore.kernel.org/r/20240329145513.35381-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 6 ------ drivers/infiniband/sw/rxe/rxe_req.c | 9 ++------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 928508558df4..a2fc118e7ec1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -440,12 +440,6 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, return err; } - if ((qp_type(qp) != IB_QPT_RC) && - (pkt->mask & RXE_END_MASK)) { - pkt->wqe->state = wqe_state_done; - rxe_sched_task(&qp->send_task); - } - rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index e20462c3040d..34c55dee0774 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -545,6 +545,8 @@ static void update_wqe_state(struct rxe_qp *qp, if (pkt->mask & RXE_END_MASK) { if (qp_type(qp) == IB_QPT_RC) wqe->state = wqe_state_pending; + else + wqe->state = wqe_state_done; } else { wqe->state = wqe_state_processing; } @@ -631,12 +633,6 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) wqe->status = IB_WC_SUCCESS; qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); - /* There is no ack coming for local work requests - * which can lead to a deadlock. So go ahead and complete - * it now. 
- */ - rxe_sched_task(&qp->send_task); - return 0; } @@ -760,7 +756,6 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_sched_task(&qp->send_task); goto done; } payload = mtu; -- cgit v1.2.3 From 3d807a3ebc48a2e1685ebfa9d26ea2c9ceb9c53e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:10 -0500 Subject: RDMA/rxe: Don't call rxe_requester from rxe_completer Instead of rescheduling rxe_requester from rxe_completer() just extend the duration of rxe_sender() by one pass. Setting run_requester_again forces rxe_completer() to return 0 which will cause rxe_sender() to be called at least one more time. Link: https://lore.kernel.org/r/20240329145513.35381-10-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 17 ++++++++++------- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index ea64a25fe876..357c1d516efb 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -325,7 +325,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->send_task); + qp->req.again = 1; } } return COMPST_ERROR_RETRY; @@ -476,7 +476,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_sched_task(&qp->send_task); + qp->req.again = 1; } } @@ -515,7 +515,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_sched_task(&qp->send_task); + qp->req.again = 1; } } @@ -541,7 +541,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; 
- rxe_sched_task(&qp->send_task); + qp->req.again = 1; } } @@ -654,6 +654,8 @@ int rxe_completer(struct rxe_qp *qp) int ret; unsigned long flags; + qp->req.again = 0; + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { @@ -737,7 +739,7 @@ int rxe_completer(struct rxe_qp *qp) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->send_task); + qp->req.again = 1; } state = COMPST_DONE; @@ -792,7 +794,7 @@ int rxe_completer(struct rxe_qp *qp) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_sched_task(&qp->send_task); + qp->req.again = 1; } goto done; @@ -843,8 +845,9 @@ done: ret = 0; goto out; exit: - ret = -EAGAIN; + ret = (qp->req.again) ? 0 : -EAGAIN; out: + qp->req.again = 0; if (pkt) free_pkt(pkt); return ret; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index af8939b8c7a1..3c1354f82283 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -113,6 +113,7 @@ struct rxe_req_info { int need_retry; int wait_for_rnr_timer; int noack_pkts; + int again; }; struct rxe_comp_info { -- cgit v1.2.3 From 23bc06af547f2ca3b7d345e09fd8d04575406274 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:11 -0500 Subject: RDMA/rxe: Don't call direct between tasks Replace calls to rxe_run_task() with rxe_sched_task(). This prevents the tasks from all running on the same cpu. This change slightly reduces performance for single qp send and write benchmarks in loopback mode but greatly improves the performance with multiple qps because if run task is used all the work tends to be performed on one cpu. For actual on the wire benchmarks there is no noticeable performance change. 
Link: https://lore.kernel.org/r/20240329145513.35381-11-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 13 ++----------- drivers/infiniband/sw/rxe/rxe_resp.c | 12 +----------- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 357c1d516efb..d48af2180745 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -129,18 +129,9 @@ void retransmit_timer(struct timer_list *t) void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - - must_sched = skb_queue_len(&qp->resp_pkts) > 0; - if (must_sched != 0) - rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED); - + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED); skb_queue_tail(&qp->resp_pkts, skb); - - if (must_sched) - rxe_sched_task(&qp->send_task); - else - rxe_run_task(&qp->send_task); + rxe_sched_task(&qp->send_task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 3ce7a32b5dcf..c6a7fa3054fa 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -49,18 +49,8 @@ static char *resp_state_name[] = { /* rxe_recv calls here to add a request packet to the input queue */ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); - skb_queue_tail(&qp->req_pkts, skb); - - must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || - (skb_queue_len(&qp->req_pkts) > 1); - - if (must_sched) - rxe_sched_task(&qp->recv_task); - else - rxe_run_task(&qp->recv_task); + rxe_sched_task(&qp->recv_task); } static inline enum resp_states get_req(struct rxe_qp *qp, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c 
b/drivers/infiniband/sw/rxe/rxe_verbs.c index d07f7bd3b2ae..c7d4d8ab5a09 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -935,7 +935,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->send_task); + rxe_sched_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) -- cgit v1.2.3 From 8776618dbbd1b6f210b31509507e1aad461d6435 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:12 -0500 Subject: RDMA/rxe: Fix incorrect rxe_put in error path In rxe_send() a ref is taken on the qp to keep it alive until the kfree_skb() has a chance to call the skb destructor rxe_skb_tx_dtor() which drops the reference. If the packet has an incorrect protocol the error path just calls kfree_skb() which will call the destructor which will drop the ref. Currently the driver also calls rxe_put() which is incorrect. Additionally since the packets sent to rxe_send() are under the control of the driver and it only ever produces IPV4 or IPV6 packets the simplest fix is to remove all the code in this block. 
Link: https://lore.kernel.org/r/20240329145513.35381-12-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Fixes: 9eb7f8e44d13 ("IB/rxe: Move refcounting earlier in rxe_send()") Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a2fc118e7ec1..d81440038f91 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -366,18 +366,10 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else if (skb->protocol == htons(ETH_P_IPV6)) { + else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else { - rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", - skb->protocol); - atomic_dec(&pkt->qp->skb_out); - rxe_put(pkt->qp); - kfree_skb(skb); - return -EINVAL; - } if (unlikely(net_xmit_eval(err))) { rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); -- cgit v1.2.3 From 55bec1c440e6852e907c47cd33fbbf63fcc5f1ba Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:13 -0500 Subject: RDMA/rxe: Make rxe_loopback match rxe_send behavior The rxe send path currently counts the number of skbs outstanding between the rxe driver and the ethernet driver to prevent too many packets to accumulate waiting to send. This patch makes the local loopback path behave the same way. The loopback path forwards the packets to the receive path which will eventually call kfree_skb on all packets and drop the qp references. This makes the loopback path more useful for software testing. 
Link: https://lore.kernel.org/r/20240329145513.35381-13-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index d81440038f91..d081409450a4 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -386,6 +386,12 @@ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; + + rxe_get(pkt->qp); + atomic_inc(&pkt->qp->skb_out); + if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else -- cgit v1.2.3 From 9cc6290991e6cfc9a6447823275fa4ba4d902103 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:14 -0500 Subject: RDMA/rxe: Get rid of pkt resend on err Currently the rxe_driver detects packet drops by ip_local_out() which occur before the packet is sent on the wire and attempts to resend them. This is redundant with the usual retry mechanism which covers packets that get dropped in transit to or from the remote node. The way this is implemented is not robust since it sets need_req_skb and waits for the number of local skbs outstanding for this qp to drop below a low water mark. This is racy since the skb may be sent to the destructor before the requester can set the need_req_skb flag. This will cause a deadlock in the send path for that qp. This patch removes this mechanism since the normal retry path will correct the error and resend the packet and it makes no difference if the packet is dropped locally or later. 
Link: https://lore.kernel.org/r/20240329145513.35381-14-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 7 +------ drivers/infiniband/sw/rxe/rxe_req.c | 14 ++------------ 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index d081409450a4..b58eab75df97 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -371,12 +371,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - if (unlikely(net_xmit_eval(err))) { - rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); - return -EAGAIN; - } - - return 0; + return err; } /* fix up a send packet to match the packets diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 34c55dee0774..cd14c4c2dff9 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -802,18 +802,8 @@ int rxe_requester(struct rxe_qp *qp) err = rxe_xmit_packet(qp, &pkt, skb); if (err) { - if (err != -EAGAIN) { - wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; - } - - /* force a delay until the dropped packet is freed and - * the send queue is drained below the low water mark - */ - qp->need_req_skb = 1; - - rxe_sched_task(&qp->send_task); - goto exit; + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; } update_wqe_state(qp, wqe, &pkt); -- cgit v1.2.3 From 1a633bdc8fd9e9e4a9f9a668ae122edfc5aacc86 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 29 Mar 2024 09:55:15 -0500 Subject: RDMA/rxe: Let destroy qp succeed with stuck packet In some situations a sent packet may get queued in the NIC longer than the timeout of a ULP.
Currently if this happens the ULP may try to reset the link by destroying the qp and setting up an alternate connection but will fail because the rxe driver is waiting for the packet to finish getting sent and be returned to the skb destructor function where the qp reference holding things up will be dropped. This patch modifies the way that the qp is passed to the destructor to pass the qp index and not a qp pointer. Then the destructor will attempt to lookup the qp from its index and if it fails exit early. This requires taking a reference on the struct sock rather than the qp allowing the qp to be destroyed while the sk is still around waiting for the packet to finish. Link: https://lore.kernel.org/r/20240329145513.35381-15-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 42 +++++++++++++++++++++++++++---------- drivers/infiniband/sw/rxe/rxe_qp.c | 2 +- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index b58eab75df97..ca9a82e1c4c7 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -345,25 +345,44 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, static void rxe_skb_tx_dtor(struct sk_buff *skb) { - struct sock *sk = skb->sk; - struct rxe_qp *qp = sk->sk_user_data; - int skb_out = atomic_dec_return(&qp->skb_out); + struct net_device *ndev = skb->dev; + struct rxe_dev *rxe; + unsigned int qp_index; + struct rxe_qp *qp; + int skb_out; + + rxe = rxe_get_dev_from_net(ndev); + if (!rxe && is_vlan_dev(ndev)) + rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); + if (WARN_ON(!rxe)) + return; - if (unlikely(qp->need_req_skb && - skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + qp_index = (int)(uintptr_t)skb->sk->sk_user_data; + if (!qp_index) + return; + + qp = rxe_pool_get_index(&rxe->qp_pool, qp_index); + if (!qp) + goto put_dev; + + skb_out = 
atomic_dec_return(&qp->skb_out); + if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW) rxe_sched_task(&qp->send_task); rxe_put(qp); +put_dev: + ib_device_put(&rxe->ib_dev); + sock_put(skb->sk); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; + struct sock *sk = pkt->qp->sk->sk; + sock_hold(sk); + skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; - skb->sk = pkt->qp->sk->sk; - - rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) @@ -379,12 +398,13 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { + struct sock *sk = pkt->qp->sk->sk; + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + sock_hold(sk); + skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; - skb->sk = pkt->qp->sk->sk; - - rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c7d99063594b..d2f7b5195c19 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -244,7 +244,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; - qp->sk->sk->sk_user_data = qp; + qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index; /* pick a source UDP port number for this QP based on * the source QPN. this spreads traffic for different QPs -- cgit v1.2.3 From 20516d6e51dd9994afda8d556507cfbe7853384b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:14 -0300 Subject: x86: Stop using weak symbols for __iowrite32_copy() Start switching iomap_copy routines over to use #define and arch provided inline/macro functions instead of weak symbols. Inline functions allow more compiler optimization and this is often a driver hot path. 
x86 has the only weak implementation for __iowrite32_copy(), so replace it with a static inline containing the same single instruction inline assembly. The compiler will generate the "mov edx,ecx" in a more optimal way. Remove iomap_copy_64.S Link: https://lore.kernel.org/r/1-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- arch/x86/include/asm/io.h | 17 +++++++++++++++++ arch/x86/lib/Makefile | 1 - arch/x86/lib/iomap_copy_64.S | 15 --------------- include/linux/io.h | 5 ++++- lib/iomap_copy.c | 6 +++--- 5 files changed, 24 insertions(+), 20 deletions(-) delete mode 100644 arch/x86/lib/iomap_copy_64.S diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 294cd2a40818..4b99ed326b17 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io +#ifdef CONFIG_X86_64 +/* + * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for + * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy + * loop. + */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + asm volatile("rep ; movsl" + : "=&c"(count), "=&D"(to), "=&S"(from) + : "0"(count), "1"(to), "2"(from) + : "memory"); +} +#define __iowrite32_copy __iowrite32_copy +#endif + /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 6da73513f026..98583a9dbab3 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += atomic64_386_32.o endif else - obj-y += iomap_copy_64.o ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S deleted file mode 100644 index 6ff2f56cb0f7..000000000000 --- a/arch/x86/lib/iomap_copy_64.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2006 PathScale, Inc. All Rights Reserved. - */ - -#include - -/* - * override generic version in lib/iomap_copy.c - */ -SYM_FUNC_START(__iowrite32_copy) - movl %edx,%ecx - rep movsl - RET -SYM_FUNC_END(__iowrite32_copy) diff --git a/include/linux/io.h b/include/linux/io.h index 235ba7d80a8f..ce86120ce9d5 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -16,7 +16,10 @@ struct device; struct resource; -__visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +#ifndef __iowrite32_copy +void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +#endif + void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c index 5de7c04e05ef..8ddcbb53507d 100644 --- a/lib/iomap_copy.c +++ b/lib/iomap_copy.c @@ -16,9 +16,8 @@ * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. 
*/ -void __attribute__((weak)) __iowrite32_copy(void __iomem *to, - const void *from, - size_t count) +#ifndef __iowrite32_copy +void __iowrite32_copy(void __iomem *to, const void *from, size_t count) { u32 __iomem *dst = to; const u32 *src = from; @@ -28,6 +27,7 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to, __raw_writel(*src++, dst++); } EXPORT_SYMBOL_GPL(__iowrite32_copy); +#endif /** * __ioread32_copy - copy data from MMIO space, in 32-bit units -- cgit v1.2.3 From 6ae798cbef4ba1f180aa1a590e33a2d89f7cc34f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:15 -0300 Subject: s390: Implement __iowrite32_copy() It is trivial to implement an inline to do this, so provide it in the s390 headers. Like the 64 bit version it should just invoke zpci_memcpy_toio() with the correct size. Link: https://lore.kernel.org/r/2-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Niklas Schnelle Signed-off-by: Jason Gunthorpe --- arch/s390/include/asm/io.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 4453ad7c11ac..00704fc8a54b 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -73,6 +73,14 @@ static inline void ioport_unmap(void __iomem *p) #define __raw_writel zpci_write_u32 #define __raw_writeq zpci_write_u64 +/* combine single writes by using store-block insn */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 4); +} +#define __iowrite32_copy __iowrite32_copy + #endif /* CONFIG_PCI */ #include -- cgit v1.2.3 From e7bc47b16622d1016b3b77bbdb20fb9e213045f2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:16 -0300 Subject: s390: Stop using weak symbols for __iowrite64_copy() Complete switching the __iowriteXX_copy() routines over to use #define and arch provided inline/macro functions instead of weak symbols. 
S390 has an implementation that simply calls another memcpy function. Inline this so the callers don't have to do two jumps. Link: https://lore.kernel.org/r/3-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Niklas Schnelle Acked-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- arch/s390/include/asm/io.h | 7 +++++++ arch/s390/pci/pci.c | 6 ------ include/linux/io.h | 3 +++ lib/iomap_copy.c | 7 +++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 00704fc8a54b..0fbc992d7a5e 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -81,6 +81,13 @@ static inline void __iowrite32_copy(void __iomem *to, const void *from, } #define __iowrite32_copy __iowrite32_copy +static inline void __iowrite64_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 8); +} +#define __iowrite64_copy __iowrite64_copy + #endif /* CONFIG_PCI */ #include diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 26afde0d1ed3..0de0f6e405b5 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return 0; } -/* combine single writes by using store-block insn */ -void __iowrite64_copy(void __iomem *to, const void *from, size_t count) -{ - zpci_memcpy_toio(to, from, count * 8); -} - void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { diff --git a/include/linux/io.h b/include/linux/io.h index ce86120ce9d5..42e132808f00 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -21,7 +21,10 @@ void __iowrite32_copy(void __iomem *to, const void *from, size_t count); #endif void __ioread32_copy(void *to, const void __iomem *from, size_t count); + +#ifndef __iowrite64_copy void __iowrite64_copy(void __iomem *to, const void *from, size_t count); +#endif #ifdef CONFIG_MMU int ioremap_page_range(unsigned long addr, 
unsigned long end, diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c index 8ddcbb53507d..2fd5712fb7c0 100644 --- a/lib/iomap_copy.c +++ b/lib/iomap_copy.c @@ -60,9 +60,8 @@ EXPORT_SYMBOL_GPL(__ioread32_copy); * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. */ -void __attribute__((weak)) __iowrite64_copy(void __iomem *to, - const void *from, - size_t count) +#ifndef __iowrite64_copy +void __iowrite64_copy(void __iomem *to, const void *from, size_t count) { #ifdef CONFIG_64BIT u64 __iomem *dst = to; @@ -75,5 +74,5 @@ void __attribute__((weak)) __iowrite64_copy(void __iomem *to, __iowrite32_copy(to, from, count * 2); #endif } - EXPORT_SYMBOL_GPL(__iowrite64_copy); +#endif -- cgit v1.2.3 From ead79118dae6f9f982532002e82c2fb291ae0480 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:17 -0300 Subject: arm64/io: Provide a WC friendly __iowriteXX_copy() The kernel provides driver support for using write combining IO memory through the __iowriteXX_copy() API which is commonly used as an optional optimization to generate 16/32/64 byte MemWr TLPs in a PCIe environment. iomap_copy.c provides a generic implementation as a simple 4/8 byte at a time copy loop that has worked well with past ARM64 CPUs, giving a high frequency of large TLPs being successfully formed. However modern ARM64 CPUs are quite sensitive to how the write combining CPU HW is operated and a compiler generated loop with intermixed load/store is not sufficient to frequently generate a large TLP. The CPUs would like to see the entire TLP generated by consecutive store instructions from registers. Compilers like gcc tend to intermix loads and stores and have poor code generation, in part, due to the ARM64 situation that writeq() does not codegen anything other than "[xN]". However even with that resolved compilers like clang still do not have good code generation. 
This means on modern ARM64 CPUs the rate at which __iowriteXX_copy() successfully generates large TLPs is very small (less than 1 in 10,000 tries), to the point that the use of WC is pointless. Implement __iowrite32/64_copy() specifically for ARM64 and use inline assembly to build consecutive blocks of STR instructions. Provide direct support for 64/32/16 large TLP generation in this manner. Optimize for common constant lengths so that the compiler can directly inline the store blocks. This brings the frequency of large TLP generation up to a high level that is comparable with older CPU generations. As the __iowriteXX_copy() family of APIs is intended for use with WC, incorporate the DGH hint directly into the function. Link: https://lore.kernel.org/r/4-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Catalin Marinas Signed-off-by: Jason Gunthorpe --- arch/arm64/include/asm/io.h | 132 ++++++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/io.c | 42 ++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 8d825522c55c..4ff0ae3f6d66 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -139,6 +139,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) +/* + * The ARM64 iowrite implementation is intended to support drivers that want to + * use write combining. For instance PCI drivers using write combining with a 64 + * byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus. + * + * Newer ARM core have sensitive write combining buffers, it is important that + * the stores be contiguous blocks of store instructions.
Normal memcpy + * approaches have a very low chance to generate write combining. + * + * Since this is the only API on ARM64 that should be used with write combining + * it also integrates the DGH hint which is supposed to lower the latency to + * emit the large TLP from the CPU. + */ + +static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, + const u32 *from, size_t count) +{ + switch (count) { + case 8: + asm volatile("str %w0, [%8, #4 * 0]\n" + "str %w1, [%8, #4 * 1]\n" + "str %w2, [%8, #4 * 2]\n" + "str %w3, [%8, #4 * 3]\n" + "str %w4, [%8, #4 * 4]\n" + "str %w5, [%8, #4 * 5]\n" + "str %w6, [%8, #4 * 6]\n" + "str %w7, [%8, #4 * 7]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]), + "rZ"(from[6]), "rZ"(from[7]), "r"(to)); + break; + case 4: + asm volatile("str %w0, [%4, #4 * 0]\n" + "str %w1, [%4, #4 * 1]\n" + "str %w2, [%4, #4 * 2]\n" + "str %w3, [%4, #4 * 3]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "r"(to)); + break; + case 2: + asm volatile("str %w0, [%2, #4 * 0]\n" + "str %w1, [%2, #4 * 1]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "r"(to)); + break; + case 1: + __raw_writel(*from, to); + break; + default: + BUILD_BUG(); + } +} + +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count); + +static inline void __const_iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + if (count == 8 || count == 4 || count == 2 || count == 1) { + __const_memcpy_toio_aligned32(to, from, count); + dgh(); + } else { + __iowrite32_copy_full(to, from, count); + } +} + +#define __iowrite32_copy(to, from, count) \ + (__builtin_constant_p(count) ? 
\ + __const_iowrite32_copy(to, from, count) : \ + __iowrite32_copy_full(to, from, count)) + +static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, + const u64 *from, size_t count) +{ + switch (count) { + case 8: + asm volatile("str %x0, [%8, #8 * 0]\n" + "str %x1, [%8, #8 * 1]\n" + "str %x2, [%8, #8 * 2]\n" + "str %x3, [%8, #8 * 3]\n" + "str %x4, [%8, #8 * 4]\n" + "str %x5, [%8, #8 * 5]\n" + "str %x6, [%8, #8 * 6]\n" + "str %x7, [%8, #8 * 7]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]), + "rZ"(from[6]), "rZ"(from[7]), "r"(to)); + break; + case 4: + asm volatile("str %x0, [%4, #8 * 0]\n" + "str %x1, [%4, #8 * 1]\n" + "str %x2, [%4, #8 * 2]\n" + "str %x3, [%4, #8 * 3]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "r"(to)); + break; + case 2: + asm volatile("str %x0, [%2, #8 * 0]\n" + "str %x1, [%2, #8 * 1]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "r"(to)); + break; + case 1: + __raw_writeq(*from, to); + break; + default: + BUILD_BUG(); + } +} + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count); + +static inline void __const_iowrite64_copy(void __iomem *to, const void *from, + size_t count) +{ + if (count == 8 || count == 4 || count == 2 || count == 1) { + __const_memcpy_toio_aligned64(to, from, count); + dgh(); + } else { + __iowrite64_copy_full(to, from, count); + } +} + +#define __iowrite64_copy(to, from, count) \ + (__builtin_constant_p(count) ? \ + __const_iowrite64_copy(to, from, count) : \ + __iowrite64_copy_full(to, from, count)) + /* * I/O memory mapping functions. 
*/ diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index aa7a4ec6a3ae..ef48089fbfe1 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) } EXPORT_SYMBOL(__memcpy_fromio); +/* + * This generates a memcpy that works on a from/to address which is aligned to + * bits. Count is in terms of the number of bits sized quantities to copy. It + * optimizes to use the STR groupings when possible so that it is WC friendly. + */ +#define memcpy_toio_aligned(to, from, count, bits) \ + ({ \ + volatile u##bits __iomem *_to = to; \ + const u##bits *_from = from; \ + size_t _count = count; \ + const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \ + \ + for (; _from < _end_from; _from += 8, _to += 8) \ + __const_memcpy_toio_aligned##bits(_to, _from, 8); \ + if ((_count % 8) >= 4) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 4); \ + _from += 4; \ + _to += 4; \ + } \ + if ((_count % 4) >= 2) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 2); \ + _from += 2; \ + _to += 2; \ + } \ + if (_count % 2) \ + __const_memcpy_toio_aligned##bits(_to, _from, 1); \ + }) + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count) +{ + memcpy_toio_aligned(to, from, count, 64); + dgh(); +} +EXPORT_SYMBOL(__iowrite64_copy_full); + +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) +{ + memcpy_toio_aligned(to, from, count, 32); + dgh(); +} +EXPORT_SYMBOL(__iowrite32_copy_full); + /* * Copy data from "real" memory space to IO memory space. */ -- cgit v1.2.3 From 2b7a5e1fe02231acc5d50339b2f10833565ef559 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:18 -0300 Subject: net: hns3: Remove io_stop_wc() calls after __iowrite64_copy() Now that the ARM64 arch implementation does the DGH as part of __iowrite64_copy() there is no reason to open code this in drivers. 
Link: https://lore.kernel.org/r/5-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Reviewed-by: Jijie Shao Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 19668a8d22f7..04b9e86363f8 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2068,8 +2068,6 @@ static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) __iowrite64_copy(ring->tqp->mem_base, desc, (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / HNS3_BYTES_PER_64BIT); - - io_stop_wc(); } static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) @@ -2088,8 +2086,6 @@ static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) u64_stats_update_begin(&ring->syncp); ring->stats.tx_mem_doorbell += ring->pending_buf; u64_stats_update_end(&ring->syncp); - - io_stop_wc(); } static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, -- cgit v1.2.3 From ef302283ddfceaba2657923af3f90fd58e6dff06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:19 -0300 Subject: IB/mlx5: Use __iowrite64_copy() for write combining stores mlx5 has a built in self-test at driver startup to evaluate if the platform supports write combining to generate a 64 byte PCIe TLP or not. This has proven necessary because a lot of common scenarios end up with broken write combining (especially inside virtual machines) and there is no other way to learn this information. This self test has been consistently failing on new ARM64 CPU designs (specifically with NVIDIA Grace's implementation of Neoverse V2). The C loop around writeq() generates some pretty terrible ARM64 assembly, but historically this has worked on a lot of existing ARM64 CPUs till now. We see it succeed about 1 time in 10,000 on the worst affected systems.
The CPU architects speculate that the load instructions interspersed with the stores make the WC buffers statistically flush too often and thus the generation of large TLPs becomes infrequent. This makes the boot up test unreliable in that it indicates no write-combining, however userspace would be fine since it uses a ST4 instruction. Further, S390 has similar issues where only the special zpci_memcpy_toio() will actually generate large TLPs, and the open coded loop does not trigger it at all. Fix both ARM64 and S390 by switching to __iowrite64_copy() which now provides architecture specific variants that have a high chance of generating a large TLP with write combining. x86 continues to use a similar writeq loop in the generic __iowrite64_copy(). Fixes: 11f552e21755 ("IB/mlx5: Test write combining support") Link: https://lore.kernel.org/r/6-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Tested-by: Niklas Schnelle Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 96ffbbaf0a73..5a22be14d958 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -30,6 +30,7 @@ * SOFTWARE.
*/ +#include #include #include "mlx5_ib.h" #include @@ -108,7 +109,6 @@ static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, __be32 mmio_wqe[16] = {}; unsigned long flags; unsigned int idx; - int i; if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) return -EIO; @@ -148,10 +148,8 @@ static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, * we hit doorbell */ wmb(); - for (i = 0; i < 8; i++) - mlx5_write64(&mmio_wqe[i * 2], - bf->bfreg->map + bf->offset + i * 8); - io_stop_wc(); + __iowrite64_copy(bf->bfreg->map + bf->offset, mmio_wqe, + sizeof(mmio_wqe) / 8); bf->offset ^= bf->buf_size; -- cgit v1.2.3 From f88320b698ad099a2f742adfb9f87177bfffe0c5 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Tue, 23 Apr 2024 07:15:51 -0700 Subject: RDMA/mana_ib: Fix missing ret value Set ret to -ENODEV when netdev_master_upper_dev_get_rcu returns NULL. Fixes: 8b184e4f1c32 ("RDMA/mana_ib: Enable RoCE on port 1") Link: https://lore.kernel.org/r/1713881751-21621-1-git-send-email-kotaranov@linux.microsoft.com Signed-off-by: Konstantin Taranov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index fca4d0d85c64..7e09ceb3da53 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -87,6 +87,7 @@ static int mana_ib_probe(struct auxiliary_device *adev, upper_ndev = netdev_master_upper_dev_get_rcu(mc->ports[0]); if (!upper_ndev) { rcu_read_unlock(); + ret = -ENODEV; ibdev_err(&dev->ib_dev, "Failed to get master netdev"); goto free_ib_device; } -- cgit v1.2.3 From f847e840157b91a490a13df78c4a6d4e5700ba0a Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Thu, 25 Apr 2024 17:18:14 +0000 Subject: RDMA/efa: Add shutdown notifier Add driver function to stop the device and release any active IRQs as preparation for shutdown. 
This should fix issues caused by unexpected AQ interrupts when booting kernel using kexec and possible data integrity issues when the system is being shutdown during traffic. Link: https://lore.kernel.org/r/20240425171814.25216-1-mrgolin@amazon.com Reviewed-by: Firas Jahjah Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_main.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 5fa3603c80d8..d1a48f988f6c 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -671,11 +671,22 @@ static void efa_remove(struct pci_dev *pdev) efa_remove_device(pdev); } +static void efa_shutdown(struct pci_dev *pdev) +{ + struct efa_dev *dev = pci_get_drvdata(pdev); + + efa_destroy_eqs(dev); + efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_SHUTDOWN); + efa_free_irq(dev, &dev->admin_irq); + efa_disable_msix(dev); +} + static struct pci_driver efa_pci_driver = { .name = DRV_MODULE_NAME, .id_table = efa_pci_tbl, .probe = efa_probe, .remove = efa_remove, + .shutdown = efa_shutdown, }; module_pci_driver(efa_pci_driver); -- cgit v1.2.3 From e18fa0bbcedf82aaa1db27079ef6a43e11367592 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 16 Apr 2024 15:03:50 +0300 Subject: RDMA/core: Add an option to display driver-specific QPs in the rdmatool Utilize the -dd flag (driver-specific details) in the rdmatool to view driver-specific QPs which are not exposed yet. Add the netlink attribute to mark request to convey driver details and use it to return QP subtype as a string. 
$ rdma resource show qp link ibp8s0f1 link ibp8s0f1/1 lqpn 360 type UD state RTS sq-psn 0 comm [mlx5_ib] link ibp8s0f1/1 lqpn 0 type SMI state RTS sq-psn 0 comm [ib_core] link ibp8s0f1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core] $ rdma resource show qp link ibp8s0f1 -dd link ibp8s0f1/1 lqpn 360 type UD state RTS sq-psn 0 comm [mlx5_ib] link ibp8s0f1/1 lqpn 465 type DRIVER subtype REG_UMR state RTS sq-psn 0 comm [mlx5_ib] link ibp8s0f1/1 lqpn 0 type SMI state RTS sq-psn 0 comm [ib_core] link ibp8s0f1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core] $ rdma resource show 0: ibp8s0f0: pd 3 cq 4 qp 3 cm_id 0 mr 0 ctx 0 srq 2 1: ibp8s0f1: pd 3 cq 4 qp 3 cm_id 0 mr 0 ctx 0 srq 2 $ rdma resource show -dd 0: ibp8s0f0: pd 3 cq 4 qp 4 cm_id 0 mr 0 ctx 0 srq 2 1: ibp8s0f1: pd 3 cq 4 qp 4 cm_id 0 mr 0 ctx 0 srq 2 Signed-off-by: Chiara Meiohas Link: https://lore.kernel.org/r/2607bb3ddec3cae3443c2ea19e9f700825d20a98.1713268997.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 23 +++++++++++++++++++---- drivers/infiniband/core/restrack.c | 12 ++++++++++-- include/rdma/restrack.h | 7 +++++-- include/uapi/rdma/rdma_netlink.h | 6 ++++++ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 4900a0848124..bc79ee630d8d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -137,6 +137,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, @@ -164,6 +166,7 @@ static const struct nla_policy 
nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -399,7 +402,8 @@ err: return -EMSGSIZE; } -static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +static int fill_res_info(struct sk_buff *msg, struct ib_device *device, + bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", @@ -424,7 +428,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(device, i); + curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -1305,6 +1309,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; @@ -1320,6 +1325,9 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; @@ -1334,7 +1342,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto err_free; } - ret = fill_res_info(msg, device); + ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; @@ -1364,7 +1372,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); - if (!nlh || fill_res_info(skb, device)) { + if (!nlh || 
fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } @@ -1534,6 +1542,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; + bool show_details = false; struct nlattr *table_attr; struct nlattr *entry_attr; struct ib_device *device; @@ -1562,6 +1571,9 @@ static int res_get_common_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ @@ -1599,6 +1611,9 @@ static int res_get_common_dumpit(struct sk_buff *skb, * objects. */ xa_for_each(&rt->xa, id, res) { + if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) + goto next; + if (idx < start || !rdma_restrack_get(res)) goto next; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 438ed3588175..3313410014cd 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -59,8 +59,10 @@ void rdma_restrack_clean(struct ib_device *dev) * rdma_restrack_count() - the current usage of specific object * @dev: IB device * @type: actual type of object to operate + * @show_details: count driver specific objects */ -int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, + bool show_details) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; @@ -68,8 +70,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) u32 cnt = 0; xa_lock(&rt->xa); - xas_for_each(&xas, e, U32_MAX) + xas_for_each(&xas, e, U32_MAX) { + if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details) + continue; cnt++; + } xa_unlock(&rt->xa); return cnt; } @@ -198,6 +203,9 @@ void rdma_restrack_add(struct rdma_restrack_entry 
*res) ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); if (ret) res->id = 0; + + if (qp->qp_type >= IB_QPT_DRIVER) + xa_set_mark(&rt->xa, res->id, RESTRACK_DD); } else if (res->type == RDMA_RESTRACK_COUNTER) { /* Special case to ensure that cntn points to right counter */ struct rdma_counter *counter; diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 8b7c46daeb07..0d69ded73bf2 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -14,6 +14,9 @@ #include #include +/* Mark entry as containing driver specific details, it is used to provide QP subtype for now */ +#define RESTRACK_DD XA_MARK_1 + struct ib_device; struct sk_buff; @@ -116,8 +119,8 @@ struct rdma_restrack_entry { u32 id; }; -int rdma_restrack_count(struct ib_device *dev, - enum rdma_restrack_type type); +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, + bool show_details); /** * rdma_is_kernel_res() - check the owner of resource * @res: resource entry diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 723bbb0f7042..a214fc259f28 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -558,6 +558,12 @@ enum rdma_nldev_attr { RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, /* u8 */ + RDMA_NLDEV_ATTR_DRIVER_DETAILS, /* u8 */ + /* + * QP subtype string, used for driver QPs + */ + RDMA_NLDEV_ATTR_RES_SUBTYPE, /* string */ + /* * Always the end */ -- cgit v1.2.3 From fd3af5e21866b776713b8c60556153d758995fb7 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 16 Apr 2024 15:03:51 +0300 Subject: RDMA/mlx5: Track DCT, DCI and REG_UMR QPs as driver_detail resources. Allow user to see driver-specific QPs (the "driver_detail" QPs) through the rdmatool, when requested. When creating DCT, DCI and REG_UMR QPs, we designate them as driver_detail resources.
When filling the QP info for the rdma tool, for the driver_detail QPs: -the QP type is IB_QPT_DRIVER -the subtype is a string with the QP name ("DCT", "DCI", "REG_UMR") Signed-off-by: Chiara Meiohas Link: https://lore.kernel.org/r/452432d7d0917f053a80a893a614169857fe3b10.1713268997.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 3 +-- drivers/infiniband/hw/mlx5/restrack.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8115ab107149..e2164f813607 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3097,7 +3097,6 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, switch (qp->type) { case MLX5_IB_QPT_DCT: err = create_dct(dev, pd, qp, params); - rdma_restrack_no_track(&qp->ibqp.res); break; case MLX5_IB_QPT_DCI: err = create_dci(dev, pd, qp, params); @@ -3109,9 +3108,9 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = mlx5_ib_create_gsi(pd, qp, params->attr); break; case MLX5_IB_QPT_HW_GSI: - case MLX5_IB_QPT_REG_UMR: rdma_restrack_no_track(&qp->ibqp.res); fallthrough; + case MLX5_IB_QPT_REG_UMR: default: if (params->udata) err = create_user_qp(dev, pd, qp, params); diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c index 4ac429e72004..affcf8fe943c 100644 --- a/drivers/infiniband/hw/mlx5/restrack.c +++ b/drivers/infiniband/hw/mlx5/restrack.c @@ -156,6 +156,34 @@ static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq) return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn); } +static int fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int ret; + + if (qp->type < IB_QPT_DRIVER) + return 0; + + switch (qp->type) { + case MLX5_IB_QPT_REG_UMR: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, + "REG_UMR"); 
+ break; + case MLX5_IB_QPT_DCT: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCT"); + break; + case MLX5_IB_QPT_DCI: + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUBTYPE, "DCI"); + break; + default: + return 0; + } + if (ret) + return ret; + + return nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, IB_QPT_DRIVER); +} + static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -168,6 +196,7 @@ static const struct ib_device_ops restrack_ops = { .fill_res_cq_entry_raw = fill_res_cq_entry_raw, .fill_res_mr_entry = fill_res_mr_entry, .fill_res_mr_entry_raw = fill_res_mr_entry_raw, + .fill_res_qp_entry = fill_res_qp_entry, .fill_res_qp_entry_raw = fill_res_qp_entry_raw, .fill_stat_mr_entry = fill_stat_mr_entry, }; -- cgit v1.2.3 From 82e966130ddd67539ab904f2038e7bf5d4a66247 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 1 May 2024 00:46:42 +0100 Subject: RDMA/mlx5: Remove NULL check before dev_{put, hold} Coccinelle reports a warning WARNING: NULL check before dev_{put, hold} functions is not needed The reason is the call netdev_{put, hold} of dev_{put,hold} will check NULL There is no need to check before using dev_{put, hold} Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/ZjGC4qXrOwZE0aHi@octinomon.home Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2b557e64290..2366c46eebc8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -264,8 +264,7 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, */ read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); ndev = ibdev->port[port_num - 1].roce.netdev; - if (ndev) - dev_hold(ndev); + dev_hold(ndev); read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); out: -- cgit v1.2.3 From 
e4e40a87024c502dcca279504a4550e617eea037 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 1 May 2024 00:47:33 +0100 Subject: RDMA/ipoib: Remove NULL check before dev_{put, hold} Coccinelle reports a warning WARNING: NULL check before dev_{put, hold} functions is not needed The reason is the call netdev_{put, hold} of dev_{put,hold} will check NULL There is no need to check before using dev_{put, hold} Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/ZjGDFatHRMI6Eg7M@octinomon.home Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6f2a688fccbf..4abec0124ea3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -329,8 +329,7 @@ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); - if (master) - dev_hold(master); + dev_hold(master); rcu_read_unlock(); if (master) -- cgit v1.2.3 From 48d80b484491f177c586874c480cf9ba3af82b4f Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Tue, 30 Apr 2024 23:47:45 +0100 Subject: RDMA/core: Remove NULL check before dev_{put, hold} Coccinelle reports a warning WARNING: NULL check before dev_{put, hold} functions is not needed The reason is the call netdev_{put, hold} of dev_{put,hold} will check NULL There is no need to check before using dev_{put, hold} Signed-off-by: Jules Irenge Link: https://lore.kernel.org/r/ZjF1Eedxwhn4JSkz@octinomon.home Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 10 +++------- drivers/infiniband/core/lag.c | 3 +-- drivers/infiniband/core/roce_gid_mgmt.c | 3 +-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 07cb6c5ffda0..55aa7aa32d4a 100644 --- 
a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2174,8 +2174,7 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); - if (old_ndev) - __dev_put(old_ndev); + __dev_put(old_ndev); return 0; } @@ -2235,8 +2234,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); - if (res) - dev_hold(res); + dev_hold(res); spin_unlock(&pdata->netdev_lock); } @@ -2311,9 +2309,7 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); - - if (idev) - dev_put(idev); + dev_put(idev); } } diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c index eca6e37c72ba..8fd80adfe833 100644 --- a/drivers/infiniband/core/lag.c +++ b/drivers/infiniband/core/lag.c @@ -93,8 +93,7 @@ static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, slave = netdev_get_xmit_slave(master, skb, !!(device->lag_flags & RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); - if (slave) - dev_hold(slave); + dev_hold(slave); rcu_read_unlock(); kfree_skb(skb); return slave; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index e958c43dd28f..d5131b3ba8ab 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -601,8 +601,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); - if (master_ndev) - dev_hold(master_ndev); + dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { -- cgit v1.2.3 From e73c882f0a0149d8cad79f87b28cbbc9b4ed9ebe Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 26 Apr 2024 06:12:36 -0700 Subject: RDMA/mana_ib: create EQs for RNIC CQs 
Create EQs within mana_ib device. Such EQs are required for creation of RNIC CQs. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1714137160-5222-2-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 34 ++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mana/mana_ib.h | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index f5401471bffe..546d059470e5 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -658,7 +658,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) { struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue_spec spec = {}; - int err; + int err, i; spec.type = GDMA_EQ; spec.monitor_avl_buf = false; @@ -672,12 +672,42 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev) if (err) return err; + mdev->eqs = kcalloc(mdev->ib_dev.num_comp_vectors, sizeof(struct gdma_queue *), + GFP_KERNEL); + if (!mdev->eqs) { + err = -ENOMEM; + goto destroy_fatal_eq; + } + + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) { + spec.eq.msix_index = (i + 1) % gc->num_msix_usable; + err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]); + if (err) + goto destroy_eqs; + } + return 0; + +destroy_eqs: + while (i-- > 0) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + kfree(mdev->eqs); +destroy_fatal_eq: + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + return err; } void mana_ib_destroy_eqs(struct mana_ib_dev *mdev) { - mana_gd_destroy_queue(mdev_to_gc(mdev), mdev->fatal_err_eq); + struct gdma_context *gc = mdev_to_gc(mdev); + int i; + + mana_gd_destroy_queue(gc, mdev->fatal_err_eq); + + for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) + mana_gd_destroy_queue(gc, mdev->eqs[i]); + + kfree(mdev->eqs); } int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev) diff --git a/drivers/infiniband/hw/mana/mana_ib.h 
b/drivers/infiniband/hw/mana/mana_ib.h index 4c1240da0c5f..bfcf6dfd221e 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -56,6 +56,7 @@ struct mana_ib_dev { struct gdma_dev *gdma_dev; mana_handle_t adapter_handle; struct gdma_queue *fatal_err_eq; + struct gdma_queue **eqs; struct mana_ib_adapter_caps adapter_caps; }; -- cgit v1.2.3 From 5843415916852983d3aaddc87b57630af9b0adad Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 26 Apr 2024 06:12:37 -0700 Subject: RDMA/mana_ib: create and destroy RNIC cqs Implement RNIC requests for creation and destruction of RNIC CQs. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1714137160-5222-3-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 54 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 32 +++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 546d059470e5..2a411357640e 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -834,3 +834,57 @@ int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 return 0; } + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_create_cq_resp resp = {}; + struct mana_rnic_create_cq_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.gdma_region = cq->queue.gdma_region; + req.eq_id = mdev->eqs[cq->comp_vector]->id; + req.doorbell_page = doorbell; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to create cq err %d", err); + return err; + } + 
+ cq->queue.id = resp.cq_id; + cq->cq_handle = resp.cq_handle; + /* The GDMA region is now owned by the CQ handle */ + cq->queue.gdma_region = GDMA_INVALID_DMA_REGION; + + return 0; +} + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + struct mana_rnic_destroy_cq_resp resp = {}; + struct mana_rnic_destroy_cq_req req = {}; + int err; + + if (cq->cq_handle == INVALID_MANA_HANDLE) + return 0; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp)); + req.hdr.dev_id = gc->mana_ib.dev_id; + req.adapter = mdev->adapter_handle; + req.cq_handle = cq->cq_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to destroy cq err %d", err); + return err; + } + + return 0; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index bfcf6dfd221e..9162f29da02d 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -92,6 +92,7 @@ struct mana_ib_cq { struct mana_ib_queue queue; int cqe; u32 comp_vector; + mana_handle_t cq_handle; }; struct mana_ib_qp { @@ -119,6 +120,8 @@ enum mana_ib_command_code { MANA_IB_DESTROY_ADAPTER = 0x30003, MANA_IB_CONFIG_IP_ADDR = 0x30004, MANA_IB_CONFIG_MAC_ADDR = 0x30005, + MANA_IB_CREATE_CQ = 0x30008, + MANA_IB_DESTROY_CQ = 0x30009, }; struct mana_ib_query_adapter_caps_req { @@ -202,6 +205,31 @@ struct mana_rnic_config_mac_addr_resp { struct gdma_resp_hdr hdr; }; /* HW Data */ +struct mana_rnic_create_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + u64 gdma_region; + u32 eq_id; + u32 doorbell_page; +}; /* HW Data */ + +struct mana_rnic_create_cq_resp { + struct gdma_resp_hdr hdr; + mana_handle_t cq_handle; + u32 cq_id; + u32 reserved; +}; /* HW Data */ + +struct mana_rnic_destroy_cq_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; + mana_handle_t cq_handle; +}; /* HW Data 
*/ + +struct mana_rnic_destroy_cq_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) { return mdev->gdma_dev->gdma_context; @@ -321,4 +349,8 @@ int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context); int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context); int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8 *mac); + +int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 doorbell); + +int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); #endif -- cgit v1.2.3 From 3e41105263d5d74840c0d117278894b428f02841 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 26 Apr 2024 06:12:38 -0700 Subject: RDMA/mana_ib: introduce a helper to remove cq callbacks Introduce the mana_ib_remove_cq_cb helper to remove cq callbacks. The helper removes code duplicates. Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1714137160-5222-4-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 19 ++++++++++++------- drivers/infiniband/hw/mana/mana_ib.h | 1 + drivers/infiniband/hw/mana/qp.c | 26 ++++---------------------- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index dc931b9c3491..298e8f15a659 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -48,16 +48,10 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); struct ib_device *ibdev = ibcq->device; struct mana_ib_dev *mdev; - struct gdma_context *gc; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - gc = mdev_to_gc(mdev); - - if (cq->queue.id != INVALID_QUEUE_ID) { - kfree(gc->cq_table[cq->queue.id]); - gc->cq_table[cq->queue.id] = NULL; - } + 
mana_ib_remove_cq_cb(mdev, cq); mana_ib_destroy_queue(mdev, &cq->queue); return 0; @@ -89,3 +83,14 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) gc->cq_table[cq->queue.id] = gdma_cq; return 0; } + +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) +{ + struct gdma_context *gc = mdev_to_gc(mdev); + + if (cq->queue.id >= gc->max_num_cqs || cq->queue.id == INVALID_QUEUE_ID) + return; + + kfree(gc->cq_table[cq->queue.id]); + gc->cq_table[cq->queue.id] = NULL; +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 9162f29da02d..68c3b4f0faa4 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -255,6 +255,7 @@ static inline void copy_in_reverse(u8 *dst, const u8 *src, u32 size) } int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); +void mana_ib_remove_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq); int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, mana_handle_t *gdma_region); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 280e85a83f7e..ba13c5abf8ef 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -95,11 +95,9 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); struct mana_ib_dev *mdev = container_of(pd->device, struct mana_ib_dev, ib_dev); - struct gdma_context *gc = mdev_to_gc(mdev); struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; struct mana_ib_create_qp_rss_resp resp = {}; struct mana_ib_create_qp_rss ucmd = {}; - struct gdma_queue **gdma_cq_allocated; mana_handle_t *mana_ind_table; struct mana_port_context *mpc; unsigned int ind_tbl_size; @@ -173,13 +171,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - gdma_cq_allocated = kcalloc(ind_tbl_size, 
sizeof(*gdma_cq_allocated), - GFP_KERNEL); - if (!gdma_cq_allocated) { - ret = -ENOMEM; - goto fail; - } - qp->port = port; for (i = 0; i < ind_tbl_size; i++) { @@ -229,8 +220,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ret = mana_ib_install_cq_cb(mdev, cq); if (ret) goto fail; - - gdma_cq_allocated[i] = gc->cq_table[cq->queue.id]; } resp.num_entries = i; @@ -250,7 +239,6 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, goto fail; } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return 0; @@ -262,13 +250,10 @@ fail: wq = container_of(ibwq, struct mana_ib_wq, ibwq); cq = container_of(ibcq, struct mana_ib_cq, ibcq); - gc->cq_table[cq->queue.id] = NULL; - kfree(gdma_cq_allocated[i]); - + mana_ib_remove_cq_cb(mdev, cq); mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); } - kfree(gdma_cq_allocated); kfree(mana_ind_table); return ret; @@ -287,10 +272,8 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, struct mana_ib_ucontext *mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, ibucontext); - struct gdma_context *gc = mdev_to_gc(mdev); struct mana_ib_create_qp_resp resp = {}; struct mana_ib_create_qp ucmd = {}; - struct gdma_queue *gdma_cq = NULL; struct mana_obj_spec wq_spec = {}; struct mana_obj_spec cq_spec = {}; struct mana_port_context *mpc; @@ -395,14 +378,13 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, ibdev_dbg(&mdev->ib_dev, "Failed copy udata for create qp-raw, %d\n", err); - goto err_release_gdma_cq; + goto err_remove_cq_cb; } return 0; -err_release_gdma_cq: - kfree(gdma_cq); - gc->cq_table[send_cq->queue.id] = NULL; +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, send_cq); err_destroy_wq_obj: mana_destroy_wq_obj(mpc, GDMA_SQ, qp->qp_handle); -- cgit v1.2.3 From f79edef79b6a2161f4124112f9b0c46891bb0b74 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 26 Apr 2024 06:12:39 -0700 Subject: RDMA/mana_ib: boundary 
check before installing cq callbacks Add a boundary check inside mana_ib_install_cq_cb to prevent index overflow. Fixes: 2a31c5a7e0d8 ("RDMA/mana_ib: Introduce mana_ib_install_cq_cb helper function") Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1714137160-5222-5-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 298e8f15a659..688ffe61f6b2 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -70,6 +70,8 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) struct gdma_context *gc = mdev_to_gc(mdev); struct gdma_queue *gdma_cq; + if (cq->queue.id >= gc->max_num_cqs) + return -EINVAL; /* Create CQ table entry */ WARN_ON(gc->cq_table[cq->queue.id]); gdma_cq = kzalloc(sizeof(*gdma_cq), GFP_KERNEL); -- cgit v1.2.3 From 44b607ad4cdf23ae8f796b95bd14709fa06f7728 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Fri, 26 Apr 2024 06:12:40 -0700 Subject: RDMA/mana_ib: implement uapi for creation of rnic cq Enable users to create RNIC CQs using a corresponding flag. With the previous request size, an ethernet CQ is created. As a response, return ID of the created CQ. 
Signed-off-by: Konstantin Taranov Link: https://lore.kernel.org/r/1714137160-5222-6-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/cq.c | 55 ++++++++++++++++++++++++++++++++++++++--- include/uapi/rdma/mana-abi.h | 12 +++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 688ffe61f6b2..c6a3fd57a196 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -9,17 +9,22 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) { struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct mana_ib_create_cq_resp resp = {}; + struct mana_ib_ucontext *mana_ucontext; struct ib_device *ibdev = ibcq->device; struct mana_ib_create_cq ucmd = {}; struct mana_ib_dev *mdev; + bool is_rnic_cq; + u32 doorbell; int err; mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); - if (udata->inlen < sizeof(ucmd)) - return -EINVAL; - cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; + cq->cq_handle = INVALID_MANA_HANDLE; + + if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) + return -EINVAL; err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); if (err) { @@ -28,7 +33,9 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } - if (attr->cqe > mdev->adapter_caps.max_qp_wr) { + is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); + + if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) { ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); return -EINVAL; } @@ -40,7 +47,41 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return err; } + mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + doorbell = mana_ucontext->doorbell; + + if (is_rnic_cq) { + err = 
mana_ib_gd_create_cq(mdev, cq, doorbell); + if (err) { + ibdev_dbg(ibdev, "Failed to create RNIC cq, %d\n", err); + goto err_destroy_queue; + } + + err = mana_ib_install_cq_cb(mdev, cq); + if (err) { + ibdev_dbg(ibdev, "Failed to install cq callback, %d\n", err); + goto err_destroy_rnic_cq; + } + } + + resp.cqid = cq->queue.id; + err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err); + goto err_remove_cq_cb; + } + return 0; + +err_remove_cq_cb: + mana_ib_remove_cq_cb(mdev, cq); +err_destroy_rnic_cq: + mana_ib_gd_destroy_cq(mdev, cq); +err_destroy_queue: + mana_ib_destroy_queue(mdev, &cq->queue); + + return err; } int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) @@ -52,6 +93,12 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); mana_ib_remove_cq_cb(mdev, cq); + + /* Ignore return code as there is not much we can do about it. + * The error message is printed inside. 
+ */ + mana_ib_gd_destroy_cq(mdev, cq); + mana_ib_destroy_queue(mdev, &cq->queue); return 0; diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h index 5fcb31b37fb9..2c41cc315218 100644 --- a/include/uapi/rdma/mana-abi.h +++ b/include/uapi/rdma/mana-abi.h @@ -16,8 +16,20 @@ #define MANA_IB_UVERBS_ABI_VERSION 1 +enum mana_ib_create_cq_flags { + MANA_IB_CREATE_RNIC_CQ = 1 << 0, +}; + struct mana_ib_create_cq { __aligned_u64 buf_addr; + __u16 flags; + __u16 reserved0; + __u32 reserved1; +}; + +struct mana_ib_create_cq_resp { + __u32 cqid; + __u32 reserved; }; struct mana_ib_create_qp { -- cgit v1.2.3 From 8f3b7103b41314d26e2653e9ccca29480123a204 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 3 May 2024 16:36:40 +0300 Subject: RDMA/hfi1: Use RMW accessors for changing LNKCTL2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert open coded RMW accesses for LNKCTL2 to use pcie_capability_clear_and_set_word() which makes it easier to understand what the code tries to do. In addition, this futureproofs the code. LNKCTL2 is not really owned by any driver because it is a collection of control bits that PCI core might need to touch. RMW accessors already have support for proper locking for a selected set of registers to avoid losing concurrent updates (LNKCTL2 is not yet among the registers that need protection but likely will be in the future).
Suggested-by: Lukas Wunner Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240503133640.15899-1-ilpo.jarvinen@linux.intel.com Reviewed-by: Dean Luick Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/pcie.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 119ec2f1382b..7133964749f8 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1207,14 +1207,11 @@ retry: (u32)lnkctl2); /* only write to parent if target is not as high as ours */ if ((lnkctl2 & PCI_EXP_LNKCTL2_TLS) < target_vector) { - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(parent, - PCI_EXP_LNKCTL2, lnkctl2); + ret = pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change parent PCI target speed\n"); return_error = 1; goto done; } @@ -1223,22 +1220,11 @@ retry: } dd_dev_info(dd, "%s: setting target link speed\n", __func__); - ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + ret = pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL2, + PCI_EXP_LNKCTL2_TLS, + target_vector); if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return_error = 1; - goto done; - } - - dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, - (u32)lnkctl2); - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; - lnkctl2 |= target_vector; - dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); - ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); + dd_dev_err(dd, "Unable to change device PCI target speed\n"); return_error = 1; 
goto done; } -- cgit v1.2.3 From 5194947e6a3966d50095c14c69edbec90ad191f9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 May 2024 04:13:31 -0700 Subject: IB/hfi1: Do not use custom stat allocator With commit 34d21de99cea9 ("net: Move {l,t,d}stats allocation to core and convert veth & vrf"), stats allocation could be done on net core instead of in this driver. With this new approach, the driver doesn't have to bother with error handling (allocation failure checking, making sure free happens in the right spot, etc). This is core responsibility now. Remove the allocation in the hfi1 driver and leverage the network core allocation instead. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240503111333.552360-1-leitao@debian.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/ipoib_main.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 5d814afdf7f3..59c6e55f4119 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -21,36 +21,25 @@ static int hfi1_ipoib_dev_init(struct net_device *dev) struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); int ret; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = priv->netdev_ops->ndo_init(dev); if (ret) - goto out_ret; + return ret; ret = hfi1_netdev_add_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr), dev); if (ret < 0) { priv->netdev_ops->ndo_uninit(dev); - goto out_ret; + return ret; } return 0; -out_ret: - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); - free_percpu(dev->tstats); - dev->tstats = NULL; - hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); 
priv->netdev_ops->ndo_uninit(dev); @@ -173,9 +162,6 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) hfi1_ipoib_txreq_deinit(priv); hfi1_ipoib_rxq_deinit(priv->netdev); - - free_percpu(dev->tstats); - dev->tstats = NULL; } static void hfi1_ipoib_set_id(struct net_device *dev, int id) @@ -234,6 +220,7 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; + netdev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; return 0; } -- cgit v1.2.3 From f483f6a29d4d701f1641898463e93d081bb03b52 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 May 2024 04:13:32 -0700 Subject: IB/hfi1: Remove generic .ndo_get_stats64 Commit 3e2f544dd8a33 ("net: get stats64 if device if driver is configured") moved the callback to dev_get_tstats64() to net core, so, unless the driver is doing some custom stats collection, it does not need to set .ndo_get_stats64. Since this driver now relies on NETDEV_PCPU_STAT_TSTATS, it doesn't need to set the dev_get_tstats64() generic .ndo_get_stats64 function pointer.
Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240503111333.552360-2-leitao@debian.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/ipoib_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 59c6e55f4119..7c9d5203002b 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -96,7 +96,6 @@ static const struct net_device_ops hfi1_ipoib_netdev_ops = { .ndo_uninit = hfi1_ipoib_dev_uninit, .ndo_open = hfi1_ipoib_dev_open, .ndo_stop = hfi1_ipoib_dev_stop, - .ndo_get_stats64 = dev_get_tstats64, }; static int hfi1_ipoib_mcast_attach(struct net_device *dev, -- cgit v1.2.3 From 2b8af5001abdf583da3a63201cc6137553019515 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 6 May 2024 15:18:29 +0000 Subject: RDMA/efa: Support QP with unsolicited write w/ imm. receive Add a new EFA flags attribute for QP creation, and support unsolicited write with immediate flag. QPs created with this flag set will not consume receive work requests for incoming RDMA write with immediate. Expose device capability bit for this feature support. 
Reviewed-by: Daniel Kranzdorf Reviewed-by: Firas Jahjah Signed-off-by: Michael Margolin Link: https://lore.kernel.org/r/20240506151829.6475-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 11 +++++++++-- drivers/infiniband/hw/efa/efa_com_cmd.c | 3 +++ drivers/infiniband/hw/efa/efa_com_cmd.h | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 19 ++++++++++++++++++- include/uapi/rdma/efa-abi.h | 7 +++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 7377c8a9f4d5..4296662e59c3 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -110,7 +110,10 @@ struct efa_admin_create_qp_cmd { * virtual (IOVA returned by MR registration) * 1 : rq_virt - If set, RQ ring base address is * virtual (IOVA returned by MR registration) - * 7:2 : reserved - MBZ + * 2 : unsolicited_write_recv - If set, work requests + * will not be consumed for incoming RDMA write with + * immediate + * 7:3 : reserved - MBZ */ u8 flags; @@ -663,7 +666,9 @@ struct efa_admin_feature_device_attr_desc { * polling is supported * 3 : rdma_write - If set, RDMA Write is supported * on TX queues - * 31:4 : reserved - MBZ + * 4 : unsolicited_write_recv - If set, unsolicited + * write with imm. 
receive is supported + * 31:5 : reserved - MBZ */ u32 device_caps; @@ -1009,6 +1014,7 @@ struct efa_admin_host_info { /* create_qp_cmd */ #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) +#define EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV_MASK BIT(2) /* modify_qp_cmd */ #define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK BIT(0) @@ -1044,6 +1050,7 @@ struct efa_admin_host_info { #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK BIT(1) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2) #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK BIT(3) +#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_UNSOLICITED_WRITE_RECV_MASK BIT(4) /* create_eq_cmd */ #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK GENMASK(4, 0) diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index d3398c7b0bd0..5b9c2b16df0e 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -32,6 +32,9 @@ int efa_com_create_qp(struct efa_com_dev *edev, params->rq_depth; create_qp_cmd.uar = params->uarn; + if (params->unsolicited_write_recv) + EFA_SET(&create_qp_cmd.flags, EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV, 1); + err = efa_com_cmd_exec(aq, (struct efa_admin_aq_entry *)&create_qp_cmd, sizeof(create_qp_cmd), diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 720a99ba0f7d..9714105fcf7e 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -27,6 +27,7 @@ struct efa_com_create_qp_params { u16 pd; u16 uarn; u8 qp_type; + u8 unsolicited_write_recv : 1; }; struct efa_com_create_qp_result { diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 2f412db2edcd..8f7a13b79cdc 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -263,6 +263,9 @@ int 
efa_query_device(struct ib_device *ibdev, if (EFA_DEV_CAP(dev, RDMA_WRITE)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE; + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV; + if (dev->neqs) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS; @@ -639,6 +642,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, struct efa_ibv_create_qp cmd = {}; struct efa_qp *qp = to_eqp(ibqp); struct efa_ucontext *ucontext; + u16 supported_efa_flags = 0; int err; ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext, @@ -676,13 +680,23 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, goto err_out; } - if (cmd.comp_mask) { + if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_90)) { ibdev_dbg(&dev->ibdev, "Incompatible ABI params, unknown fields in udata\n"); err = -EINVAL; goto err_out; } + if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV)) + supported_efa_flags |= EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV; + + if (cmd.flags & ~supported_efa_flags) { + ibdev_dbg(&dev->ibdev, "Unsupported EFA QP create flags[%#x], supported[%#x]\n", + cmd.flags, supported_efa_flags); + err = -EOPNOTSUPP; + goto err_out; + } + create_qp_params.uarn = ucontext->uarn; create_qp_params.pd = to_epd(ibqp->pd)->pdn; @@ -722,6 +736,9 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, create_qp_params.rq_base_addr = qp->rq_dma_addr; } + if (cmd.flags & EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV) + create_qp_params.unsolicited_write_recv = true; + err = efa_com_create_qp(&dev->edev, &create_qp_params, &create_qp_resp); if (err) diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 701e2d567e41..d689b8b34189 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -85,11 +85,17 @@ enum { EFA_QP_DRIVER_TYPE_SRD = 0, }; +enum { + EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV = 1 << 0, +}; + struct 
efa_ibv_create_qp { __u32 comp_mask; __u32 rq_ring_size; /* bytes */ __u32 sq_ring_size; /* bytes */ __u32 driver_qp_type; + __u16 flags; + __u8 reserved_90[6]; }; struct efa_ibv_create_qp_resp { @@ -123,6 +129,7 @@ enum { EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID = 1 << 3, EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4, EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5, + EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV = 1 << 6, }; struct efa_ibv_ex_query_device_resp { -- cgit v1.2.3 From 78cfd17142ef70599d6409cbd709d94b3da58659 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Tue, 7 May 2024 12:39:28 +0200 Subject: bnxt_re: avoid shift undefined behavior in bnxt_qplib_alloc_init_hwq Undefined behavior is triggered when bnxt_qplib_alloc_init_hwq is called with hwq_attr->aux_depth != 0 and hwq_attr->aux_stride == 0. In that case, "roundup_pow_of_two(hwq_attr->aux_stride)" gets called. roundup_pow_of_two is documented as undefined for 0. Fix it in the one caller that had this combination. The undefined behavior was detected by UBSAN: UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'long unsigned int' CPU: 24 PID: 1075 Comm: (udev-worker) Not tainted 6.9.0-rc6+ #4 Hardware name: Abacus electric, s.r.o. - servis@abacus.cz Super Server/H12SSW-iN, BIOS 2.7 10/25/2023 Call Trace: dump_stack_lvl+0x5d/0x80 ubsan_epilogue+0x5/0x30 __ubsan_handle_shift_out_of_bounds.cold+0x61/0xec __roundup_pow_of_two+0x25/0x35 [bnxt_re] bnxt_qplib_alloc_init_hwq+0xa1/0x470 [bnxt_re] bnxt_qplib_create_qp+0x19e/0x840 [bnxt_re] bnxt_re_create_qp+0x9b1/0xcd0 [bnxt_re] ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? __kmalloc+0x1b6/0x4f0 ? create_qp.part.0+0x128/0x1c0 [ib_core] ? __pfx_bnxt_re_create_qp+0x10/0x10 [bnxt_re] create_qp.part.0+0x128/0x1c0 [ib_core] ib_create_qp_kernel+0x50/0xd0 [ib_core] create_mad_qp+0x8e/0xe0 [ib_core] ? 
__pfx_qp_event_handler+0x10/0x10 [ib_core] ib_mad_init_device+0x2be/0x680 [ib_core] add_client_context+0x10d/0x1a0 [ib_core] enable_device_and_get+0xe0/0x1d0 [ib_core] ib_register_device+0x53c/0x630 [ib_core] ? srso_alias_return_thunk+0x5/0xfbef5 bnxt_re_probe+0xbd8/0xe50 [bnxt_re] ? __pfx_bnxt_re_probe+0x10/0x10 [bnxt_re] auxiliary_bus_probe+0x49/0x80 ? driver_sysfs_add+0x57/0xc0 really_probe+0xde/0x340 ? pm_runtime_barrier+0x54/0x90 ? __pfx___driver_attach+0x10/0x10 __driver_probe_device+0x78/0x110 driver_probe_device+0x1f/0xa0 __driver_attach+0xba/0x1c0 bus_for_each_dev+0x8f/0xe0 bus_add_driver+0x146/0x220 driver_register+0x72/0xd0 __auxiliary_driver_register+0x6e/0xd0 ? __pfx_bnxt_re_mod_init+0x10/0x10 [bnxt_re] bnxt_re_mod_init+0x3e/0xff0 [bnxt_re] ? __pfx_bnxt_re_mod_init+0x10/0x10 [bnxt_re] do_one_initcall+0x5b/0x310 do_init_module+0x90/0x250 init_module_from_file+0x86/0xc0 idempotent_init_module+0x121/0x2b0 __x64_sys_finit_module+0x5e/0xb0 do_syscall_64+0x82/0x160 ? srso_alias_return_thunk+0x5/0xfbef5 ? syscall_exit_to_user_mode_prepare+0x149/0x170 ? srso_alias_return_thunk+0x5/0xfbef5 ? syscall_exit_to_user_mode+0x75/0x230 ? srso_alias_return_thunk+0x5/0xfbef5 ? do_syscall_64+0x8e/0x160 ? srso_alias_return_thunk+0x5/0xfbef5 ? __count_memcg_events+0x69/0x100 ? srso_alias_return_thunk+0x5/0xfbef5 ? count_memcg_events.constprop.0+0x1a/0x30 ? srso_alias_return_thunk+0x5/0xfbef5 ? handle_mm_fault+0x1f0/0x300 ? srso_alias_return_thunk+0x5/0xfbef5 ? do_user_addr_fault+0x34e/0x640 ? srso_alias_return_thunk+0x5/0xfbef5 ? 
srso_alias_return_thunk+0x5/0xfbef5 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f4e5132821d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e3 db 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffca9c906a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 0000563ec8a8f130 RCX: 00007f4e5132821d RDX: 0000000000000000 RSI: 00007f4e518fa07d RDI: 000000000000003b RBP: 00007ffca9c90760 R08: 00007f4e513f6b20 R09: 00007ffca9c906f0 R10: 0000563ec8a8faa0 R11: 0000000000000246 R12: 00007f4e518fa07d R13: 0000000000020000 R14: 0000563ec8409e90 R15: 0000563ec8a8fa60 ---[ end trace ]--- Fixes: 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory allocation") Signed-off-by: Michal Schmidt Link: https://lore.kernel.org/r/20240507103929.30003-1-mschmidt@redhat.com Acked-by: Selvin Xavier Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 439d0c7c5d0c..04258676d072 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1013,7 +1013,8 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.stride = sizeof(struct sq_sge); hwq_attr.depth = bnxt_qplib_get_depth(sq); hwq_attr.aux_stride = psn_sz; - hwq_attr.aux_depth = bnxt_qplib_set_sq_size(sq, qp->wqe_mode); + hwq_attr.aux_depth = psn_sz ? 
bnxt_qplib_set_sq_size(sq, qp->wqe_mode) + : 0; /* Update msn tbl size */ if (BNXT_RE_HW_RETX(qp->dev_cap_flags) && psn_sz) { hwq_attr.aux_depth = roundup_pow_of_two(bnxt_qplib_set_sq_size(sq, qp->wqe_mode)); -- cgit v1.2.3 From 49ca2b2ef3d003402584c68ae7b3055ba72e750a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 9 May 2024 10:39:33 +0300 Subject: RDMA/IPoIB: Fix format truncation compilation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Truncate the parent device name to at most 10 characters so that the IPoIB VLAN interface name (parent name, '.', four hex digits of the P_Key and the terminating NUL) always fits in the 16-byte intf_name buffer (IFNAMSIZ). [leonro@5b4e8fba4ddd kernel]$ make -s -j 20 allmodconfig [leonro@5b4e8fba4ddd kernel]$ make -s -j 20 W=1 drivers/infiniband/ulp/ipoib/ drivers/infiniband/ulp/ipoib/ipoib_vlan.c: In function ‘ipoib_vlan_add’: drivers/infiniband/ulp/ipoib/ipoib_vlan.c:187:52: error: ‘%04x’ directive output may be truncated writing 4 bytes into a region of size between 0 and 15 [-Werror=format-truncation=] 187 | snprintf(intf_name, sizeof(intf_name), "%s.%04x", | ^~~~ drivers/infiniband/ulp/ipoib/ipoib_vlan.c:187:48: note: directive argument in the range [0, 65535] 187 | snprintf(intf_name, sizeof(intf_name), "%s.%04x", | ^~~~~~~~~ drivers/infiniband/ulp/ipoib/ipoib_vlan.c:187:9: note: ‘snprintf’ output between 6 and 21 bytes into a destination of size 16 187 | snprintf(intf_name, sizeof(intf_name), "%s.%04x", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 188 | ppriv->dev->name, pkey); | ~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make[6]: *** [scripts/Makefile.build:244: drivers/infiniband/ulp/ipoib/ipoib_vlan.o] Error 1 make[6]: *** Waiting for unfinished jobs.... 
Fixes: 9baa0b036410 ("IB/ipoib: Add rtnl_link_ops support") Link: https://lore.kernel.org/r/e9d3e1fef69df4c9beaf402cc3ac342bad680791.1715240029.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 4bd161e86f8d..562df2b3ef18 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -184,8 +184,12 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ppriv = ipoib_priv(pdev); - snprintf(intf_name, sizeof(intf_name), "%s.%04x", - ppriv->dev->name, pkey); + /* If you increase IFNAMSIZ, update snprintf below + * to allow longer names. + */ + BUILD_BUG_ON(IFNAMSIZ != 16); + snprintf(intf_name, sizeof(intf_name), "%.10s.%04x", ppriv->dev->name, + pkey); ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); if (IS_ERR(ndev)) { -- cgit v1.2.3 From 9c0731832d3b7420cbadba6a7f334363bc8dfb15 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 10 May 2024 23:12:47 +0200 Subject: RDMA/cma: Fix kmemleak in rdma_core observed during blktests nvme/rdma use siw When running blktests nvme/rdma, the following kmemleak issue will appear. kmemleak: Kernel memory leak detector initialized (mempool available:36041) kmemleak: Automatic memory scanning thread started kmemleak: 2 new suspected memory leaks (see /sys/kernel/debug/kmemleak) kmemleak: 8 new suspected memory leaks (see /sys/kernel/debug/kmemleak) kmemleak: 17 new suspected memory leaks (see /sys/kernel/debug/kmemleak) kmemleak: 4 new suspected memory leaks (see /sys/kernel/debug/kmemleak) unreferenced object 0xffff88855da53400 (size 192): comm "rdma", pid 10630, jiffies 4296575922 hex dump (first 32 bytes): 37 00 00 00 00 00 00 00 c0 ff ff ff 1f 00 00 00 7............... 10 34 a5 5d 85 88 ff ff 10 34 a5 5d 85 88 ff ff .4.].....4.].... 
backtrace (crc 47f66721): [] kmalloc_trace+0x30d/0x3b0 [] alloc_gid_entry+0x47/0x380 [ib_core] [] add_modify_gid+0x166/0x930 [ib_core] [] ib_cache_update.part.0+0x6d8/0x910 [ib_core] [] ib_cache_setup_one+0x24a/0x350 [ib_core] [] ib_register_device+0x9e/0x3a0 [ib_core] [] 0xffffffffc2a3d389 [] nldev_newlink+0x2b8/0x520 [ib_core] [] rdma_nl_rcv_msg+0x2c3/0x520 [ib_core] [] rdma_nl_rcv_skb.constprop.0.isra.0+0x23c/0x3a0 [ib_core] [] netlink_unicast+0x445/0x710 [] netlink_sendmsg+0x761/0xc40 [] __sys_sendto+0x3a9/0x420 [] __x64_sys_sendto+0xdc/0x1b0 [] do_syscall_64+0x93/0x180 [] entry_SYSCALL_64_after_hwframe+0x71/0x79 The root cause: in cma_validate_port(), rdma_put_gid_attr() is not called when sgid_attr is set to ERR_PTR(-ENODEV), so the reference held on the GID attribute is never dropped and the GID entry leaks. Reported-and-tested-by: Yi Zhang Closes: https://lore.kernel.org/all/19bf5745-1b3b-4b8a-81c2-20d945943aaf@linux.dev/T/ Fixes: f8ef1be816bf ("RDMA/cma: Avoid GID lookups on iWARP devices") Reviewed-by: Chuck Lever Signed-off-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240510211247.31345-1-yanjun.zhu@linux.dev Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1e2cd7c8716e..64ace0b968f0 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -715,8 +715,10 @@ cma_validate_port(struct ib_device *device, u32 port, rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || - ndev->ifindex != bound_if_index) + ndev->ifindex != bound_if_index) { + rdma_put_gid_attr(sgid_attr); sgid_attr = ERR_PTR(-ENODEV); + } rcu_read_unlock(); goto out; } -- cgit v1.2.3