From ea7a5c706fa49273cf6d1d9def053ecb50db2076 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Wed, 3 Apr 2019 16:52:54 +0300 Subject: RDMA/vmw_pvrdma: Fix memory leak on pvrdma_pci_remove Make sure to free the DSR on pvrdma_pci_remove() to avoid the memory leak. Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Signed-off-by: Kamal Heib Acked-by: Adit Ranadive Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 6d8b3e0de57a..ec41400fec0c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -1131,6 +1131,8 @@ static void pvrdma_pci_remove(struct pci_dev *pdev) pvrdma_page_dir_cleanup(dev, &dev->cq_pdir); pvrdma_page_dir_cleanup(dev, &dev->async_pdir); pvrdma_free_slots(dev); + dma_free_coherent(&pdev->dev, sizeof(*dev->dsr), dev->dsr, + dev->dsrbase); iounmap(dev->regs); kfree(dev->sgid_tbl); -- cgit v1.2.3 From 4772e03d239484f3461e33c79d721c8ea03f7416 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 7 Apr 2019 13:23:38 +0800 Subject: RDMA/hns: Fix bug that caused srq creation to fail Due to the incorrect use of the seg and obj information, the position of the mtt is calculated incorrectly, and the free space of the page is not enough to store the entire mtt, resulting in access to the next page. This patch fixes this problem. Unable to handle kernel paging request at virtual address ffff00006e3cd000 ... Call trace: hns_roce_write_mtt+0x154/0x2f0 [hns_roce] hns_roce_buf_write_mtt+0xa8/0xd8 [hns_roce] hns_roce_create_srq+0x74c/0x808 [hns_roce] ib_create_srq+0x28/0xc8 Fixes: 0203b14c4f32 ("RDMA/hns: Unify the calculation for hem index in hip08") Signed-off-by: chenglang Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hem.c | 6 ++++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index f1fec56f3ff4..8e29dbb5b5fb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -792,6 +792,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, idx_offset = (obj & (table->num_obj - 1)) % obj_per_chunk; dma_offset = offset = idx_offset * table->obj_size; } else { + u32 seg_size = 64; /* 8 bytes per BA and 8 BA per segment */ + hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop); /* mtt mhop */ i = mhop.l0_idx; @@ -803,8 +805,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev, hem_idx = i; hem = table->hem[hem_idx]; - dma_offset = offset = (obj & (table->num_obj - 1)) * - table->obj_size % mhop.bt_chunk_size; + dma_offset = offset = (obj & (table->num_obj - 1)) * seg_size % + mhop.bt_chunk_size; if (mhop.hop_num == 2) dma_offset = offset = 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index b09f1cde2ff5..08be0e4eabcd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -746,7 +746,6 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table; dma_addr_t dma_handle; __le64 *mtts; - u32 s = start_index * sizeof(u64); u32 bt_page_size; u32 i; @@ -780,7 +779,8 @@ static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev, return -EINVAL; mtts = hns_roce_table_find(hr_dev, table, - mtt->first_seg + s / hr_dev->caps.mtt_entry_sz, + mtt->first_seg + + start_index / HNS_ROCE_MTT_ENTRY_PER_SEG, &dma_handle); if (!mtts) return -ENOMEM; -- cgit v1.2.3 From 00fb67ec6b98114a887d9ef26fc7c3e566e7f665 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Sun, 7 Apr 2019 13:23:39 +0800 Subject: RDMA/hns: Bugfix for SCC hem free The method of hem free for SCC context is different from qp context. In the current version, if free SCC hem during the execution of qp free, there may be smmu error as below: arm-smmu-v3 arm-smmu-v3.1.auto: event 0x10 received: arm-smmu-v3 arm-smmu-v3.1.auto: 0x00007d0000000010 arm-smmu-v3 arm-smmu-v3.1.auto: 0x000012000000017c arm-smmu-v3 arm-smmu-v3.1.auto: 0x00000000000009e0 arm-smmu-v3 arm-smmu-v3.1.auto: 0x0000000000000000 As SCC context is still used by hardware after qp free, we can solve this problem by removing SCC hem free from hns_roce_qp_free. Fixes: 6a157f7d1b14 ("RDMA/hns: Add SCC context allocation support for hip08") Signed-off-by: Yangyang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 57c76eafef2f..66cdf625534f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -274,9 +274,6 @@ void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) wait_for_completion(&hr_qp->free); if ((hr_qp->ibqp.qp_type) != IB_QPT_GSI) { - if (hr_dev->caps.sccc_entry_sz) - hns_roce_table_put(hr_dev, &qp_table->sccc_table, - hr_qp->qpn); if (hr_dev->caps.trrl_entry_sz) hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn); -- cgit v1.2.3 From d737b25b1ae0540ba13cbd45ebb9b58a1d6d7f0d Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 10 Apr 2019 06:27:05 -0700 Subject: IB/hfi1: Do not flush send queue in the TID RDMA second leg When a QP is put into error state, the send queue will be flushed. This mechanism is implemented in both the first and the second leg of the send engine. Since the second leg is only responsible for data transactions in the KDETH space for the TID RDMA WRITE request, it should not perform the flushing of the send queue. This patch removes the flushing function of the second leg, but still keeps the bailing out of the QP if it is put into error state. Fixes: 70dcb2e3dc6a ("IB/hfi1: Add the TID second leg send packet builder") Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/tid_rdma.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index fdda33aca77f..43cbce7a19ea 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5017,24 +5017,14 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) make_tid_rdma_ack(qp, ohdr, ps)) return 1; - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { - if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) - goto bail; - /* We are in the error state, flush the work request. */ - if (qp->s_last == READ_ONCE(qp->s_head)) - goto bail; - /* If DMAs are in progress, we can't flush immediately. */ - if (iowait_sdma_pending(&priv->s_iowait)) { - qp->s_flags |= RVT_S_WAIT_DMA; - goto bail; - } - clear_ahg(qp); - wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? - IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); - /* will get called again */ - goto done_free_tx; - } + /* + * Bail out if we can't send data. + * Be reminded that this check must been done after the call to + * make_tid_rdma_ack() because the responding QP could be in + * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA. + */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) + goto bail; if (priv->s_flags & RVT_S_WAIT_ACK) goto bail; @@ -5144,11 +5134,6 @@ int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps) hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2, middle, ps); return 1; -done_free_tx: - hfi1_put_txreq(ps->s_txreq); - ps->s_txreq = NULL; - return 1; - bail: hfi1_put_txreq(ps->s_txreq); bail_no_tx: -- cgit v1.2.3 From 7c39f7f671d2acc0a1f39ebbbee4303ad499bbfa Mon Sep 17 00:00:00 2001 From: Josh Collier Date: Mon, 15 Apr 2019 11:34:22 -0700 Subject: IB/rdmavt: Fix frwr memory registration Current implementation was not properly handling frwr memory registrations. This was uncovered by commit 27f26cec761das ("xprtrdma: Plant XID in on-the-wire RDMA offset (FRWR)") in which xprtrdma, which is used for NFS over RDMA, started failing as it was the first ULP to modify the ib_mr iova resulting in the NFS server getting REMOTE ACCESS ERROR when attempting to perform RDMA Writes to the client. The fix is to properly capture the true iova, offset, and length in the call to ib_map_mr_sg, and then update the iova when processing the IB_WR_REG_MEM on the send queue. Fixes: a41081aa5936 ("IB/rdmavt: Add support for ib_map_mr_sg") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Josh Collier Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 728795043496..0bb6e39dd03a 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -608,11 +608,6 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) if (unlikely(mapped_segs == mr->mr.max_segs)) return -ENOMEM; - if (mr->mr.length == 0) { - mr->mr.user_base = addr; - mr->mr.iova = addr; - } - m = mapped_segs / RVT_SEGSZ; n = mapped_segs % RVT_SEGSZ; mr->mr.map[m]->segs[n].vaddr = (void *)addr; @@ -630,17 +625,24 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) * @sg_nents: number of entries in sg * @sg_offset: offset in bytes into sg * + * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages. + * * Return: number of sg elements mapped to the memory region */ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { struct rvt_mr *mr = to_imr(ibmr); + int ret; mr->mr.length = 0; mr->mr.page_shift = PAGE_SHIFT; - return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, - rvt_set_page); + ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page); + mr->mr.user_base = ibmr->iova; + mr->mr.iova = ibmr->iova; + mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr; + mr->mr.length = (size_t)ibmr->length; + return ret; } /** @@ -671,6 +673,7 @@ int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, ibmr->rkey = key; mr->mr.lkey = key; mr->mr.access_flags = access; + mr->mr.iova = ibmr->iova; atomic_set(&mr->mr.lkey_invalid, 0); return 0; -- cgit v1.2.3 From 7249c8ea227a582c14f63e9e8853eb7369122f10 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Wed, 10 Apr 2019 10:59:45 +0300 Subject: IB/mlx5: Fix scatter to CQE in DCT QP creation When scatter to CQE is enabled on a DCT QP it corrupts the mailbox command since it tried to treat it as as QP create mailbox command instead of a DCT create command. The corrupted mailbox command causes userspace to malfunction as the device doesn't create the QP as expected. A new mlx5 capability is exposed to user-space which ensures that it will not enable the feature on DCT without this fix in the kernel. Fixes: 5d6ff1babe78 ("IB/mlx5: Support scatter to CQE for DC transport type") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 ++ drivers/infiniband/hw/mlx5/qp.c | 11 +++++++---- include/uapi/rdma/mlx5-abi.h | 1 + 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 531ff20b32ad..241d9ead79eb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1119,6 +1119,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, qp_packet_based)) resp.flags |= MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; + + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; } if (field_avail(typeof(resp), sw_parsing_caps, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7cd006da1dae..8870c350fda0 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1818,13 +1818,16 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); - if (rcqe_sz == 128) { - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + if (init_attr->qp_type == MLX5_IB_QPT_DCT) { + if (rcqe_sz == 128) + MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + return; } - if (init_attr->qp_type != MLX5_IB_QPT_DCT) - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); + MLX5_SET(qpc, qpc, cs_res, + rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : + MLX5_RES_SCAT_DATA32_CQE); } static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 87b3198f4b5d..f4d4010b7e3e 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -238,6 +238,7 @@ enum mlx5_ib_query_dev_resp_flags { MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP = 1 << 0, MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2, + MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT = 1 << 3, }; enum mlx5_ib_tunnel_offloads { -- cgit v1.2.3 From c660133c339f9ab684fdf568c0d51b9ae5e86002 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:25 +0300 Subject: RDMA/mlx5: Do not allow the user to write to the clock page The intent of this VMA was to be read-only from user space, but the VM_MAYWRITE masking was missed, so mprotect could make it writable. Cc: stable@vger.kernel.org Fixes: 5c99eaecb1fc ("IB/mlx5: Mmap the HCA's clock info to user-space") Signed-off-by: Jason Gunthorpe Reviewed-by: Haggai Eran Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 241d9ead79eb..12a75dfc992b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2068,6 +2068,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & VM_WRITE) return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; if (!dev->mdev->clock_info_page) return -EOPNOTSUPP; @@ -2233,6 +2234,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) -- cgit v1.2.3 From d5e560d3f72382ac4e3bfe4e0f0420e6a220b039 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:26 +0300 Subject: RDMA/mlx5: Use rdma_user_map_io for mapping BAR pages Since mlx5 supports device disassociate it must use this API for all BAR page mmaps, otherwise the pages can remain mapped after the device is unplugged causing a system crash. Cc: stable@vger.kernel.org Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory") Signed-off-by: Jason Gunthorpe Reviewed-by: Haggai Eran Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 12a75dfc992b..d3dd290ae1b1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2240,14 +2240,12 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (PAGE_SIZE > 4096) return -EOPNOTSUPP; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); pfn = (dev->mdev->iseg_base + offsetof(struct mlx5_init_seg, internal_timer_h)) >> PAGE_SHIFT; - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - break; + return rdma_user_mmap_io(&context->ibucontext, vma, pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); -- cgit v1.2.3 From 67f269b37f9b4d52c5e7f97acea26c0852e9b8a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:28 +0300 Subject: RDMA/ucontext: Fix regression with disassociate When this code was consolidated the intention was that the VMA would become backed by anonymous zero pages after the zap_vma_pte - however this very subtly relied on setting the vm_ops = NULL and clearing the VM_SHARED bits to transform the VMA into an anonymous VMA. Since the vm_ops was removed this broke. Now userspace gets a SIGBUS if it touches the vma after disassociation. Instead of converting the VMA to anonymous provide a fault handler that puts a zero'd page into the VMA when user-space touches it after disassociation. Cc: stable@vger.kernel.org Suggested-by: Andrea Arcangeli Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_main.c | 52 +++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index ea0bc6885517..32cc8fe7902f 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -160,6 +160,7 @@ struct ib_uverbs_file { struct mutex umap_lock; struct list_head umaps; + struct page *disassociate_page; struct idr idr; /* spinlock protects write access to idr */ diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 70b7d80431a9..db20b6e0f253 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -208,6 +208,9 @@ void ib_uverbs_release_file(struct kref *ref) kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file); put_device(&file->device->dev); + + if (file->disassociate_page) + __free_pages(file->disassociate_page, 0); kfree(file); } @@ -877,9 +880,50 @@ static void rdma_umap_close(struct vm_area_struct *vma) kfree(priv); } +/* + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. + */ +static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) +{ + struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; + struct rdma_umap_priv *priv = vmf->vma->vm_private_data; + vm_fault_t ret = 0; + + if (!priv) + return VM_FAULT_SIGBUS; + + /* Read only pages can just use the system zero page. */ + if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + vmf->page = ZERO_PAGE(vmf->vm_start); + get_page(vmf->page); + return 0; + } + + mutex_lock(&ufile->umap_lock); + if (!ufile->disassociate_page) + ufile->disassociate_page = + alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); + + if (ufile->disassociate_page) { + /* + * This VMA is forced to always be shared so this doesn't have + * to worry about COW. + */ + vmf->page = ufile->disassociate_page; + get_page(vmf->page); + } else { + ret = VM_FAULT_SIGBUS; + } + mutex_unlock(&ufile->umap_lock); + + return ret; +} + static const struct vm_operations_struct rdma_umap_ops = { .open = rdma_umap_open, .close = rdma_umap_close, + .fault = rdma_umap_fault, }; static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, @@ -889,6 +933,9 @@ static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, struct ib_uverbs_file *ufile = ucontext->ufile; struct rdma_umap_priv *priv; + if (!(vma->vm_flags & VM_SHARED)) + return ERR_PTR(-EINVAL); + if (vma->vm_end - vma->vm_start != size) return ERR_PTR(-EINVAL); @@ -992,7 +1039,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * at a time to get the lock ordering right. Typically there * will only be one mm, so no big deal. */ - down_write(&mm->mmap_sem); + down_read(&mm->mmap_sem); mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -1004,10 +1051,9 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); } mutex_unlock(&ufile->umap_lock); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); mmput(mm); } } -- cgit v1.2.3 From ddcdc368b1033e19fd3a5f750752e10e28a87826 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:29 +0300 Subject: RDMA/mlx5: Use get_zeroed_page() for clock_info get_zeroed_page() returns a virtual address for the page which is better than allocating a struct page and doing a permanent kmap on it. Cc: stable@vger.kernel.org Signed-off-by: Jason Gunthorpe Reviewed-by: Haggai Eran Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 5 ++-- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 30 ++++++++-------------- include/linux/mlx5/driver.h | 1 - 3 files changed, 14 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d3dd290ae1b1..da81402992bc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2070,11 +2070,12 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, return -EPERM; vma->vm_flags &= ~VM_MAYWRITE; - if (!dev->mdev->clock_info_page) + if (!dev->mdev->clock_info) return -EOPNOTSUPP; return rdma_user_mmap_page(&context->ibucontext, vma, - dev->mdev->clock_info_page, PAGE_SIZE); + virt_to_page(dev->mdev->clock_info), + PAGE_SIZE); } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index ca0ee9916e9e..0059b290e095 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -535,23 +535,16 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) do_div(ns, NSEC_PER_SEC / HZ); clock->overflow_period = ns; - mdev->clock_info_page = alloc_page(GFP_KERNEL); - if (mdev->clock_info_page) { - mdev->clock_info = kmap(mdev->clock_info_page); - if (!mdev->clock_info) { - __free_page(mdev->clock_info_page); - mlx5_core_warn(mdev, "failed to map clock page\n"); - } else { - mdev->clock_info->sign = 0; - mdev->clock_info->nsec = clock->tc.nsec; - mdev->clock_info->cycles = clock->tc.cycle_last; - mdev->clock_info->mask = clock->cycles.mask; - mdev->clock_info->mult = clock->nominal_c_mult; - mdev->clock_info->shift = clock->cycles.shift; - mdev->clock_info->frac = clock->tc.frac; - mdev->clock_info->overflow_period = - clock->overflow_period; - } + mdev->clock_info = + (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL); + if (mdev->clock_info) { + mdev->clock_info->nsec = clock->tc.nsec; + mdev->clock_info->cycles = clock->tc.cycle_last; + mdev->clock_info->mask = clock->cycles.mask; + mdev->clock_info->mult = clock->nominal_c_mult; + mdev->clock_info->shift = clock->cycles.shift; + mdev->clock_info->frac = clock->tc.frac; + mdev->clock_info->overflow_period = clock->overflow_period; } INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); @@ -599,8 +592,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) cancel_delayed_work_sync(&clock->overflow_work); if (mdev->clock_info) { - kunmap(mdev->clock_info_page); - __free_page(mdev->clock_info_page); + free_page((unsigned long)mdev->clock_info); mdev->clock_info = NULL; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d0729648844..9ffc53acaec1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,7 +681,6 @@ struct mlx5_core_dev { #endif struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; - struct page *clock_info_page; struct mlx5_fw_tracer *tracer; }; -- cgit v1.2.3 From 4eb6ab13b99148b5bf9bfdae7977fe139b4452f8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Apr 2019 14:07:30 +0300 Subject: RDMA: Remove rdma_user_mmap_page Upon further research drivers that want this should simply call the core function vm_insert_page(). The VMA holds a reference on the page and it will be automatically freed when the last reference drops. No need for disassociate to sequence the cleanup. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_main.c | 62 +++++++---------------------------- drivers/infiniband/hw/mlx5/main.c | 12 +++---- include/rdma/ib_verbs.h | 9 ----- 3 files changed, 17 insertions(+), 66 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index db20b6e0f253..7e767b94074a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -926,43 +926,32 @@ static const struct vm_operations_struct rdma_umap_ops = { .fault = rdma_umap_fault, }; -static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, - unsigned long size) +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. + */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct ib_uverbs_file *ufile = ucontext->ufile; struct rdma_umap_priv *priv; if (!(vma->vm_flags & VM_SHARED)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (vma->vm_end - vma->vm_start != size) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Driver is using this wrong, must be called by ib_uverbs_mmap */ if (WARN_ON(!vma->vm_file || vma->vm_file->private_data != ufile)) - return ERR_PTR(-EINVAL); + return -EINVAL; lockdep_assert_held(&ufile->device->disassociate_srcu); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) - return ERR_PTR(-ENOMEM); - return priv; -} - -/* - * Map IO memory into a process. This is to be called by drivers as part of - * their mmap() functions if they wish to send something like PCI-E BAR memory - * to userspace. - */ -int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, pgprot_t prot) -{ - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - - if (IS_ERR(priv)) - return PTR_ERR(priv); + return -ENOMEM; vma->vm_page_prot = prot; if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { @@ -975,35 +964,6 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, } EXPORT_SYMBOL(rdma_user_mmap_io); -/* - * The page case is here for a slightly different reason, the driver expects - * to be able to free the page it is sharing to user space when it destroys - * its ucontext, which means we need to zap the user space references. - * - * We could handle this differently by providing an API to allocate a shared - * page and then only freeing the shared page when the last ufile is - * destroyed. - */ -int rdma_user_mmap_page(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, struct page *page, - unsigned long size) -{ - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - - if (IS_ERR(priv)) - return PTR_ERR(priv); - - if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, - vma->vm_page_prot)) { - kfree(priv); - return -EAGAIN; - } - - rdma_umap_priv_init(priv, vma); - return 0; -} -EXPORT_SYMBOL(rdma_user_mmap_page); - void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index da81402992bc..239d70833afc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2060,22 +2060,22 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + if ((vma->vm_end - vma->vm_start != PAGE_SIZE) || + !(vma->vm_flags & VM_SHARED)) return -EINVAL; if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) return -EOPNOTSUPP; - if (vma->vm_flags & VM_WRITE) + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); if (!dev->mdev->clock_info) return -EOPNOTSUPP; - return rdma_user_mmap_page(&context->ibucontext, vma, - virt_to_page(dev->mdev->clock_info), - PAGE_SIZE); + return vm_insert_page(vma, vma->vm_start, + virt_to_page(dev->mdev->clock_info)); } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9b9e17bcc201..7ca908d5c0c3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2705,9 +2705,6 @@ void ib_set_device_ops(struct ib_device *device, #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, unsigned long pfn, unsigned long size, pgprot_t prot); -int rdma_user_mmap_page(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, struct page *page, - unsigned long size); #else static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, @@ -2716,12 +2713,6 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, { return -EINVAL; } -static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, struct page *page, - unsigned long size) -{ - return -EINVAL; -} #endif static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) -- cgit v1.2.3