From 968e78dd96443e2cc963c493070574778805e76a Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:11 +0200 Subject: IB/mlx5: Enhance UMR support to allow partial page table update The current UMR interface doesn't allow partial updates to a memory region's page tables. This patch changes the interface to allow that. It also changes the way the UMR operation validates the memory region's state. When set, IB_SEND_UMR_FAIL_IF_FREE will cause the UMR operation to fail if the MKEY is in the free state. When it is unchecked the operation will check that it isn't in the free state. Signed-off-by: Haggai Eran Signed-off-by: Shachar Raindel Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4f1c46f761..fa07bfda0e15 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -180,6 +180,15 @@ enum { MLX5_MKEY_MASK_FREE = 1ull << 29, }; +enum { + MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), + + MLX5_UMR_CHECK_NOT_FREE = (1 << 5), + MLX5_UMR_CHECK_FREE = (2 << 5), + + MLX5_UMR_INLINE = (1 << 7), +}; + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -776,6 +785,10 @@ struct mlx5_query_eq_mbox_out { struct mlx5_eq_context ctx; }; +enum { + MLX5_MKEY_STATUS_FREE = 1 << 6, +}; + struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. * bit 31 is always 0, -- cgit v1.2.3 From c1395a2a8c01e8a919e47d64eb3d23d00e824b8b Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:14 +0200 Subject: IB/mlx5: Add function to read WQE from user-space Add a helper function mlx5_ib_read_user_wqe to read information from user-space owned work queues. The function will be used in a later patch by the page-fault handling code in mlx5_ib. 
Signed-off-by: Haggai Eran [ Add stub for ib_umem_copy_from() for CONFIG_INFINIBAND_USER_MEM=n - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + drivers/infiniband/hw/mlx5/qp.c | 71 ++++++++++++++++++++++++++++++++++++ include/linux/mlx5/qp.h | 3 ++ include/rdma/ib_umem.h | 5 ++- 4 files changed, 80 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 53d19e6e69a4..14a0311eaa1c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -503,6 +503,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, int vector, struct ib_ucontext *context, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 36e2cfe1c2fe..9783c3342dbf 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -101,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); } +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. 
+ * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length) +{ + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = qp->umem; + u32 first_copy_length; + int wqe_length; + int ret; + + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } + + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); + + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); + if (ret) + return ret; + + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; + } + + if (wqe_length <= first_copy_length) + return first_copy_length; + + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; + + return wqe_length; +} + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3fa075daeb1d..67f4b9660b06 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -189,6 +189,9 @@ struct mlx5_wqe_ctrl_seg { __be32 imm; }; +#define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_DS_UNITS 16 + struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; u8 rsvd[12]; diff --git 
a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 45bb04bc88cd..a51f4091489a 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -98,7 +98,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, } static inline void ib_umem_release(struct ib_umem *umem) { } static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } - +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ -- cgit v1.2.3 From 6cb7ff3dcfe6aad6a36a0fd0e928b5bea4fabdd5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 Dec 2014 18:17:17 -0800 Subject: mlx5_core: Re-add MLX5_DEV_CAP_FLAG_ON_DMND_PG flag In commit 0c7aac854f52 ("net/mlx5_core: Remove unused dev cap enum fields"), the flag MLX5_DEV_CAP_FLAG_ON_DMND_PG was removed. Unfortunately the on-demand paging changes actually use it, so re-add the missing flag. Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fa07bfda0e15..096abe543d2c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -234,6 +234,7 @@ enum { MLX5_DEV_CAP_FLAG_APM = 1LL << 17, MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, + MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, -- cgit v1.2.3 From e420f0c0f3d1022789fcb59b2a0c4b979ce311ba Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:19 +0200 Subject: mlx5_core: Add support for page faults events and low level handling * Add a handler function pointer in the mlx5_core_qp struct for page fault events. Handle page fault events by calling the handler function, if not NULL. 
* Add on-demand paging capability query command. * Export command for resuming QPs after page faults. * Add various constants related to paging support. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 13 ++- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 40 +++++++++ drivers/net/ethernet/mellanox/mlx5/core/qp.c | 119 +++++++++++++++++++++++++++ include/linux/mlx5/device.h | 54 +++++++++++- include/linux/mlx5/driver.h | 12 +++ include/linux/mlx5/qp.h | 55 +++++++++++++ 6 files changed, 291 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ab684463780b..da82991239a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_CMD"; case MLX5_EVENT_TYPE_PAGE_REQUEST: return "MLX5_EVENT_TYPE_PAGE_REQUEST"; + case MLX5_EVENT_TYPE_PAGE_FAULT: + return "MLX5_EVENT_TYPE_PAGE_FAULT"; default: return "Unrecognized event"; } @@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } break; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + case MLX5_EVENT_TYPE_PAGE_FAULT: + mlx5_eq_pagefault(dev, eqe); + break; +#endif default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", @@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev) int mlx5_start_eqs(struct mlx5_core_dev *dev) { struct mlx5_eq_table *table = &dev->priv.eq_table; + u32 async_event_mask = MLX5_ASYNC_EVENT_MASK; int err; + if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); @@ -459,7 
+470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) mlx5_cmd_use_events(dev); err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, - MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, + MLX5_NUM_ASYNC_EQE, async_event_mask, "mlx5_async_eq", &dev->priv.uuari.uars[0]); if (err) { mlx5_core_warn(dev, "failed to create async EQ %d\n", err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 087c4c797deb..06f9036acd83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); } +int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps) +{ + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out; + int err; + + if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) + return -ENOTSUPP; + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + err = mlx5_cmd_status_to_err_v2(out); + if (err) { + mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err); + goto out; + } + + memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct), + sizeof(*caps)); + + mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n", + be32_to_cpu(caps->per_transport_caps.rc_odp_caps), + be32_to_cpu(caps->per_transport_caps.uc_odp_caps), + be32_to_cpu(caps->per_transport_caps.ud_odp_caps)); + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL(mlx5_query_odp_caps); + int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) { struct mlx5_cmd_init_hca_mbox_in 
in; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 5261a2b0da43..575d853dbe05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) mlx5_core_put_rsc(common); } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault; + int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK; + struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn); + struct mlx5_core_qp *qp = + container_of(common, struct mlx5_core_qp, common); + struct mlx5_pagefault pfault; + + if (!qp) { + mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n", + qpn); + return; + } + + pfault.event_subtype = eqe->sub_type; + pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) & + (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA); + pfault.bytes_committed = be32_to_cpu( + pf_eqe->bytes_committed); + + mlx5_core_dbg(dev, + "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n", + eqe->sub_type, pfault.flags); + + switch (eqe->sub_type) { + case MLX5_PFAULT_SUBTYPE_RDMA: + /* RDMA based event */ + pfault.rdma.r_key = + be32_to_cpu(pf_eqe->rdma.r_key); + pfault.rdma.packet_size = + be16_to_cpu(pf_eqe->rdma.packet_length); + pfault.rdma.rdma_op_len = + be32_to_cpu(pf_eqe->rdma.rdma_op_len); + pfault.rdma.rdma_va = + be64_to_cpu(pf_eqe->rdma.rdma_va); + mlx5_core_dbg(dev, + "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n", + qpn, pfault.rdma.r_key); + mlx5_core_dbg(dev, + "PAGE_FAULT: rdma_op_len: 0x%08x,\n", + pfault.rdma.rdma_op_len); + mlx5_core_dbg(dev, + "PAGE_FAULT: rdma_va: 0x%016llx,\n", + pfault.rdma.rdma_va); + mlx5_core_dbg(dev, + "PAGE_FAULT: bytes_committed: 0x%06x\n", + pfault.bytes_committed); + break; + + case MLX5_PFAULT_SUBTYPE_WQE: + /* WQE 
based event */ + pfault.wqe.wqe_index = + be16_to_cpu(pf_eqe->wqe.wqe_index); + pfault.wqe.packet_size = + be16_to_cpu(pf_eqe->wqe.packet_length); + mlx5_core_dbg(dev, + "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n", + qpn, pfault.wqe.wqe_index); + mlx5_core_dbg(dev, + "PAGE_FAULT: bytes_committed: 0x%06x\n", + pfault.bytes_committed); + break; + + default: + mlx5_core_warn(dev, + "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n", + eqe->sub_type, qpn); + /* Unsupported page faults should still be resolved by the + * page fault handler + */ + } + + if (qp->pfault_handler) { + qp->pfault_handler(qp, &pfault); + } else { + mlx5_core_err(dev, + "ODP event for QP %08x, without a fault handler in QP\n", + qpn); + /* Page fault will remain unresolved. QP will hang until it is + * destroyed + */ + } + + mlx5_core_put_rsc(common); +} +#endif + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, @@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) return err; } EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, + u8 flags, int error) +{ + struct mlx5_page_fault_resume_mbox_in in; + struct mlx5_page_fault_resume_mbox_out out; + int err; + + memset(&in, 0, sizeof(in)); + memset(&out, 0, sizeof(out)); + in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME); + in.hdr.opmod = 0; + flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR | + MLX5_PAGE_FAULT_RESUME_WRITE | + MLX5_PAGE_FAULT_RESUME_RDMA); + flags |= (error ? 
MLX5_PAGE_FAULT_RESUME_ERROR : 0); + in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) | + (flags << MLX5_QPN_BITS)); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); + if (err) + return err; + + if (out.hdr.status) + err = mlx5_cmd_status_to_err(&out.hdr); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); +#endif diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 096abe543d2c..70c28239e339 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -119,6 +119,15 @@ enum { MLX5_MAX_LOG_PKEY_TABLE = 5, }; +enum { + MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 +}; + +enum { + MLX5_PFAULT_SUBTYPE_WQE = 0, + MLX5_PFAULT_SUBTYPE_RDMA = 1, +}; + enum { MLX5_PERM_LOCAL_READ = 1 << 2, MLX5_PERM_LOCAL_WRITE = 1 << 3, @@ -215,6 +224,8 @@ enum mlx5_event { MLX5_EVENT_TYPE_CMD = 0x0a, MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, + + MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, }; enum { @@ -300,6 +311,8 @@ enum { enum { HCA_CAP_OPMOD_GET_MAX = 0, HCA_CAP_OPMOD_GET_CUR = 1, + HCA_CAP_OPMOD_GET_ODP_MAX = 4, + HCA_CAP_OPMOD_GET_ODP_CUR = 5 }; struct mlx5_inbox_hdr { @@ -329,6 +342,23 @@ struct mlx5_cmd_query_adapter_mbox_out { u8 vsd_psid[16]; }; +enum mlx5_odp_transport_cap_bits { + MLX5_ODP_SUPPORT_SEND = 1 << 31, + MLX5_ODP_SUPPORT_RECV = 1 << 30, + MLX5_ODP_SUPPORT_WRITE = 1 << 29, + MLX5_ODP_SUPPORT_READ = 1 << 28, +}; + +struct mlx5_odp_caps { + char reserved[0x10]; + struct { + __be32 rc_odp_caps; + __be32 uc_odp_caps; + __be32 ud_odp_caps; + } per_transport_caps; + char reserved2[0xe4]; +}; + struct mlx5_cmd_init_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd0[2]; @@ -449,6 +479,27 @@ struct mlx5_eqe_page_req { __be32 rsvd1[5]; }; +struct mlx5_eqe_page_fault { + __be32 bytes_committed; + union { + struct { + u16 reserved1; + __be16 wqe_index; + u16 reserved2; + __be16 packet_length; + u8 reserved3[12]; + } __packed wqe; + struct { + __be32 r_key; + u16 reserved1; + __be16 packet_length; + __be32 rdma_op_len; + __be64 
rdma_va; + } __packed rdma; + } __packed; + __be32 flags_qpn; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -460,6 +511,7 @@ union ev_data { struct mlx5_eqe_congestion cong; struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; + struct mlx5_eqe_page_fault page_fault; } __packed; struct mlx5_eqe { @@ -826,7 +878,7 @@ struct mlx5_query_special_ctxs_mbox_out { struct mlx5_create_mkey_mbox_in { struct mlx5_inbox_hdr hdr; __be32 input_mkey_index; - u8 rsvd0[4]; + __be32 flags; struct mlx5_mkey_seg seg; u8 rsvd1[16]; __be32 xlat_oct_act_size; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1bf41556b32..7088dcd19214 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -113,6 +113,13 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum mlx5_page_fault_resume_flags { + MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, + MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, + MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, + MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, +}; + enum dbg_rsc_type { MLX5_DBG_RSC_QP, MLX5_DBG_RSC_EQ, @@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); +#endif void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); @@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); +int mlx5_query_odp_caps(struct mlx5_core_dev 
*dev, + struct mlx5_odp_caps *odp_caps); static inline u32 mlx5_mkey_to_idx(u32 mkey) { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 67f4b9660b06..6b1d6f60c7e6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -50,6 +50,9 @@ #define MLX5_BSF_APPTAG_ESCAPE 0x1 #define MLX5_BSF_APPREF_ESCAPE 0x2 +#define MLX5_QPN_BITS 24 +#define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1) + enum mlx5_qp_optpar { MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MLX5_QP_OPTPAR_RRE = 1 << 1, @@ -363,9 +366,46 @@ struct mlx5_stride_block_ctrl_seg { __be16 num_entries; }; +enum mlx5_pagefault_flags { + MLX5_PFAULT_REQUESTOR = 1 << 0, + MLX5_PFAULT_WRITE = 1 << 1, + MLX5_PFAULT_RDMA = 1 << 2, +}; + +/* Contains the details of a pagefault. */ +struct mlx5_pagefault { + u32 bytes_committed; + u8 event_subtype; + enum mlx5_pagefault_flags flags; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. 
+ */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; +}; + struct mlx5_core_qp { struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); + void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *); int qpn; struct mlx5_rsc_debug *dbg; int pid; @@ -533,6 +573,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.mr_table.tree, key); } +struct mlx5_page_fault_resume_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 flags_qpn; + u8 reserved[4]; +}; + +struct mlx5_page_fault_resume_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, @@ -552,6 +603,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev); void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, + u8 context, int error); +#endif static inline const char *mlx5_qp_type_str(int type) { -- cgit v1.2.3 From cc149f751b75211df8c41fcd60bd0006e6143ed6 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:21 +0200 Subject: IB/mlx5: Changes in memory region creation to support on-demand paging This patch wraps together several changes needed for on-demand paging support in the mlx5_ib_populate_pas function, and when registering memory regions. * Instead of accepting a UMR bit telling the function to enable all access flags, the function now accepts the access flags themselves. * For on-demand paging memory regions, fill the memory tables from the correct list, and enable/disable the access flags per-page according to whether the page is present. 
* A new bit is set to enable writing of access flags when using the firmware create_mkey command. * Disable contig pages when on-demand paging is enabled. In addition the patch changes the UMR code to use PTR_ALIGN instead of our own macro. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mem.c | 58 ++++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 12 +++++++- drivers/infiniband/hw/mlx5/mr.c | 33 +++++++++++--------- include/linux/mlx5/device.h | 3 ++ 4 files changed, 88 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index dae07eae9507..5f7b30147180 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_ib.h" /* @umem: umem object to scan @@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int entry; unsigned long page_shift = ilog2(umem->page_size); + /* With ODP we must always match OS page size. */ + if (umem->odp_data) { + *count = ib_umem_page_count(umem); + *shift = PAGE_SHIFT; + *ncont = *count; + if (order) + *order = ilog2(roundup_pow_of_two(*count)); + + return; + } + addr = addr >> page_shift; tmp = (unsigned long)addr; m = find_first_bit(&tmp, sizeof(tmp)); @@ -108,8 +120,32 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, *count = i; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + +/* + * Populate the given array with bus addresses from the umem. 
+ * + * dev - mlx5_ib device + * umem - umem to use to fill the pages + * page_shift - determines the page size used in the resulting array + * pas - bus addresses array to fill + * access_flags - access flags to set on all present pages. + use enum mlx5_ib_mtt_access_flags for this. + */ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int umr) + int page_shift, __be64 *pas, int access_flags) { unsigned long umem_page_shift = ilog2(umem->page_size); int shift = page_shift - umem_page_shift; @@ -120,6 +156,23 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int len; struct scatterlist *sg; int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + int num_pages = ib_umem_num_pages(umem); + + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { @@ -128,8 +181,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for (k = 0; k < len; k++) { if (!(i & mask)) { cur = base + (k << umem_page_shift); - if (umr) - cur |= 3; + cur |= access_flags; pas[i >> shift] = cpu_to_be64(cur); mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index cc50fce8cca7..83c1690e9dd0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -268,6 +268,13 @@ struct mlx5_ib_xrcd { u32 xrcdn; }; +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_core_mr mmr; @@ -552,7 +559,7 @@ void 
mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order); void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int umr); + int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); @@ -588,4 +595,7 @@ static inline u8 convert_access(int acc) MLX5_PERM_LOCAL_READ; } +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 2de4f4448f8a..49fc3ca735a4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -48,13 +48,6 @@ enum { MLX5_UMR_ALIGN = 2048 }; -static __be64 *mr_align(__be64 *ptr, int align) -{ - unsigned long mask = align - 1; - - return (__be64 *)(((unsigned long)ptr + mask) & ~mask); -} - static int order2idx(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; @@ -669,7 +662,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size) static int use_umr(int order) { - return order <= 17; + return order <= MLX5_MAX_UMR_SHIFT; } static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, @@ -747,8 +740,9 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct ib_send_wr wr, *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; - int size = sizeof(u64) * npages; + int size; __be64 *mr_pas; + __be64 *pas; dma_addr_t dma; int err = 0; int i; @@ -768,17 +762,22 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. 
*/ + size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); if (!mr_pas) { err = -ENOMEM; goto free_mr; } - mlx5_ib_populate_pas(dev, umem, page_shift, - mr_align(mr_pas, MLX5_UMR_ALIGN), 1); + pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); - dma = dma_map_single(ddev, mr_align(mr_pas, MLX5_UMR_ALIGN), size, - DMA_TO_DEVICE); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, dma)) { err = -ENOMEM; goto free_pas; @@ -833,6 +832,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, struct mlx5_ib_mr *mr; int inlen; int err; + bool pg_cap = !!(dev->mdev->caps.gen.flags & + MLX5_DEV_CAP_FLAG_ON_DMND_PG); mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) @@ -844,8 +845,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, err = -ENOMEM; goto err_1; } - mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); + mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + * in the page list submitted with the command. */ + in->flags = pg_cap ? 
cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; in->seg.flags = convert_access(access_flags) | MLX5_ACCESS_MODE_MTT; in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 70c28239e339..64512a7354cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -198,6 +198,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_MTT_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, -- cgit v1.2.3 From 832a6b06ab5e13c228fc27e333ad360aa03ace6f Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:22 +0200 Subject: IB/mlx5: Add mlx5_ib_update_mtt to update page tables after creation The new function allows updating the page tables of a memory region after it was created. This can be used to handle page faults and page invalidations. Since mlx5_ib_update_mtt will need to work from within page invalidation, it must not block on memory allocation. It employs an atomic memory allocation mechanism that is used as a fallback when kmalloc(GFP_ATOMIC) fails. In order to reuse code from mlx5_ib_populate_pas, the patch splits this function and adds the needed parameters. 
Signed-off-by: Haggai Eran Signed-off-by: Shachar Raindel Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mem.c | 19 +++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 ++ drivers/infiniband/hw/mlx5/mr.c | 132 ++++++++++++++++++++++++++++++++++- include/linux/mlx5/device.h | 1 + 4 files changed, 149 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 5f7b30147180..b56e4c5593ee 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -140,12 +140,16 @@ static u64 umem_dma_to_mtt(dma_addr_t umem_dma) * dev - mlx5_ib device * umem - umem to use to fill the pages * page_shift - determines the page size used in the resulting array + * offset - offset into the umem to start from, + * only implemented for ODP umems + * num_pages - total number of pages to fill * pas - bus addresses array to fill * access_flags - access flags to set on all present pages. use enum mlx5_ib_mtt_access_flags for this. 
*/ -void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int access_flags) +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags) { unsigned long umem_page_shift = ilog2(umem->page_size); int shift = page_shift - umem_page_shift; @@ -160,13 +164,11 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, const bool odp = umem->odp_data != NULL; if (odp) { - int num_pages = ib_umem_num_pages(umem); - WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); for (i = 0; i < num_pages; ++i) { - dma_addr_t pa = umem->odp_data->dma_list[i]; + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); } @@ -194,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, } } +void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, __be64 *pas, int access_flags) +{ + return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, + ib_umem_num_pages(umem), pas, + access_flags); +} int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) { u64 page_size; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 83c1690e9dd0..6856e27bfb6a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -527,6 +527,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, + int npages, int zap); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); int mlx5_ib_destroy_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, @@ -558,6 +560,9 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); void 
mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 49fc3ca735a4..38b06267798e 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -44,9 +44,13 @@ enum { MAX_PENDING_REG_MR = 8, }; -enum { - MLX5_UMR_ALIGN = 2048 -}; +#define MLX5_UMR_ALIGN 2048 +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static __be64 mlx5_ib_update_mtt_emergency_buffer[ + MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] + __aligned(MLX5_UMR_ALIGN); +static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); +#endif static int order2idx(struct mlx5_ib_dev *dev, int order) { @@ -822,6 +826,128 @@ free_mr: return mr; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, + int zap) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_umem *umem = mr->umem; + int size; + __be64 *pas; + dma_addr_t dma; + struct ib_send_wr wr, *bad; + struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; + struct ib_sge sg; + int err = 0; + const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); + const int page_index_mask = page_index_alignment - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + int use_emergency_buf = 0; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align 
the offset and length accordingly */ + if (start_page_index & page_index_mask) { + npages += start_page_index & page_index_mask; + start_page_index &= ~page_index_mask; + } + + pages_to_map = ALIGN(npages, page_index_alignment); + + if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) + return -EINVAL; + + size = sizeof(u64) * pages_to_map; + size = min_t(int, PAGE_SIZE, size); + /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim + * code, when we are called from an invalidation. The pas buffer must + * be 2k-aligned for Connect-IB. */ + pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); + if (!pas) { + mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); + pas = mlx5_ib_update_mtt_emergency_buffer; + size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; + use_emergency_buf = 1; + mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + memset(pas, 0, size); + } + pages_iter = size / sizeof(u64); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + err = -ENOMEM; + goto free_pas; + } + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, start_page_index += pages_iter) { + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + + npages = min_t(size_t, + pages_iter, + ib_umem_num_pages(umem) - start_page_index); + + if (!zap) { + __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, + start_page_index, npages, pas, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages brought from the + * umem. 
*/ + memset(pas + npages, 0, size - npages * sizeof(u64)); + } + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + memset(&wr, 0, sizeof(wr)); + wr.wr_id = (u64)(unsigned long)&umr_context; + + sg.addr = dma; + sg.length = ALIGN(npages * sizeof(u64), + MLX5_UMR_MTT_ALIGNMENT); + sg.lkey = dev->umrc.mr->lkey; + + wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_MTT; + wr.sg_list = &sg; + wr.num_sge = 1; + wr.opcode = MLX5_IB_WR_UMR; + umrwr->npages = sg.length / sizeof(u64); + umrwr->page_shift = PAGE_SHIFT; + umrwr->mkey = mr->mmr.key; + umrwr->target.offset = start_page_index; + + mlx5_ib_init_umr_context(&umr_context); + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr, &bad); + if (err) { + mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_err(dev, "UMR completion failed, code %d\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + if (!use_emergency_buf) + free_page((unsigned long)pas); + else + mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + + return err; +} +#endif + static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, u64 length, struct ib_umem *umem, int npages, int page_shift, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 64512a7354cb..4e5bd813bb9a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -200,6 +200,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) +#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, -- cgit v1.2.3 From 6aec21f6a8322fa8d43df3ea7f051dfd8967f1b9 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:23 +0200 Subject: IB/mlx5: Page faults handling infrastructure * Refactor MR 
registration and cleanup, and fix reg_pages accounting. * Create a work queue to handle page fault events in a kthread context. * Register a fault handler to get events from the core for each QP. The registered fault handler is empty in this patch, and only a later patch implements it. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/main.c | 31 +++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 67 +++++++++++++++- drivers/infiniband/hw/mlx5/mr.c | 45 +++++++---- drivers/infiniband/hw/mlx5/odp.c | 145 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/qp.c | 26 ++++++- include/linux/mlx5/driver.h | 2 +- 6 files changed, 294 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e6d775f2446d..a801baa79c8e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -864,7 +864,7 @@ static ssize_t show_reg_pages(struct device *device, struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); + return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, @@ -1389,16 +1389,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_eqs; mutex_init(&dev->cap_mask_mutex); - spin_lock_init(&dev->mr_lock); err = create_dev_resources(&dev->devr); if (err) goto err_eqs; - err = ib_register_device(&dev->ib_dev, NULL); + err = mlx5_ib_odp_init_one(dev); if (err) goto err_rsrc; + err = ib_register_device(&dev->ib_dev, NULL); + if (err) + goto err_odp; + err = create_umr_res(dev); if (err) goto err_dev; @@ -1420,6 +1423,9 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); +err_odp: + mlx5_ib_odp_remove_one(dev); + err_rsrc: destroy_dev_resources(&dev->devr); @@ -1435,8 
+1441,10 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; + ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); + mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); free_comp_eqs(dev); ib_dealloc_device(&dev->ib_dev); @@ -1450,15 +1458,30 @@ static struct mlx5_interface mlx5_ib_interface = { static int __init mlx5_ib_init(void) { + int err; + if (deprecated_prof_sel != 2) pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); - return mlx5_register_interface(&mlx5_ib_interface); + err = mlx5_ib_odp_init(); + if (err) + return err; + + err = mlx5_register_interface(&mlx5_ib_interface); + if (err) + goto clean_odp; + + return err; + +clean_odp: + mlx5_ib_odp_cleanup(); + return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_ib_odp_cleanup(); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6856e27bfb6a..c6ceec3e3d6a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -149,6 +149,29 @@ enum { MLX5_QP_EMPTY }; +/* + * Connect-IB can trigger up to four concurrent pagefaults + * per-QP. 
+ */ +enum mlx5_ib_pagefault_context { + MLX5_IB_PAGEFAULT_RESPONDER_READ, + MLX5_IB_PAGEFAULT_REQUESTOR_READ, + MLX5_IB_PAGEFAULT_RESPONDER_WRITE, + MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, + MLX5_IB_PAGEFAULT_CONTEXTS +}; + +static inline enum mlx5_ib_pagefault_context + mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) +{ + return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); +} + +struct mlx5_ib_pfault { + struct work_struct work; + struct mlx5_pagefault mpfault; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; struct mlx5_core_qp mqp; @@ -194,6 +217,21 @@ struct mlx5_ib_qp { /* Store signature errors */ bool signature_en; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * A flag that is true for QP's that are in a state that doesn't + * allow page faults, and shouldn't schedule any more faults. + */ + int disable_page_faults; + /* + * The disable_page_faults_lock protects a QP's disable_page_faults + * field, allowing for a thread to atomically check whether the QP + * allows page faults, and if so schedule a page fault. + */ + spinlock_t disable_page_faults_lock; + struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; +#endif }; struct mlx5_ib_cq_buf { @@ -392,13 +430,17 @@ struct mlx5_ib_dev { struct umr_common umrc; /* sync used page count stats */ - spinlock_t mr_lock; struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; struct timer_list delay_timer; int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. 
+ */ + struct srcu_struct mr_srcu; #endif }; @@ -575,12 +617,33 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +extern struct workqueue_struct *mlx5_ib_page_fault_wq; + int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); -#else +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault); +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) { return 0; } + +static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} + #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void init_query_mad(struct ib_smp *mad) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 38b06267798e..922ac85b7198 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -52,6 +52,8 @@ static __be64 mlx5_ib_update_mtt_emergency_buffer[ static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); #endif +static int clean_mr(struct mlx5_ib_mr *mr); + static int order2idx(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; @@ -1049,6 +1051,10 
@@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { + err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); + goto error; } if (!mr) @@ -1064,9 +1070,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->umem = umem; mr->npages = npages; - spin_lock(&dev->mr_lock); - dev->mdev->priv.reg_pages += npages; - spin_unlock(&dev->mr_lock); + atomic_add(npages, &dev->mdev->priv.reg_pages); mr->ibmr.lkey = mr->mmr.key; mr->ibmr.rkey = mr->mmr.key; @@ -1110,12 +1114,9 @@ error: return err; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int clean_mr(struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); - struct ib_umem *umem = mr->umem; - int npages = mr->npages; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int umred = mr->umred; int err; @@ -1135,16 +1136,32 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) free_cached_mr(dev, mr); } + if (!umred) + kfree(mr); + + return 0; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem) + /* Wait for all running page-fault handlers to finish. 
*/ + synchronize_srcu(&dev->mr_srcu); +#endif + + clean_mr(mr); + if (umem) { ib_umem_release(umem); - spin_lock(&dev->mr_lock); - dev->mdev->priv.reg_pages -= npages; - spin_unlock(&dev->mr_lock); + atomic_sub(npages, &dev->mdev->priv.reg_pages); } - if (!umred) - kfree(mr); - return 0; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 66c39ee16aff..63bbdba396f1 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -32,6 +32,8 @@ #include "mlx5_ib.h" +struct workqueue_struct *mlx5_ib_page_fault_wq; + #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ @@ -58,3 +60,146 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) out: return err; } + +static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, + u32 key) +{ + u32 base_key = mlx5_base_mkey(key); + struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); + + if (!mmr || mmr->key != key) + return NULL; + + return container_of(mmr, struct mlx5_ib_mr, mmr); +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + int error) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + pfault->mpfault.flags, + error); + if (ret) + pr_err("Failed to resolve the page fault on QP 0x%x\n", + qp->mqp.qpn); +} + +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + u8 event_subtype = pfault->mpfault.event_subtype; + + switch (event_subtype) { + default: + pr_warn("Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(qp, pfault, 1); + break; + } +} + +static void mlx5_ib_qp_pfault_action(struct work_struct *work) +{ + struct mlx5_ib_pfault *pfault = container_of(work, + struct mlx5_ib_pfault, + 
work); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(&pfault->mpfault); + struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, + pagefaults[context]); + mlx5_ib_mr_pfault_handler(qp, pfault); +} + +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 1; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); + + /* + * Note that at this point, we are guaranteed that no more + * work queue elements will be posted to the work queue with + * the QP we are closing. + */ + flush_workqueue(mlx5_ib_page_fault_wq); +} + +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->disable_page_faults_lock, flags); + qp->disable_page_faults = 0; + spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); +} + +static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, + struct mlx5_pagefault *pfault) +{ + /* + * Note that we will only get one fault event per QP per context + * (responder/initiator, read/write), until we resolve the page fault + * with the mlx5_ib_page_fault_resume command. Since this function is + * called from within the work element, there is no risk of missing + * events.
+ */ + struct mlx5_ib_qp *mibqp = to_mibqp(qp); + enum mlx5_ib_pagefault_context context = + mlx5_ib_get_pagefault_context(pfault); + struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; + + qp_pfault->mpfault = *pfault; + + /* No need to stop interrupts here since we are in an interrupt */ + spin_lock(&mibqp->disable_page_faults_lock); + if (!mibqp->disable_page_faults) + queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); + spin_unlock(&mibqp->disable_page_faults_lock); +} + +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) +{ + int i; + + qp->disable_page_faults = 1; + spin_lock_init(&qp->disable_page_faults_lock); + + qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + + for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) + INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); +} + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +{ + int ret; + + ret = init_srcu_struct(&ibdev->mr_srcu); + if (ret) + return ret; + + return 0; +} + +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +{ + cleanup_srcu_struct(&ibdev->mr_srcu); +} + +int __init mlx5_ib_odp_init(void) +{ + mlx5_ib_page_fault_wq = + create_singlethread_workqueue("mlx5_ib_page_faults"); + if (!mlx5_ib_page_fault_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_ib_odp_cleanup(void) +{ + destroy_workqueue(mlx5_ib_page_fault_wq); +} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 9783c3342dbf..be0cd358b080 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -876,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, int inlen = sizeof(*in); int err; + mlx5_ib_odp_create_qp(qp); + gen = &dev->mdev->caps.gen; mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -1160,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) 
{ + mlx5_ib_qp_disable_pagefaults(qp); if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", qp->mqp.qpn); + } get_cqs(qp, &send_cq, &recv_cq); @@ -1712,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) goto out; + /* If moving to a reset or error state, we must disable page faults on + * this QP and flush all current page faults. Otherwise a stale page + * fault may attempt to work on this QP after it is reset and moved + * again to RTS, and may cause the driver and the device to get out of + * sync. */ + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + mlx5_ib_qp_disable_pagefaults(qp); + optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); @@ -1721,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (err) goto out; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + mlx5_ib_qp_enable_pagefaults(qp); + qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) @@ -3026,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx5_state; int err = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + mutex_lock(&qp->mutex); outb = kzalloc(sizeof(*outb), GFP_KERNEL); if (!outb) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7088dcd19214..166d9315fe4b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -474,7 +474,7 @@ struct mlx5_priv { struct workqueue_struct *pg_wq; struct rb_root page_root; int fw_pages; - int reg_pages; + atomic_t reg_pages; struct list_head free_list; struct mlx5_core_health health; -- cgit v1.2.3 From 7bdf65d411c1715d695be0d9a555d7f48d0a7220 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:24 +0200 Subject: IB/mlx5: Handle page faults This patch implements a page fault handler (leaving the pages pinned for the time being). The page fault handler handles initiator and responder page faults for UD/RC transports, for send/receive operations, as well as RDMA read/write initiator support. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/odp.c | 408 +++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/qp.h | 7 + 2 files changed, 415 insertions(+) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 63bbdba396f1..bd1dbe5ebc15 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -30,6 +30,9 @@ * SOFTWARE. */ +#include <rdma/ib_umem.h> +#include <rdma/ib_umem_odp.h> + #include "mlx5_ib.h" struct workqueue_struct *mlx5_ib_page_fault_wq; @@ -85,12 +88,417 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, qp->mqp.qpn); } +/* + * Handle a single data segment in a page-fault WQE. + * + * Returns number of pages retrieved on success. The caller will continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it.
+ * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling and possibly move the QP to an error state. + * On other errors the QP should also be closed with an error. + */ +static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, + u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_mapped) +{ + struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); + int srcu_key; + unsigned int current_seq; + u64 start_idx; + int npages = 0, ret = 0; + struct mlx5_ib_mr *mr; + u64 access_mask = ODP_READ_ALLOWED_BIT; + + srcu_key = srcu_read_lock(&mib_dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + /* + * If we didn't find the MR, it means the MR was closed while we were + * handling the ODP event. In this case we return -EFAULT so that the + * QP will be closed. + */ + if (!mr || !mr->ibmr.pd) { + pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); + ret = -EFAULT; + goto srcu_unlock; + } + if (!mr->umem->odp_data) { + pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += + (bcnt - pfault->mpfault.bytes_committed); + goto srcu_unlock; + } + if (mr->ibmr.pd != qp->ibqp.pd) { + pr_err("Page-fault with different PDs for QP and MR.\n"); + ret = -EFAULT; + goto srcu_unlock; + } + + current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); + + /* + * Avoid branches - this code will perform correctly + * in all iterations (in iteration 2 and above, + * bytes_committed == 0). 
+ */ + io_virt += pfault->mpfault.bytes_committed; + bcnt -= pfault->mpfault.bytes_committed; + + start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + + if (mr->umem->writable) + access_mask |= ODP_WRITE_ALLOWED_BIT; + npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, + access_mask, current_seq); + if (npages < 0) { + ret = npages; + goto srcu_unlock; + } + + if (npages > 0) { + mutex_lock(&mr->umem->odp_data->umem_mutex); + /* + * No need to check whether the MTTs really belong to + * this MR, since ib_umem_odp_map_dma_pages already + * checks this. + */ + ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + mutex_unlock(&mr->umem->odp_data->umem_mutex); + if (ret < 0) { + pr_err("Failed to update mkey page tables\n"); + goto srcu_unlock; + } + + if (bytes_mapped) { + u32 new_mappings = npages * PAGE_SIZE - + (io_virt - round_down(io_virt, PAGE_SIZE)); + *bytes_mapped += min_t(u32, new_mappings, bcnt); + } + } + +srcu_unlock: + srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); + pfault->mpfault.bytes_committed = 0; + return ret ? ret : npages; +} + +/** + * Parse a series of data segments for page fault handling. + * + * @qp the QP on which the fault occurred. + * @pfault contains page fault information. + * @wqe points at the first data segment in the WQE. + * @wqe_end points after the end of the WQE. + * @bytes_mapped receives the number of bytes that the function was able to + * map. This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes receives the total data size of this WQE in bytes (minus + * the committed bytes). + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. 
+ */ +static int pagefault_data_segments(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault, void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, int receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + /* Skip SRQ next-WQE segment. */ + if (receive_queue && qp->ibqp.srq) + wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { + pfault->mpfault.bytes_committed -= + min_t(size_t, bcnt, + pfault->mpfault.bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(qp, pfault, key, io_virt, + bcnt, bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. 
+ */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + unsigned ds, opcode; +#if defined(DEBUG) + u32 ctrl_wqe_index, ctrl_qpn; +#endif + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", + wqe_index, qp->mqp.qpn); + return -EFAULT; + } + +#if defined(DEBUG) + ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_WQE_INDEX_MASK) >> + MLX5_WQE_CTRL_WQE_INDEX_SHIFT; + if (wqe_index != ctrl_wqe_index) { + mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_wqe_index); + return -EFAULT; + } + + ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> + MLX5_WQE_CTRL_QPN_SHIFT; + if (qp->mqp.qpn != ctrl_qpn) { + mlx5_ib_err(dev, "Got WQE with incorrect QP number. 
wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", + wqe_index, qp->mqp.qpn, + ctrl_qpn); + return -EFAULT; + } +#endif /* DEBUG */ + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + case MLX5_OPCODE_SEND_INVAL: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + break; + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_WRITE)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_RDMA_READ: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_READ)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + case IB_QPT_UD: + switch (opcode) { + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_IMM: + if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & + IB_ODP_SUPPORT_SEND)) + goto invalid_transport_or_opcode; + *wqe += sizeof(struct mlx5_wqe_datagram_seg); + break; + default: + goto invalid_transport_or_opcode; + } + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", + qp->ibqp.qp_type, opcode); + return -EFAULT; + } + + return 0; +} + +/* + * Parse responder WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. 
+ */ +static int mlx5_ib_mr_responder_pfault_handler( + struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, + void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->ibqp.srq) { + mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); + return -EFAULT; + } + + if (qp->wq_sig) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & + IB_ODP_SUPPORT_RECV)) + goto invalid_transport_or_opcode; + break; + default: +invalid_transport_or_opcode: + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + + return 0; +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); + int ret; + void *wqe, *wqe_end; + u32 bytes_mapped, total_wqe_bytes; + char *buffer = NULL; + int resume_with_error = 0; + u16 wqe_index = pfault->mpfault.wqe.wqe_index; + int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, + PAGE_SIZE); + if (ret < 0) { + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", + -ret, wqe_index, qp->mqp.qpn); + resume_with_error = 1; + goto resolve_page_fault; + } + + wqe = buffer; + if (requestor) + ret = mlx5_ib_mr_initiator_pfault_handler(qp, 
pfault, &wqe, + &wqe_end, ret); + else + ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + &wqe_end, ret); + if (ret < 0) { + resume_with_error = 1; + goto resolve_page_fault; + } + + if (wqe >= wqe_end) { + mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); + resume_with_error = 1; + goto resolve_page_fault; + } + + ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, + &total_wqe_bytes, !requestor); + if (ret == -EAGAIN) { + goto resolve_page_fault; + } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { + mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", + -ret); + resume_with_error = 1; + goto resolve_page_fault; + } + +resolve_page_fault: + mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", + qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + + free_page((unsigned long)buffer); +} + void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault) { u8 event_subtype = pfault->mpfault.event_subtype; switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + break; default: pr_warn("Invalid page fault event subtype: 0x%x\n", event_subtype); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6b1d6f60c7e6..61f7a342d1bf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -193,7 +193,12 @@ struct mlx5_wqe_ctrl_seg { }; #define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 +#define MLX5_WQE_CTRL_QPN_SHIFT 8 #define MLX5_WQE_DS_UNITS 16 +#define MLX5_WQE_CTRL_OPCODE_MASK 0xff +#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 +#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; @@ -298,6 +303,8 @@ struct mlx5_wqe_signature_seg { u8 rsvd1[11]; }; +#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff + struct mlx5_wqe_inline_seg { __be32 byte_count; }; -- cgit v1.2.3