Diffstat (limited to 'drivers/infiniband')
73 files changed, 2605 insertions, 2159 deletions
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 68721ff10255..308155937713 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -479,13 +479,20 @@ static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; - if (sa->sa_family == AF_INET) - return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr, - (char *)&((struct sockaddr_in *)sb)->sin_addr, + if (sa->sa_family == AF_INET && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { + return memcmp(&((struct sockaddr_in *)sa)->sin_addr, + &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); + } + + if (sa->sa_family == AF_INET6 && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); + } - return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, - &((struct sockaddr_in6 *)sb)->sin6_addr); + return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) @@ -2819,8 +2826,8 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) } EXPORT_SYMBOL(rdma_set_min_rnr_timer); -static void route_set_path_rec_inbound(struct cma_work *work, - struct sa_path_rec *path_rec) +static int route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; @@ -2828,14 +2835,15 @@ static void route_set_path_rec_inbound(struct cma_work *work, route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) - return; + return -ENOMEM; } *route->path_rec_inbound = *path_rec; + return 0; } -static void route_set_path_rec_outbound(struct cma_work *work, - struct sa_path_rec *path_rec) +static int route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; @@ -2843,14 +2851,15 @@ static void route_set_path_rec_outbound(struct cma_work *work, route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) - return; + return -ENOMEM; } *route->path_rec_outbound = *path_rec; + return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, - int num_prs, void *context) + unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; @@ -2865,13 +2874,15 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) - route_set_path_rec_inbound(work, &path_rec[i]); + status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) - route_set_path_rec_outbound(work, &path_rec[i]); - } - if (!route->path_rec) { - status = -EINVAL; - goto fail; + status = route_set_path_rec_outbound(work, + &path_rec[i]); + else + status = -EINVAL; + + if (status) + goto fail; } route->num_pri_alt_paths = 1; @@ -3541,121 +3552,6 @@ err: return ret; } -static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr) -{ - struct sockaddr_storage zero_sock = {}; - - if (src_addr && src_addr->sa_family) - return rdma_bind_addr(id, src_addr); - - /* - * When the src_addr is not specified, automatically supply an any addr - */ - 
zero_sock.ss_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = - (struct sockaddr_in6 *)&zero_sock; - struct sockaddr_in6 *dst_addr6 = - (struct sockaddr_in6 *)dst_addr; - - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = - dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *)&zero_sock)->sib_pkey = - ((struct sockaddr_ib *)dst_addr)->sib_pkey; - } - return rdma_bind_addr(id, (struct sockaddr *)&zero_sock); -} - -/* - * If required, resolve the source address for bind and leave the id_priv in - * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior - * calls made by ULP, a previously bound ID will not be re-bound and src_addr is - * ignored. - */ -static int resolve_prepare_src(struct rdma_id_private *id_priv, - struct sockaddr *src_addr, - const struct sockaddr *dst_addr) -{ - int ret; - - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - /* For a well behaved ULP state will be RDMA_CM_IDLE */ - ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); - if (ret) - goto err_dst; - if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, - RDMA_CM_ADDR_QUERY))) { - ret = -EINVAL; - goto err_dst; - } - } - - if (cma_family(id_priv) != dst_addr->sa_family) { - ret = -EINVAL; - goto err_state; - } - return 0; - -err_state: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); -err_dst: - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return ret; -} - -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, unsigned long timeout_ms) -{ - struct rdma_id_private *id_priv = - container_of(id, struct rdma_id_private, id); - int ret; - - ret = resolve_prepare_src(id_priv, src_addr, dst_addr); - if (ret) - return ret; - - if (cma_any_addr(dst_addr)) { - ret = cma_resolve_loopback(id_priv); - } else { - if (dst_addr->sa_family == AF_IB) { - ret = cma_resolve_ib_addr(id_priv); - } else { - /* - * The FSM can return back to RDMA_CM_ADDR_BOUND after - * rdma_resolve_ip() is called, eg through the error - * path in addr_handler(). If this happens the existing - * request must be canceled before issuing a new one. - * Since canceling a request is a bit slow and this - * oddball path is rare, keep track once a request has - * been issued. The track turns out to be a permanent - * state since this is the only cancel as it is - * immediately before rdma_resolve_ip(). 
- */ - if (id_priv->used_resolve_ip) - rdma_addr_cancel(&id->route.addr.dev_addr); - else - id_priv->used_resolve_ip = 1; - ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, - &id->route.addr.dev_addr, - timeout_ms, addr_handler, - false, id_priv); - } - } - if (ret) - goto err; - - return 0; -err: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_addr); - int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; @@ -4058,27 +3954,26 @@ err: } EXPORT_SYMBOL(rdma_listen); -int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, + struct sockaddr *addr, const struct sockaddr *daddr) { - struct rdma_id_private *id_priv; + struct sockaddr *id_daddr; int ret; - struct sockaddr *daddr; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; - id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; - ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; @@ -4098,8 +3993,10 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } #endif } - daddr = cma_dst_addr(id_priv); - daddr->sa_family = addr->sa_family; + id_daddr = cma_dst_addr(id_priv); + if (daddr != id_daddr) + memcpy(id_daddr, daddr, rdma_addr_size(addr)); + id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) @@ -4115,6 +4012,127 @@ err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + } + return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); +} + +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. 
+ */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) + return -EINVAL; + + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + ret = -EINVAL; + goto err_state; + } + return 0; + +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; + + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); +} EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 0de83d9a4985..59179cfc20ef 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -106,7 +106,7 @@ struct ib_sa_device { struct ib_sa_query { void (*callback)(struct ib_sa_query *sa_query, int status, - int num_prs, struct ib_sa_mad *mad); + struct ib_sa_mad *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -118,12 +118,6 @@ struct ib_sa_query { u32 seq; /* Local svc request sequence number */ unsigned long timeout; /* Local svc timeout */ u8 path_use; /* How will the pathrecord be used */ - - /* A separate buffer to save pathrecords of a response, as in cases - * like IB/netlink, mulptiple pathrecords are supported, so that - * mad->data is not large enough to hold them - */ - void *resp_pr_data; }; #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 @@ -132,7 +126,7 @@ struct ib_sa_query { struct ib_sa_path_query { void (*callback)(int status, struct sa_path_rec *rec, - int num_paths, void *context); + unsigned int num_paths, void *context); void *context; struct 
ib_sa_query sa_query; struct sa_path_rec *conv_pr; @@ -690,6 +684,8 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 + static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) { query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; @@ -874,30 +870,21 @@ static void send_handler(struct ib_mad_agent *agent, static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, const struct nlmsghdr *nlh) { - struct ib_path_rec_data *srec, *drec; + struct sa_path_rec recs[RDMA_PRIMARY_PATH_MAX_REC_NUM]; struct ib_sa_path_query *path_query; + struct ib_path_rec_data *rec_data; struct ib_mad_send_wc mad_send_wc; const struct nlattr *head, *curr; struct ib_sa_mad *mad = NULL; - int len, rem, num_prs = 0; + int len, rem, status = -EIO; + unsigned int num_prs = 0; u32 mask = 0; - int status = -EIO; if (!query->callback) goto out; path_query = container_of(query, struct ib_sa_path_query, sa_query); mad = query->mad_buf->mad; - if (!path_query->conv_pr && - (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) { - /* Need a larger buffer for possible multiple PRs */ - query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM, - sizeof(*drec), GFP_KERNEL); - if (!query->resp_pr_data) { - query->callback(query, -ENOMEM, 0, NULL); - return; - } - } head = (const struct nlattr *) nlmsg_data(nlh); len = nlmsg_len(nlh); @@ -917,36 +904,41 @@ static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, break; } - drec = (struct ib_path_rec_data *)query->resp_pr_data; nla_for_each_attr(curr, head, len, rem) { if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) continue; - srec = nla_data(curr); - if ((srec->flags & mask) != mask) + rec_data = nla_data(curr); + if ((rec_data->flags & mask) != mask) continue; - status = 0; - if (!drec) { - memcpy(mad->data, srec->path_rec, - sizeof(srec->path_rec)); - num_prs = 1; - break; + if ((query->flags & IB_SA_QUERY_OPA) || + path_query->conv_pr) { + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + memcpy(mad->data, rec_data->path_rec, + sizeof(rec_data->path_rec)); + query->callback(query, 0, mad); + goto out; } - memcpy(drec, srec, sizeof(*drec)); - drec++; + status = 0; + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec_data->path_rec, &recs[num_prs]); + recs[num_prs].flags = rec_data->flags; + recs[num_prs].rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&recs[num_prs]); + num_prs++; if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) break; } - if (!status) + if (!status) { mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; - - query->callback(query, status, num_prs, mad); - kvfree(query->resp_pr_data); - query->resp_pr_data = NULL; + path_query->callback(status, recs, num_prs, + path_query->context); + } else + query->callback(query, status, mad); out: mad_send_wc.send_buf = query->mad_buf; @@ -1451,11 +1443,26 @@ static int opa_pr_query_possible(struct ib_sa_client *client, return PR_IB_SUPPORTED; } -static void ib_sa_pr_callback_single(struct ib_sa_path_query *query, - int status, struct ib_sa_mad *mad) +static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, + int status, struct ib_sa_mad *mad) { + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); struct sa_path_rec rec = {}; + if (!mad) { + query->callback(status, NULL, 0, query->context); + return; + } + + if (sa_query->flags & IB_SA_QUERY_OPA) { + ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = 
SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, 1, query->context); + return; + } + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); rec.rec_type = SA_PATH_REC_TYPE_IB; @@ -1472,71 +1479,6 @@ static void ib_sa_pr_callback_single(struct ib_sa_path_query *query, } } -/** - * ib_sa_pr_callback_multiple() - Parse path records then do callback. - * - * In a multiple-PR case the PRs are saved in "query->resp_pr_data" - * (instead of"mad->data") and with "ib_path_rec_data" structure format, - * so that rec->flags can be set to indicate the type of PR. - * This is valid only in IB fabric. - */ -static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, - int status, int num_prs, - struct ib_path_rec_data *rec_data) -{ - struct sa_path_rec *rec; - int i; - - rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); - if (!rec) { - query->callback(-ENOMEM, NULL, 0, query->context); - return; - } - - for (i = 0; i < num_prs; i++) { - ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), - rec_data[i].path_rec, rec + i); - rec[i].rec_type = SA_PATH_REC_TYPE_IB; - sa_path_set_dmac_zero(rec + i); - rec[i].flags = rec_data[i].flags; - } - - query->callback(status, rec, num_prs, query->context); - kvfree(rec); -} - -static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, - int status, int num_prs, - struct ib_sa_mad *mad) -{ - struct ib_sa_path_query *query = - container_of(sa_query, struct ib_sa_path_query, sa_query); - struct sa_path_rec rec; - - if (!mad || !num_prs) { - query->callback(status, NULL, 0, query->context); - return; - } - - if (sa_query->flags & IB_SA_QUERY_OPA) { - if (num_prs != 1) { - query->callback(-EINVAL, NULL, 0, query->context); - return; - } - - ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_OPA; - query->callback(status, &rec, num_prs, query->context); - } else { - if (!sa_query->resp_pr_data) - ib_sa_pr_callback_single(query, status, mad); - else - ib_sa_pr_callback_multiple(query, status, num_prs, - sa_query->resp_pr_data); - } -} - static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { struct ib_sa_path_query *query = @@ -1578,7 +1520,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, - int num_paths, void *context), + unsigned int num_paths, void *context), void *context, struct ib_sa_query **sa_query) { @@ -1677,8 +1619,7 @@ err1: EXPORT_SYMBOL(ib_sa_path_rec_get); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, - int status, int num_prs, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = container_of(sa_query, struct ib_sa_mcmember_query, sa_query); @@ -1769,8 +1710,7 @@ err1: /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, int num_paths, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); @@ -1879,8 +1819,7 @@ static void ib_classportinfo_cb(void *context) } static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, - int status, int num_prs, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { unsigned long flags; struct ib_sa_classport_info_query *query = @@ -2055,13 +1994,13 @@ static void send_handler(struct ib_mad_agent *agent, /* No callback -- already got recv */ 
break; case IB_WC_RESP_TIMEOUT_ERR: - query->callback(query, -ETIMEDOUT, 0, NULL); + query->callback(query, -ETIMEDOUT, NULL); break; case IB_WC_WR_FLUSH_ERR: - query->callback(query, -EINTR, 0, NULL); + query->callback(query, -EINTR, NULL); break; default: - query->callback(query, -EIO, 0, NULL); + query->callback(query, -EIO, NULL); break; } @@ -2089,10 +2028,10 @@ static void recv_handler(struct ib_mad_agent *mad_agent, if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, 1, + -EINVAL : 0, (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); else - query->callback(query, -EIO, 0, NULL); + query->callback(query, -EIO, NULL); } ib_free_recv_mad(mad_recv_wc); diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 43b26bc12288..39357dc2d229 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -26,8 +26,8 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) if (umem_dmabuf->sgt) goto wait_fence; - sgt = dma_buf_map_attachment_unlocked(umem_dmabuf->attach, - DMA_BIDIRECTIONAL); + sgt = dma_buf_map_attachment(umem_dmabuf->attach, + DMA_BIDIRECTIONAL); if (IS_ERR(sgt)) return PTR_ERR(sgt); @@ -103,8 +103,8 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) umem_dmabuf->last_sg_trim = 0; } - dma_buf_unmap_attachment_unlocked(umem_dmabuf->attach, umem_dmabuf->sgt, - DMA_BIDIRECTIONAL); + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); umem_dmabuf->sgt = NULL; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 26b021f43ba4..11b1c1603aeb 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; + unsigned int sg_delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset; + sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; - if (biter->__sg_advance >= sg_dma_len(biter->__sg)) { + if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { + biter->__sg_advance += sg_delta; + } else { biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 785c37cae3c0..5a2baf49ecaa 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -89,13 +89,6 @@ struct bnxt_re_ring_attr { u8 mode; }; -struct bnxt_re_work { - struct work_struct work; - unsigned long event; - struct bnxt_re_dev *rdev; - struct net_device *vlan_dev; -}; - struct bnxt_re_sqp_entries { struct bnxt_qplib_sge sge; u64 wrid; @@ -132,10 +125,10 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; + struct notifier_block nb; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; int num_msix; int id; @@ -194,5 +187,4 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) return &rdev->ibdev.dev; return NULL; } - #endif diff --git 
a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 8c0c80a8d338..c5867e78f231 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -48,6 +48,7 @@ #include <net/ipv6.h> #include <net/addrconf.h> #include <linux/if_ether.h> +#include <linux/auxiliary_bus.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> @@ -74,14 +75,14 @@ MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* globals */ -static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); -/* Mutex to protect the list of bnxt_re devices added */ -static DEFINE_MUTEX(bnxt_re_dev_lock); -static struct workqueue_struct *bnxt_re_wq; -static void bnxt_re_remove_device(struct bnxt_re_dev *rdev); -static void bnxt_re_dealloc_driver(struct ib_device *ib_dev); +static DEFINE_MUTEX(bnxt_re_mutex); + static void bnxt_re_stop_irq(void *handle); static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); +static int bnxt_re_netdev_event(struct notifier_block *notifier, + unsigned long event, void *ptr); +static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) { @@ -111,16 +112,14 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - struct bnxt *bp; en_dev = rdev->en_dev; - bp = netdev_priv(en_dev->net); chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; - chip_ctx->chip_num = bp->chip_num; - chip_ctx->hw_stats_size = bp->hw_ring_stats_size; + chip_ctx->chip_num = en_dev->chip_num; + chip_ctx->hw_stats_size = en_dev->hw_ring_stats_size; rdev->chip_ctx = chip_ctx; /* rest members to follow eventually */ @@ -128,7 +127,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.cctx = rdev->chip_ctx; rdev->rcfw.res = &rdev->qplib_res; rdev->qplib_res.dattr = &rdev->dev_attr; - rdev->qplib_res.is_vf = BNXT_VF(bp); + rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); bnxt_re_set_drv_mode(rdev, wqe_mode); if (bnxt_qplib_determine_atomics(en_dev->pdev)) @@ -141,10 +140,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev) { - struct bnxt *bp; - - bp = netdev_priv(rdev->en_dev->net); - if (BNXT_VF(bp)) + if (BNXT_EN_VF(rdev->en_dev)) rdev->is_virtfn = 1; } @@ -225,56 +221,12 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) bnxt_re_limit_vf_res(&rdev->qplib_ctx, num_vfs); } -/* for handling bnxt_en callbacks later */ -static void bnxt_re_stop(void *p) -{ - struct bnxt_re_dev *rdev = p; - struct bnxt *bp; - - if (!rdev) - return; - ASSERT_RTNL(); - - /* L2 driver invokes this callback during device error/crash or device - * reset. Current RoCE driver doesn't recover the device in case of - * error. Handle the error by dispatching fatal events to all qps - * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as - * L2 driver want to modify the MSIx table. - */ - bp = netdev_priv(rdev->netdev); - - ibdev_info(&rdev->ibdev, "Handle device stop call from L2 driver"); - /* Check the current device state from L2 structure and move the - * device to detached state if FW_FATAL_COND is set. - * This prevents more commands to HW during clean-up, - * in case the device is already in error. 
- */ - if (test_bit(BNXT_STATE_FW_FATAL_COND, &bp->state)) - set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); - - bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(rdev); - /* Move the device states to detached and avoid sending any more - * commands to HW - */ - set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); - set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); -} - -static void bnxt_re_start(void *p) -{ -} - -static void bnxt_re_sriov_config(void *p, int num_vfs) +static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) { - struct bnxt_re_dev *rdev = p; - - if (!rdev) - return; if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return; - rdev->num_vfs = num_vfs; + rdev->num_vfs = pci_sriov_get_totalvfs(rdev->en_dev->pdev); if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) { bnxt_re_set_resource_limits(rdev); bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, @@ -282,16 +234,14 @@ static void bnxt_re_sriov_config(void *p, int num_vfs) } } -static void bnxt_re_shutdown(void *p) +static void bnxt_re_shutdown(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = p; + struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); if (!rdev) return; - ASSERT_RTNL(); - /* Release the MSIx vectors before queuing unregister */ - bnxt_re_stop_irq(rdev); - ib_unregister_device_queued(&rdev->ibdev); + ib_unregister_device(&rdev->ibdev); + bnxt_re_dev_uninit(rdev); } static void bnxt_re_stop_irq(void *handle) @@ -312,7 +262,7 @@ static void bnxt_re_stop_irq(void *handle) static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) { struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_msix_entry *msix_ent = rdev->msix_entries; + struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries; struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; struct bnxt_qplib_nq *nq; int indx, rc; @@ -331,7 +281,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) * in device sctructure. 
*/ for (indx = 0; indx < rdev->num_msix; indx++) - rdev->msix_entries[indx].vector = ent[indx].vector; + rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, false); @@ -346,93 +296,22 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) } static struct bnxt_ulp_ops bnxt_re_ulp_ops = { - .ulp_async_notifier = NULL, - .ulp_stop = bnxt_re_stop, - .ulp_start = bnxt_re_start, - .ulp_sriov_config = bnxt_re_sriov_config, - .ulp_shutdown = bnxt_re_shutdown, .ulp_irq_stop = bnxt_re_stop_irq, .ulp_irq_restart = bnxt_re_start_irq }; /* RoCE -> Net driver */ -/* Driver registration routines used to let the networking driver (bnxt_en) - * to know that the RoCE driver is now installed - */ -static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev) -{ - struct bnxt_en_dev *en_dev; - int rc; - - if (!rdev) - return -EINVAL; - - en_dev = rdev->en_dev; - - rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev, - BNXT_ROCE_ULP); - return rc; -} - static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; int rc = 0; - if (!rdev) - return -EINVAL; - en_dev = rdev->en_dev; - rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, - &bnxt_re_ulp_ops, rdev); - rdev->qplib_res.pdev = rdev->en_dev->pdev; - return rc; -} - -static int bnxt_re_free_msix(struct bnxt_re_dev *rdev) -{ - struct bnxt_en_dev *en_dev; - int rc; - - if (!rdev) - return -EINVAL; - - en_dev = rdev->en_dev; - - - rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP); - - return rc; -} - -static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) -{ - int rc = 0, num_msix_want = BNXT_RE_MAX_MSIX, num_msix_got; - struct bnxt_en_dev *en_dev; - - if (!rdev) - return -EINVAL; - - en_dev = rdev->en_dev; - - num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus()); - - num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, - rdev->msix_entries, - num_msix_want); - if (num_msix_got < BNXT_RE_MIN_MSIX) { - rc = -EINVAL; - goto done; - } - if (num_msix_got != num_msix_want) { - ibdev_warn(&rdev->ibdev, - "Requested %d MSI-X vectors, got %d\n", - num_msix_want, num_msix_got); - } - rdev->num_msix = num_msix_got; -done: + rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev); + if (!rc) + rdev->qplib_res.pdev = rdev->en_dev->pdev; return rc; } @@ -458,12 +337,17 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { - struct bnxt_en_dev *en_dev = rdev->en_dev; + struct bnxt_en_dev *en_dev; struct hwrm_ring_free_input req = {0}; struct hwrm_ring_free_output resp; struct bnxt_fw_msg fw_msg; int rc = -EINVAL; + if (!rdev) + return rc; + + en_dev = rdev->en_dev; + if (!en_dev) return rc; @@ -477,7 +361,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", req.ring_id, rc); @@ -514,7 +398,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, req.int_mode = ring_attr->mode; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, 
BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) *fw_ring_id = le16_to_cpu(resp.ring_id); @@ -542,7 +426,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", rc); @@ -575,7 +459,7 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); @@ -584,21 +468,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, /* Device */ -static bool is_bnxt_re_dev(struct net_device *netdev) -{ - struct ethtool_drvinfo drvinfo; - - if (netdev->ethtool_ops && netdev->ethtool_ops->get_drvinfo) { - memset(&drvinfo, 0, sizeof(drvinfo)); - netdev->ethtool_ops->get_drvinfo(netdev, &drvinfo); - - if (strcmp(drvinfo.driver, "bnxt_en")) - return false; - return true; - } - return false; -} - static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) { struct ib_device *ibdev = @@ -609,31 +478,6 @@ static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) return container_of(ibdev, struct bnxt_re_dev, ibdev); } -static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) -{ - struct bnxt_en_dev *en_dev; - struct pci_dev *pdev; - - en_dev = bnxt_ulp_probe(netdev); - if (IS_ERR(en_dev)) - return en_dev; - - pdev = en_dev->pdev; - if (!pdev) - return ERR_PTR(-EINVAL); - - if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) { - dev_info(&pdev->dev, - "%s: probe error: RoCE is not supported on this device", - ROCE_DRV_MODULE_NAME); - return ERR_PTR(-ENODEV); - } - - dev_hold(netdev); - - return en_dev; -} - static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { @@ -679,7 +523,6 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .create_qp = bnxt_re_create_qp, .create_srq = bnxt_re_create_srq, .create_user_ah = bnxt_re_create_ah, - .dealloc_driver = bnxt_re_dealloc_driver, .dealloc_pd = bnxt_re_dealloc_pd, .dealloc_ucontext = bnxt_re_dealloc_ucontext, .del_gid = bnxt_re_del_gid, @@ -744,18 +587,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); } -static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) -{ - dev_put(rdev->netdev); - rdev->netdev = NULL; - mutex_lock(&bnxt_re_dev_lock); - list_del_rcu(&rdev->list); - mutex_unlock(&bnxt_re_dev_lock); - - synchronize_rcu(); -} - -static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, +static struct bnxt_re_dev *bnxt_re_dev_add(struct bnxt_aux_priv *aux_priv, struct bnxt_en_dev *en_dev) { struct bnxt_re_dev *rdev; @@ -768,8 +600,8 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, return NULL; } /* Default values */ - rdev->netdev = netdev; - dev_hold(rdev->netdev); + rdev->nb.notifier_call = NULL; + rdev->netdev = en_dev->net; rdev->en_dev = en_dev; rdev->id = rdev->en_dev->pdev->devfn; INIT_LIST_HEAD(&rdev->qp_list); @@ -784,9 +616,6 @@ static 
struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, rdev->cosq[0] = 0xFFFF; rdev->cosq[1] = 0xFFFF; - mutex_lock(&bnxt_re_dev_lock); - list_add_tail_rcu(&rdev->list, &bnxt_re_dev_list); - mutex_unlock(&bnxt_re_dev_lock); return rdev; } @@ -930,7 +759,7 @@ static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx) return bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ? (rdev->is_virtfn ? BNXT_RE_GEN_P5_VF_NQ_DB : BNXT_RE_GEN_P5_PF_NQ_DB) : - rdev->msix_entries[indx].db_offset; + rdev->en_dev->msix_entries[indx].db_offset; } static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) @@ -955,7 +784,7 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) for (i = 1; i < rdev->num_msix ; i++) { db_offt = bnxt_re_get_nqdb_offset(rdev, i); rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], - i - 1, rdev->msix_entries[i].vector, + i - 1, rdev->en_dev->msix_entries[i].vector, db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); if (rc) { @@ -1042,7 +871,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1; - rattr.lrid = rdev->msix_entries[i + 1].ring_idx; + rattr.lrid = rdev->en_dev->msix_entries[i + 1].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, @@ -1095,7 +924,6 @@ static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir, u64 *cid_map) { struct hwrm_queue_pri2cos_qcfg_input req = {0}; - struct bnxt *bp = netdev_priv(rdev->netdev); struct hwrm_queue_pri2cos_qcfg_output resp; struct bnxt_en_dev *en_dev = rdev->en_dev; struct bnxt_fw_msg fw_msg; @@ -1112,11 +940,11 @@ static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir, flags |= (dir & 0x01); flags |= HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN; req.flags = cpu_to_le32(flags); - req.port_id = bp->pf.port_id; + req.port_id = en_dev->pf_port_id; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) return rc; @@ -1299,7 +1127,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) req.hwrm_intf_upd = HWRM_VERSION_UPDATE; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); - rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg); + rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) { ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", rc); @@ -1323,7 +1151,7 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) pr_err("Failed to register with IB: %#x\n", rc); return rc; } - dev_info(rdev_to_dev(rdev), "Device registered successfully"); + dev_info(rdev_to_dev(rdev), "Device registered with IB successfully"); ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); @@ -1362,20 +1190,12 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } - if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { - rc = bnxt_re_free_msix(rdev); - if (rc) - ibdev_warn(&rdev->ibdev, - "Failed to free MSI-X vectors: %#x", rc); - } + if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) + rdev->num_msix = 0; bnxt_re_destroy_chip_ctx(rdev); - if 
(test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { - rc = bnxt_re_unregister_netdev(rdev); - if (rc) - ibdev_warn(&rdev->ibdev, - "Failed to unregister with netdev: %#x", rc); - } + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnxt_unregister_dev(rdev->en_dev); } /* worker thread for polling periodic events. Now used for QoS programming*/ @@ -1416,13 +1236,15 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - rc = bnxt_re_request_msix(rdev); - if (rc) { + if (!rdev->en_dev->ulp_tbl->msix_requested) { ibdev_err(&rdev->ibdev, "Failed to get MSI-X vectors: %#x\n", rc); rc = -EINVAL; goto fail; } + ibdev_dbg(&rdev->ibdev, "Got %d MSI-X vectors\n", + rdev->en_dev->ulp_tbl->msix_requested); + rdev->num_msix = rdev->en_dev->ulp_tbl->msix_requested; set_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags); bnxt_re_query_hwrm_intf_version(rdev); @@ -1446,14 +1268,14 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) rattr.type = type; rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX; rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1; - rattr.lrid = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; + rattr.lrid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc); goto free_rcfw; } db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX); - vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector; + vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, vid, db_offt, rdev->is_virtfn, &bnxt_re_aeq_handler); @@ -1521,6 +1343,11 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + /* + * Use the total VF count since the actual VF count may not be + * available at this point. + */ + bnxt_re_vf_res_config(rdev); } return 0; @@ -1541,135 +1368,43 @@ fail: return rc; } -static void bnxt_re_dev_unreg(struct bnxt_re_dev *rdev) -{ - struct net_device *netdev = rdev->netdev; - - bnxt_re_dev_remove(rdev); - - if (netdev) - dev_put(netdev); -} - -static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev) +static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) { + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; int rc = 0; - if (!is_bnxt_re_dev(netdev)) - return -ENODEV; + /* en_dev should never be NULL as long as adev and aux_dev are valid. 
*/ + en_dev = aux_priv->edev; - en_dev = bnxt_re_dev_probe(netdev); - if (IS_ERR(en_dev)) { - if (en_dev != ERR_PTR(-ENODEV)) - ibdev_err(&(*rdev)->ibdev, "%s: Failed to probe\n", - ROCE_DRV_MODULE_NAME); - rc = PTR_ERR(en_dev); - goto exit; - } - *rdev = bnxt_re_dev_add(netdev, en_dev); - if (!*rdev) { + rdev = bnxt_re_dev_add(aux_priv, en_dev); + if (!rdev || !rdev_to_dev(rdev)) { rc = -ENOMEM; - dev_put(netdev); goto exit; } -exit: - return rc; -} -static void bnxt_re_remove_device(struct bnxt_re_dev *rdev) -{ - bnxt_re_dev_uninit(rdev); - pci_dev_put(rdev->en_dev->pdev); - bnxt_re_dev_unreg(rdev); -} - -static int bnxt_re_add_device(struct bnxt_re_dev **rdev, - struct net_device *netdev, u8 wqe_mode) -{ - int rc; - - rc = bnxt_re_dev_reg(rdev, netdev); - if (rc == -ENODEV) - return rc; - if (rc) { - pr_err("Failed to register with the device %s: %#x\n", - netdev->name, rc); - return rc; - } + rc = bnxt_re_dev_init(rdev, wqe_mode); + if (rc) + goto re_dev_dealloc; - pci_dev_get((*rdev)->en_dev->pdev); - rc = bnxt_re_dev_init(*rdev, wqe_mode); + rc = bnxt_re_ib_init(rdev); if (rc) { - pci_dev_put((*rdev)->en_dev->pdev); - bnxt_re_dev_unreg(*rdev); + pr_err("Failed to register with IB: %s", + aux_priv->aux_dev.name); + goto re_dev_uninit; } + auxiliary_set_drvdata(adev, rdev); - return rc; -} - -static void bnxt_re_dealloc_driver(struct ib_device *ib_dev) -{ - struct bnxt_re_dev *rdev = - container_of(ib_dev, struct bnxt_re_dev, ibdev); - - dev_info(rdev_to_dev(rdev), "Unregistering Device"); - - rtnl_lock(); - bnxt_re_remove_device(rdev); - rtnl_unlock(); -} - -/* Handle all deferred netevents tasks */ -static void bnxt_re_task(struct work_struct *work) -{ - struct bnxt_re_work *re_work; - struct bnxt_re_dev *rdev; - int rc = 0; - - re_work = container_of(work, struct bnxt_re_work, work); - rdev = re_work->rdev; - - if (re_work->event == NETDEV_REGISTER) { - rc = bnxt_re_ib_init(rdev); - if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to register with IB: %#x", rc); - rtnl_lock(); - bnxt_re_remove_device(rdev); - rtnl_unlock(); - goto exit; - } - goto exit; - } - - if (!ib_device_try_get(&rdev->ibdev)) - goto exit; + return 0; - switch (re_work->event) { - case NETDEV_UP: - bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, - IB_EVENT_PORT_ACTIVE); - break; - case NETDEV_DOWN: - bnxt_re_dev_stop(rdev); - break; - case NETDEV_CHANGE: - if (!netif_carrier_ok(rdev->netdev)) - bnxt_re_dev_stop(rdev); - else if (netif_carrier_ok(rdev->netdev)) - bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, - IB_EVENT_PORT_ACTIVE); - ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, - &rdev->active_width); - break; - default: - break; - } - ib_device_put(&rdev->ibdev); +re_dev_uninit: + bnxt_re_dev_uninit(rdev); +re_dev_dealloc: + ib_dealloc_device(&rdev->ibdev); exit: - put_device(&rdev->ibdev.dev); - kfree(re_work); + return rc; } /* @@ -1690,109 +1425,189 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr) { struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); - struct bnxt_re_work *re_work; struct bnxt_re_dev *rdev; - int rc = 0; - bool sch_work = false; - bool release = true; real_dev = rdma_vlan_dev_real_dev(netdev); if (!real_dev) real_dev = netdev; - rdev = bnxt_re_from_netdev(real_dev); - if (!rdev && event != NETDEV_REGISTER) - return NOTIFY_OK; - if (real_dev != netdev) goto exit; - switch (event) { - case NETDEV_REGISTER: - if (rdev) - break; - rc = bnxt_re_add_device(&rdev, real_dev, - BNXT_QPLIB_WQE_MODE_STATIC); - if (!rc) - 
sch_work = true; - release = false; - break; + rdev = bnxt_re_from_netdev(real_dev); + if (!rdev) + return NOTIFY_DONE; - case NETDEV_UNREGISTER: - ib_unregister_device_queued(&rdev->ibdev); - break; + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + netif_carrier_ok(real_dev) ? + IB_EVENT_PORT_ACTIVE : + IB_EVENT_PORT_ERR); + break; default: - sch_work = true; break; } - if (sch_work) { - /* Allocate for the deferred task */ - re_work = kzalloc(sizeof(*re_work), GFP_KERNEL); - if (re_work) { - get_device(&rdev->ibdev.dev); - re_work->rdev = rdev; - re_work->event = event; - re_work->vlan_dev = (real_dev == netdev ? - NULL : netdev); - INIT_WORK(&re_work->work, bnxt_re_task); - queue_work(bnxt_re_wq, &re_work->work); - } - } - + ib_device_put(&rdev->ibdev); exit: - if (rdev && release) - ib_device_put(&rdev->ibdev); return NOTIFY_DONE; } -static struct notifier_block bnxt_re_netdev_notifier = { - .notifier_call = bnxt_re_netdev_event -}; +#define BNXT_ADEV_NAME "bnxt_en" -static int __init bnxt_re_mod_init(void) +static void bnxt_re_remove(struct auxiliary_device *adev) { - int rc = 0; + struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + if (!rdev) + return; - bnxt_re_wq = create_singlethread_workqueue("bnxt_re"); - if (!bnxt_re_wq) - return -ENOMEM; + mutex_lock(&bnxt_re_mutex); + if (rdev->nb.notifier_call) { + unregister_netdevice_notifier(&rdev->nb); + rdev->nb.notifier_call = NULL; + } else { + /* If notifier is null, we should have already done a + * clean up before coming here. + */ + goto skip_remove; + } + + ib_unregister_device(&rdev->ibdev); + ib_dealloc_device(&rdev->ibdev); + bnxt_re_dev_uninit(rdev); +skip_remove: + mutex_unlock(&bnxt_re_mutex); +} + +static int bnxt_re_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct bnxt_re_dev *rdev; + int rc; - INIT_LIST_HEAD(&bnxt_re_dev_list); + mutex_lock(&bnxt_re_mutex); + rc = bnxt_re_add_device(adev, BNXT_QPLIB_WQE_MODE_STATIC); + if (rc) { + mutex_unlock(&bnxt_re_mutex); + return rc; + } + + rdev = auxiliary_get_drvdata(adev); - rc = register_netdevice_notifier(&bnxt_re_netdev_notifier); + rdev->nb.notifier_call = bnxt_re_netdev_event; + rc = register_netdevice_notifier(&rdev->nb); if (rc) { + rdev->nb.notifier_call = NULL; pr_err("%s: Cannot register to netdevice_notifier", ROCE_DRV_MODULE_NAME); - goto err_netdev; + goto err; } + + mutex_unlock(&bnxt_re_mutex); return 0; -err_netdev: - destroy_workqueue(bnxt_re_wq); +err: + mutex_unlock(&bnxt_re_mutex); + bnxt_re_remove(adev); return rc; } -static void __exit bnxt_re_mod_exit(void) +static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) { - struct bnxt_re_dev *rdev; + struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - unregister_netdevice_notifier(&bnxt_re_netdev_notifier); - if (bnxt_re_wq) - destroy_workqueue(bnxt_re_wq); - list_for_each_entry(rdev, &bnxt_re_dev_list, list) { - /* VF device removal should be called before the removal - * of PF device. Queue VFs unregister first, so that VFs - * shall be removed before the PF during the call of - * ib_unregister_driver. - */ - if (rdev->is_virtfn) - ib_unregister_device(&rdev->ibdev); + if (!rdev) + return 0; + + mutex_lock(&bnxt_re_mutex); + /* L2 driver may invoke this callback during device error/crash or device + * reset. Current RoCE driver doesn't recover the device in case of + * error. 
Handle the error by dispatching fatal events to all qps + * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as + * L2 driver want to modify the MSIx table. + */ + + ibdev_info(&rdev->ibdev, "Handle device suspend call"); + /* Check the current device state from bnxt_en_dev and move the + * device to detached state if FW_FATAL_COND is set. + * This prevents more commands to HW during clean-up, + * in case the device is already in error. + */ + if (test_bit(BNXT_STATE_FW_FATAL_COND, &rdev->en_dev->en_state)) + set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + + bnxt_re_dev_stop(rdev); + bnxt_re_stop_irq(rdev); + /* Move the device states to detached and avoid sending any more + * commands to HW + */ + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + mutex_unlock(&bnxt_re_mutex); + + return 0; +} + +static int bnxt_re_resume(struct auxiliary_device *adev) +{ + struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + + if (!rdev) + return 0; + + mutex_lock(&bnxt_re_mutex); + /* L2 driver may invoke this callback during device recovery, resume. + * reset. Current RoCE driver doesn't recover the device in case of + * error. Handle the error by dispatching fatal events to all qps + * ie. by calling bnxt_re_dev_stop and release the MSIx vectors as + * L2 driver want to modify the MSIx table. + */ + + ibdev_info(&rdev->ibdev, "Handle device resume call"); + mutex_unlock(&bnxt_re_mutex); + + return 0; +} + +static const struct auxiliary_device_id bnxt_re_id_table[] = { + { .name = BNXT_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, bnxt_re_id_table); + +static struct auxiliary_driver bnxt_re_driver = { + .name = "rdma", + .probe = bnxt_re_probe, + .remove = bnxt_re_remove, + .shutdown = bnxt_re_shutdown, + .suspend = bnxt_re_suspend, + .resume = bnxt_re_resume, + .id_table = bnxt_re_id_table, +}; + +static int __init bnxt_re_mod_init(void) +{ + int rc = 0; + + pr_info("%s: %s", ROCE_DRV_MODULE_NAME, version); + rc = auxiliary_driver_register(&bnxt_re_driver); + if (rc) { + pr_err("%s: Failed to register auxiliary driver\n", + ROCE_DRV_MODULE_NAME); + return rc; } - ib_unregister_driver(RDMA_DRIVER_BNXT_RE); + return 0; +} + +static void __exit bnxt_re_mod_exit(void) +{ + auxiliary_driver_unregister(&bnxt_re_driver); } module_init(bnxt_re_mod_init); diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 499a425a3379..ced615b5ea09 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2676,6 +2676,9 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) u16 tcp_opt = ntohs(req->tcp_opt); ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + pr_debug("ep %p tid %u\n", ep, ep->hwtid); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); @@ -4144,6 +4147,10 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) if (neigh->dev->flags & IFF_LOOPBACK) { pdev = ip_dev_find(&init_net, iph->daddr); + if (!pdev) { + pr_err("%s - failed to find device!\n", __func__); + goto free_dst; + } e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh, pdev, 0); pi = (struct port_info *)netdev_priv(pdev); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index c7e8d7b3baa1..7e2835dcbc1c 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -767,7 +767,7 @@ static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp, goto out; 
wc->wr_id = cookie; - wc->qp = qhp ? &qhp->ibqp : NULL; + wc->qp = &qhp->ibqp; wc->vendor_err = CQE_STATUS(&cqe); wc->wc_flags = 0; diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index ff645b955a08..fd22c85d35f4 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -238,7 +238,7 @@ int c4iw_fill_res_cm_id_entry(struct sk_buff *msg, if (rdma_nl_put_driver_u64_hex(msg, "history", epcp->history)) goto err_cancel_table; - if (epcp->state == LISTEN) { + if (listen_ep) { if (rdma_nl_put_driver_u32(msg, "stid", listen_ep->stid)) goto err_cancel_table; if (rdma_nl_put_driver_u32(msg, "backlog", listen_ep->backlog)) diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index a2f5e29ef226..1f79537fc8d1 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -122,9 +122,7 @@ struct fw_ri_dsgl { __be16 nsge; __be32 len0; __be64 addr0; -#ifndef C99_NOT_SUPPORTED struct fw_ri_dsge_pair sge[]; -#endif }; struct fw_ri_sge { @@ -138,9 +136,7 @@ struct fw_ri_isgl { __u8 r1; __be16 nsge; __be32 r2; -#ifndef C99_NOT_SUPPORTED struct fw_ri_sge sge[]; -#endif }; struct fw_ri_immd { @@ -148,9 +144,7 @@ struct fw_ri_immd { __u8 r1; __be16 r2; __be32 immdlen; -#ifndef C99_NOT_SUPPORTED __u8 data[]; -#endif }; struct fw_ri_tpte { @@ -320,9 +314,7 @@ struct fw_ri_res_wr { __be32 op_nres; __be32 len16_pkd; __u64 cookie; -#ifndef C99_NOT_SUPPORTED struct fw_ri_res res[]; -#endif }; #define FW_RI_RES_WR_NRES_S 0 @@ -562,12 +554,10 @@ struct fw_ri_rdma_write_wr { __be32 plen; __be32 stag_sink; __be64 to_sink; -#ifndef C99_NOT_SUPPORTED union { - struct fw_ri_immd immd_src[0]; - struct fw_ri_isgl isgl_src[0]; + DECLARE_FLEX_ARRAY(struct fw_ri_immd, immd_src); + DECLARE_FLEX_ARRAY(struct fw_ri_isgl, isgl_src); } u; -#endif }; struct fw_ri_send_wr { @@ -581,12 +571,10 @@ struct fw_ri_send_wr { __be32 plen; __be32 r3; __be64 r4; -#ifndef C99_NOT_SUPPORTED union { - struct fw_ri_immd immd_src[0]; - struct fw_ri_isgl isgl_src[0]; + DECLARE_FLEX_ARRAY(struct fw_ri_immd, immd_src); + DECLARE_FLEX_ARRAY(struct fw_ri_isgl, isgl_src); } u; -#endif }; #define FW_RI_SEND_WR_SENDOP_S 0 @@ -618,12 +606,10 @@ struct fw_ri_rdma_write_cmpl_wr { struct fw_ri_isgl isgl_src; } u_cmpl; __be64 r3; -#ifndef C99_NOT_SUPPORTED union fw_ri_write { - struct fw_ri_immd immd_src[0]; - struct fw_ri_isgl isgl_src[0]; + DECLARE_FLEX_ARRAY(struct fw_ri_immd, immd_src); + DECLARE_FLEX_ARRAY(struct fw_ri_isgl, isgl_src); } u; -#endif }; struct fw_ri_rdma_read_wr { diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 74f6348f240a..771059a8eb7d 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -11,6 +11,7 @@ /* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ #include <linux/workqueue.h> +#include <trace/events/sock.h> #include "erdma.h" #include "erdma_cm.h" @@ -925,6 +926,8 @@ static void erdma_cm_llp_data_ready(struct sock *sk) { struct erdma_cep *cep; + trace_sk_data_ready(sk); + read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index ab371fec610c..4c38d99c73f1 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -397,7 +397,7 @@ struct erdma_write_sqe { __le32 rsvd; - struct erdma_sge sgl[0]; + struct erdma_sge sgl[]; }; struct erdma_send_sqe { @@ -408,7 +408,7 @@ struct erdma_send_sqe { }; __le32 length; - struct erdma_sge sgl[0]; + struct erdma_sge sgl[]; }; struct erdma_readreq_sqe { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 5dab1e87975b..9c30d78730aa 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1110,12 +1110,14 @@ int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) prot = pgprot_device(vma->vm_page_prot); break; default: - return -EINVAL; + err = -EINVAL; + goto put_entry; } err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE, prot, rdma_entry); +put_entry: rdma_user_mmap_entry_put(rdma_entry); return err; } diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ebe970f76232..90b672feed83 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1056,7 +1056,7 @@ static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *dd); static void dc_shutdown(struct hfi1_devdata *dd); static void dc_start(struct hfi1_devdata *dd); -static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, unsigned int *np); static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms); @@ -13362,7 +13362,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int ret; unsigned ngroups; int rmt_count; - int user_rmt_reduced; u32 n_usr_ctxts; u32 send_contexts = chip_send_contexts(dd); u32 rcv_contexts = chip_rcv_contexts(dd); @@ -13421,28 +13420,34 @@ static int set_up_context_variables(struct hfi1_devdata *dd) (num_kernel_contexts + n_usr_ctxts), &node_affinity.real_cpu_mask); /* - * The RMT entries are currently allocated as shown below: - * 1. QOS (0 to 128 entries); - * 2. FECN (num_kernel_context - 1 + num_user_contexts + - * num_netdev_contexts); - * 3. netdev (num_netdev_contexts). - * It should be noted that FECN oversubscribe num_netdev_contexts - * entries of RMT because both netdev and PSM could allocate any receive - * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts, - * and PSM FECN must reserve an RMT entry for each possible PSM receive - * context. + * RMT entries are allocated as follows: + * 1. QOS (0 to 128 entries) + * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts + + * num_netdev_contexts [b]) + * 3. netdev (NUM_NETDEV_MAP_ENTRIES) + * + * Notes: + * [a] Kernel contexts (except control) are included in FECN if kernel + * TID_RDMA is active. + * [b] Netdev and user contexts are randomly allocated from the same + * context pool, so FECN must cover all contexts in the pool. 
*/ - rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2); - if (HFI1_CAP_IS_KSET(TID_RDMA)) - rmt_count += num_kernel_contexts - 1; - if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { - user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count; - dd_dev_err(dd, - "RMT size is reducing the number of user receive contexts from %u to %d\n", - n_usr_ctxts, - user_rmt_reduced); - /* recalculate */ - n_usr_ctxts = user_rmt_reduced; + rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL) + + (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1 + : 0) + + n_usr_ctxts + + num_netdev_contexts + + NUM_NETDEV_MAP_ENTRIES; + if (rmt_count > NUM_MAP_ENTRIES) { + int over = rmt_count - NUM_MAP_ENTRIES; + /* try to squish user contexts, minimum of 1 */ + if (over >= n_usr_ctxts) { + dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n"); + return -EINVAL; + } + dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n", + n_usr_ctxts, n_usr_ctxts - over); + n_usr_ctxts -= over; } /* the first N are kernel contexts, the rest are user/netdev contexts */ @@ -14299,15 +14304,15 @@ static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) } /* return the number of RSM map table entries that will be used for QOS */ -static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, +static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp, unsigned int *np) { int i; unsigned int m, n; - u8 max_by_vl = 0; + uint max_by_vl = 0; /* is QOS active at all? */ - if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + if (n_krcv_queues < MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) goto no_qos; @@ -14365,7 +14370,7 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) if (!rmt) goto bail; - rmt_entries = qos_rmt_entries(dd, &m, &n); + rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n); if (rmt_entries == 0) goto bail; qpns_per_vl = 1 << m; diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h index c6291bbf723c..41f7fe5d1839 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -133,12 +133,13 @@ static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) return grp; } -static inline u32 rcventry2tidinfo(u32 rcventry) +static inline u32 create_tid(u32 rcventry, u32 npages) { u32 pair = rcventry & ~0x1; return EXP_TID_SET(IDX, pair >> 1) | - EXP_TID_SET(CTRL, 1 << (rcventry - pair)); + EXP_TID_SET(CTRL, 1 << (rcventry - pair)) | + EXP_TID_SET(LEN, npages); } /** diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index f5f9269fdc16..b1d6ca7e9708 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -306,6 +306,17 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) return reqs; } +static inline void mmap_cdbg(u16 ctxt, u8 subctxt, u8 type, u8 mapio, u8 vmf, + u64 memaddr, void *memvirt, dma_addr_t memdma, + ssize_t memlen, struct vm_area_struct *vma) +{ + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf/dma:%d/%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx", + ctxt, subctxt, type, mapio, vmf, !!memdma, + memaddr ?: (u64)memvirt, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); +} + static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) { struct hfi1_filedata *fd = fp->private_data; @@ -315,6 +326,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) u64 token = vma->vm_pgoff << 
PAGE_SHIFT, memaddr = 0; void *memvirt = NULL; + dma_addr_t memdma = 0; u8 subctxt, mapio = 0, vmf = 0, type; ssize_t memlen = 0; int ret = 0; @@ -334,6 +346,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } + /* + * vm_pgoff is used as a buffer selector cookie. Always mmap from + * the beginning. + */ + vma->vm_pgoff = 0; flags = vma->vm_flags; switch (type) { @@ -355,7 +372,8 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); mapio = 1; break; - case PIO_CRED: + case PIO_CRED: { + u64 cr_page_offset; if (flags & VM_WRITE) { ret = -EPERM; goto done; @@ -365,10 +383,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) * second or third page allocated for credit returns (if number * of enabled contexts > 64 and 128 respectively). */ - memvirt = dd->cr_base[uctxt->numa_id].va; - memaddr = virt_to_phys(memvirt) + - (((u64)uctxt->sc->hw_free - - (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + cr_page_offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & + PAGE_MASK; + memvirt = dd->cr_base[uctxt->numa_id].va + cr_page_offset; + memdma = dd->cr_base[uctxt->numa_id].dma + cr_page_offset; memlen = PAGE_SIZE; flags &= ~VM_MAYWRITE; flags |= VM_DONTCOPY | VM_DONTEXPAND; @@ -378,14 +397,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) * memory been flagged as non-cached? */ /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ - mapio = 1; break; + } case RCV_HDRQ: memlen = rcvhdrq_size(uctxt); memvirt = uctxt->rcvhdrq; + memdma = uctxt->rcvhdrq_dma; break; case RCV_EGRBUF: { - unsigned long addr; + unsigned long vm_start_save; + unsigned long vm_end_save; int i; /* * The RcvEgr buffer need to be handled differently @@ -403,25 +424,35 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; - addr = vma->vm_start; + vm_flags_clear(vma, VM_MAYWRITE); + /* + * Mmap multiple separate allocations into a single vma. From + * here, dma_mmap_coherent() calls dma_direct_mmap(), which + * requires the mmap to exactly fill the vma starting at + * vma_start. Adjust the vma start and end for each eager + * buffer segment mapped. Restore the originals when done. + */ + vm_start_save = vma->vm_start; + vm_end_save = vma->vm_end; + vma->vm_end = vma->vm_start; for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { memlen = uctxt->egrbufs.buffers[i].len; memvirt = uctxt->egrbufs.buffers[i].addr; - ret = remap_pfn_range( - vma, addr, - /* - * virt_to_pfn() does the same, but - * it's not available on x86_64 - * when CONFIG_MMU is enabled. 
- */ - PFN_DOWN(__pa(memvirt)), - memlen, - vma->vm_page_prot); - if (ret < 0) + memdma = uctxt->egrbufs.buffers[i].dma; + vma->vm_end += memlen; + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, + memvirt, memdma, memlen, vma); + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); + if (ret < 0) { + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; goto done; - addr += memlen; + } + vma->vm_start += memlen; } + vma->vm_start = vm_start_save; + vma->vm_end = vm_end_save; ret = 0; goto done; } @@ -481,6 +512,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) } memlen = PAGE_SIZE; memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt); + memdma = uctxt->rcvhdrqtailaddr_dma; flags &= ~VM_MAYWRITE; break; case SUBCTXT_UREGS: @@ -528,15 +560,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; - hfi1_cdbg(PROC, - "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", - ctxt, subctxt, type, mapio, vmf, memaddr, memlen, - vma->vm_end - vma->vm_start, vma->vm_flags); + vm_flags_reset(vma, flags); + mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma, + memlen, vma); if (vmf) { vma->vm_pgoff = PFN_DOWN(memaddr); vma->vm_ops = &vm_ops; ret = 0; + } else if (memdma) { + ret = dma_mmap_coherent(&dd->pcidev->dev, vma, + memvirt, memdma, memlen); } else if (mapio) { ret = io_remap_pfn_range(vma, vma->vm_start, PFN_DOWN(memaddr), @@ -1318,12 +1351,15 @@ static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg, addr = arg + offsetof(struct hfi1_tid_info, tidcnt); if (copy_to_user((void __user *)addr, &tinfo.tidcnt, sizeof(tinfo.tidcnt))) - return -EFAULT; + ret = -EFAULT; addr = arg + offsetof(struct hfi1_tid_info, length); - if (copy_to_user((void __user *)addr, &tinfo.length, + if (!ret && copy_to_user((void __user *)addr, &tinfo.length, sizeof(tinfo.length))) ret = -EFAULT; + + if (ret) + hfi1_user_exp_rcv_invalid(fd, &tinfo); } return ret; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 24c0f0d257fc..62b6c5020039 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -464,7 +464,7 @@ bail: * * This wrapper is the free function that matches hfi1_create_ctxtdata(). * When a context is done being used (kernel or user), this function is called - * for the "final" put to match the kref init from hf1i_create_ctxtdata(). + * for the "final" put to match the kref init from hfi1_create_ctxtdata(). * Other users of the context do a get/put sequence to make sure that the * structure isn't removed while in use. 
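The vma-narrowing trick used for RCV_EGRBUF above has a close userspace analogue: reserve one contiguous range, then satisfy it piecewise with independent mappings. A minimal sketch of that idea, with plain anonymous mappings standing in for the per-buffer DMA allocations:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t seg = (size_t)sysconf(_SC_PAGESIZE);
	size_t nseg = 4, total = nseg * seg;
	/* reserve one contiguous range, then satisfy it segment by segment */
	char *base = mmap(NULL, total, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (base == MAP_FAILED)
		return 1;
	for (size_t i = 0; i < nseg; i++) {
		/* each segment is an independent mapping placed at base + i*seg */
		char *p = mmap(base + i * seg, seg, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 'A' + (int)i, seg);
	}
	printf("%c %c\n", base[0], base[total - 1]);	/* A D */
	munmap(base, total);
	return 0;
}

In the kernel there is no MAP_FIXED equivalent for this case: dma_mmap_coherent() insists on filling the whole vma from vm_start, which is why the driver temporarily shrinks vma->vm_start/vm_end around each call and restores them afterwards.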
*/ diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index a95b654f5254..8ed20392e9f0 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -3160,8 +3160,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) { int rval = 0; - tx->num_desc++; - if ((unlikely(tx->num_desc == tx->desc_limit))) { + if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) { rval = _extend_sdma_tx_descs(dd, tx); if (rval) { __sdma_txclean(dd, tx); @@ -3174,6 +3173,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) SDMA_MAP_NONE, dd->sdma_pad_phys, sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + tx->num_desc++; _sdma_close_tx(dd, tx); return rval; } diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d8170fcbfbdd..b023fc461bd5 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -631,14 +631,13 @@ static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx) static inline void _sdma_close_tx(struct hfi1_devdata *dd, struct sdma_txreq *tx) { - tx->descp[tx->num_desc].qw[0] |= - SDMA_DESC0_LAST_DESC_FLAG; - tx->descp[tx->num_desc].qw[1] |= - dd->default_desc1; + u16 last_desc = tx->num_desc - 1; + + tx->descp[last_desc].qw[0] |= SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[last_desc].qw[1] |= dd->default_desc1; if (tx->flags & SDMA_TXREQ_F_URGENT) - tx->descp[tx->num_desc].qw[1] |= - (SDMA_DESC1_HEAD_TO_HOST_FLAG | - SDMA_DESC1_INT_REQ_FLAG); + tx->descp[last_desc].qw[1] |= (SDMA_DESC1_HEAD_TO_HOST_FLAG | + SDMA_DESC1_INT_REQ_FLAG); } static inline int _sdma_txadd_daddr( @@ -655,6 +654,7 @@ static inline int _sdma_txadd_daddr( type, addr, len); WARN_ON(len > tx->tlen); + tx->num_desc++; tx->tlen -= len; /* special cases for last */ if (!tx->tlen) { @@ -666,7 +666,6 @@ static inline int _sdma_txadd_daddr( _sdma_close_tx(dd, tx); } } - tx->num_desc++; return rval; } diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 186d30291260..96058baf36ed 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -23,18 +23,24 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, const struct mmu_notifier_range *range, unsigned long cur_seq); +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq); static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, - struct tid_group *grp, - unsigned int start, u16 count, + struct tid_group *grp, u16 count, u32 *tidlist, unsigned int *tididx, unsigned int *pmapped); -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp); +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo); +static void __clear_tid_node(struct hfi1_filedata *fd, + struct tid_rb_node *node); static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static const struct mmu_interval_notifier_ops tid_mn_ops = { .invalidate = tid_rb_invalidate, }; +static const struct mmu_interval_notifier_ops tid_cover_ops = { + .invalidate = tid_cover_invalidate, +}; /* * Initialize context and file private data needed for Expected @@ -153,16 +159,11 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd, static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) { int 
pinned; - unsigned int npages; + unsigned int npages = tidbuf->npages; unsigned long vaddr = tidbuf->vaddr; struct page **pages = NULL; struct hfi1_devdata *dd = fd->uctxt->dd; - /* Get the number of pages the user buffer spans */ - npages = num_user_pages(vaddr, tidbuf->length); - if (!npages) - return -EINVAL; - if (npages > fd->uctxt->expected_count) { dd_dev_err(dd, "Expected buffer too big\n"); return -EINVAL; @@ -189,7 +190,6 @@ static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf) return pinned; } tidbuf->pages = pages; - tidbuf->npages = npages; fd->tid_n_pinned += pinned; return pinned; } @@ -249,57 +249,70 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, int ret = 0, need_group = 0, pinned; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; - unsigned int ngroups, pageidx = 0, pageset_count, + unsigned int ngroups, pageset_count, tididx = 0, mapped, mapped_pages = 0; u32 *tidlist = NULL; struct tid_user_buf *tidbuf; + unsigned long mmu_seq = 0; if (!PAGE_ALIGNED(tinfo->vaddr)) return -EINVAL; + if (tinfo->length == 0) + return -EINVAL; tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); if (!tidbuf) return -ENOMEM; + mutex_init(&tidbuf->cover_mutex); tidbuf->vaddr = tinfo->vaddr; tidbuf->length = tinfo->length; + tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length); tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), GFP_KERNEL); if (!tidbuf->psets) { - kfree(tidbuf); - return -ENOMEM; + ret = -ENOMEM; + goto fail_release_mem; + } + + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( + &tidbuf->notifier, current->mm, + tidbuf->vaddr, tidbuf->npages * PAGE_SIZE, + &tid_cover_ops); + if (ret) + goto fail_release_mem; + mmu_seq = mmu_interval_read_begin(&tidbuf->notifier); } pinned = pin_rcv_pages(fd, tidbuf); if (pinned <= 0) { - kfree(tidbuf->psets); - kfree(tidbuf); - return pinned; + ret = (pinned < 0) ? pinned : -ENOSPC; + goto fail_unpin; } /* Find sets of physically contiguous pages */ tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); - /* - * We don't need to access this under a lock since tid_used is per - * process and the same process cannot be in hfi1_user_exp_rcv_clear() - * and hfi1_user_exp_rcv_setup() at the same time. - */ + /* Reserve the number of expected tids to be used. 
*/ spin_lock(&fd->tid_lock); if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) pageset_count = fd->tid_limit - fd->tid_used; else pageset_count = tidbuf->n_psets; + fd->tid_used += pageset_count; spin_unlock(&fd->tid_lock); - if (!pageset_count) - goto bail; + if (!pageset_count) { + ret = -ENOSPC; + goto fail_unreserve; + } ngroups = pageset_count / dd->rcv_entries.group_size; tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); if (!tidlist) { ret = -ENOMEM; - goto nomem; + goto fail_unreserve; } tididx = 0; @@ -318,7 +331,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_pop(&uctxt->tid_group_list); ret = program_rcvarray(fd, tidbuf, grp, - pageidx, dd->rcv_entries.group_size, + dd->rcv_entries.group_size, tidlist, &tididx, &mapped); /* * If there was a failure to program the RcvArray @@ -334,11 +347,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_add_tail(grp, &uctxt->tid_full_list); ngroups--; - pageidx += ret; mapped_pages += mapped; } - while (pageidx < pageset_count) { + while (tididx < pageset_count) { struct tid_group *grp, *ptr; /* * If we don't have any partially used tid groups, check @@ -360,11 +372,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, */ list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, list) { - unsigned use = min_t(unsigned, pageset_count - pageidx, + unsigned use = min_t(unsigned, pageset_count - tididx, grp->size - grp->used); ret = program_rcvarray(fd, tidbuf, grp, - pageidx, use, tidlist, + use, tidlist, &tididx, &mapped); if (ret < 0) { hfi1_cdbg(TID, @@ -376,11 +388,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tid_group_move(grp, &uctxt->tid_used_list, &uctxt->tid_full_list); - pageidx += ret; mapped_pages += mapped; need_group = 0; /* Check if we are done so we break out early */ - if (pageidx >= pageset_count) + if (tididx >= pageset_count) break; } else if (WARN_ON(ret == 0)) { /* @@ -395,43 +406,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, } unlock: mutex_unlock(&uctxt->exp_mutex); -nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); - if (tididx) { - spin_lock(&fd->tid_lock); - fd->tid_used += tididx; - spin_unlock(&fd->tid_lock); - tinfo->tidcnt = tididx; - tinfo->length = mapped_pages * PAGE_SIZE; - - if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), - tidlist, sizeof(tidlist[0]) * tididx)) { - /* - * On failure to copy to the user level, we need to undo - * everything done so far so we don't leak resources. - */ - tinfo->tidlist = (unsigned long)&tidlist; - hfi1_user_exp_rcv_clear(fd, tinfo); - tinfo->tidlist = 0; - ret = -EFAULT; - goto bail; + + /* fail if nothing was programmed, set error if none provided */ + if (tididx == 0) { + if (ret >= 0) + ret = -ENOSPC; + goto fail_unreserve; + } + + /* adjust reserved tid_used to actual count */ + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count - tididx; + spin_unlock(&fd->tid_lock); + + /* unpin all pages not covered by a TID */ + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, + false); + + if (fd->use_mn) { + /* check for an invalidate during setup */ + bool fail = false; + + mutex_lock(&tidbuf->cover_mutex); + fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq); + mutex_unlock(&tidbuf->cover_mutex); + + if (fail) { + ret = -EBUSY; + goto fail_unprogram; } } - /* - * If not everything was mapped (due to insufficient RcvArray entries, - * for example), unpin all unmapped pages so we can pin them nex time. 
- */ - if (mapped_pages != pinned) - unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, - (pinned - mapped_pages), false); -bail: + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), + tidlist, sizeof(tidlist[0]) * tididx)) { + ret = -EFAULT; + goto fail_unprogram; + } + + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + kfree(tidbuf->pages); kfree(tidbuf->psets); + kfree(tidbuf); kfree(tidlist); + return 0; + +fail_unprogram: + /* unprogram, unmap, and unpin all allocated TIDs */ + tinfo->tidlist = (unsigned long)tidlist; + hfi1_user_exp_rcv_clear(fd, tinfo); + tinfo->tidlist = 0; + pinned = 0; /* nothing left to unpin */ + pageset_count = 0; /* nothing left reserved */ +fail_unreserve: + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count; + spin_unlock(&fd->tid_lock); +fail_unpin: + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); + if (pinned > 0) + unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); +fail_release_mem: kfree(tidbuf->pages); + kfree(tidbuf->psets); kfree(tidbuf); - return ret > 0 ? 0 : ret; + kfree(tidlist); + return ret; } int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, @@ -452,7 +498,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, mutex_lock(&uctxt->exp_mutex); for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { - ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); + ret = unprogram_rcvarray(fd, tidinfo[tididx]); if (ret) { hfi1_cdbg(TID, "Failed to unprogram rcv array %d", ret); @@ -589,7 +635,6 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) * struct tid_pageset holding information on physically contiguous * chunks from the user buffer), and other fields. * @grp: RcvArray group - * @start: starting index into sets array * @count: number of struct tid_pageset's to program * @tidlist: the array of u32 elements when the information about the * programmed RcvArray entries is to be encoded. @@ -609,14 +654,14 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages) * number of RcvArray entries programmed. */ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, - struct tid_group *grp, - unsigned int start, u16 count, + struct tid_group *grp, u16 count, u32 *tidlist, unsigned int *tididx, unsigned int *pmapped) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; u16 idx; + unsigned int start = *tididx; u32 tidinfo = 0, rcventry, useidx = 0; int mapped = 0; @@ -661,8 +706,7 @@ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf, return ret; mapped += npages; - tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | - EXP_TID_SET(LEN, npages); + tidinfo = create_tid(rcventry - uctxt->expected_base, npages); tidlist[(*tididx)++] = tidinfo; grp->used++; grp->map |= 1 << useidx++; @@ -706,6 +750,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, } node->fdata = fd; + mutex_init(&node->invalidate_mutex); node->phys = page_to_phys(pages[0]); node->npages = npages; node->rcventry = rcventry; @@ -721,11 +766,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, &tid_mn_ops); if (ret) goto out_unmap; - /* - * FIXME: This is in the wrong order, the notifier should be - * established before the pages are pinned by pin_rcv_pages. 
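The reworked exit path of hfi1_user_exp_rcv_setup() above is the classic layered-unwind ladder: each label releases exactly what was acquired before the failing step. A self-contained sketch of the pattern, with generic fake resources rather than the driver's objects:

#include <stdio.h>
#include <stdlib.h>

/* Generic shape of the unwind ladder above; the resources here are fake. */
static int build(char **out)
{
	char *meta, *pages, *list;

	meta = malloc(32);
	if (!meta)
		return -1;

	pages = malloc(4096);
	if (!pages)
		goto fail_release_meta;

	list = malloc(256);
	if (!list)
		goto fail_release_pages;

	if (snprintf(list, 256, "programmed") < 0)	/* pretend the last step can fail */
		goto fail_release_list;

	*out = list;			/* success: hand the result back */
	free(pages);
	free(meta);
	return 0;

fail_release_list:
	free(list);
fail_release_pages:
	free(pages);
fail_release_meta:
	free(meta);
	return -1;
}

int main(void)
{
	char *s;

	if (!build(&s)) {
		puts(s);
		free(s);
	}
	return 0;
}

The driver version adds one wrinkle: the deepest failure path (fail_unprogram) performs the full teardown itself and then zeroes pinned and pageset_count before falling through, so the shared labels below it become no-ops.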
- */ - mmu_interval_read_begin(&node->notifier); } fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node; @@ -745,33 +785,29 @@ out_unmap: return -EFAULT; } -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp) +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; struct tid_rb_node *node; - u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tidctrl = EXP_TID_GET(tidinfo, CTRL); u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; - if (tididx >= uctxt->expected_count) { - dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", - tididx, uctxt->ctxt); - return -EINVAL; - } - - if (tidctrl == 0x3) + if (tidctrl == 0x3 || tidctrl == 0x0) return -EINVAL; rcventry = tididx + (tidctrl - 1); + if (rcventry >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + rcventry, uctxt->ctxt); + return -EINVAL; + } + node = fd->entry_to_rb[rcventry]; if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (grp) - *grp = node->grp; - if (fd->use_mn) mmu_interval_notifier_remove(&node->notifier); cacheless_tid_rb_remove(fd, node); @@ -779,23 +815,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, return 0; } -static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; + mutex_lock(&node->invalidate_mutex); + if (node->freed) + goto done; + node->freed = true; + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, node->npages, node->notifier.interval_tree.start, node->phys, node->dma_addr); - /* - * Make sure device has seen the write before we unpin the - * pages. 
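The freed-flag-plus-mutex arrangement introduced above makes the hardware teardown idempotent, so it is safe whether the MMU notifier callback or the explicit unprogram path reaches it first. A small pthread sketch of the same shape, where a counter stands in for the hardware work and all names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	pthread_mutex_t invalidate_mutex;
	bool freed;
	int hw_teardowns;	/* must end up at exactly 1 */
};

static void clear_node(struct node *n)
{
	pthread_mutex_lock(&n->invalidate_mutex);
	if (n->freed)
		goto done;
	n->freed = true;
	n->hw_teardowns++;	/* stands in for hfi1_put_tid() + unpin */
done:
	pthread_mutex_unlock(&n->invalidate_mutex);
}

static void *racer(void *arg)
{
	clear_node(arg);	/* e.g. the notifier invalidate callback */
	return NULL;
}

int main(void)
{
	struct node n = { .invalidate_mutex = PTHREAD_MUTEX_INITIALIZER };
	pthread_t t;

	pthread_create(&t, NULL, racer, &n);
	clear_node(&n);		/* e.g. the explicit unprogram path */
	pthread_join(t, NULL);
	printf("teardowns: %d\n", n.hw_teardowns);	/* always 1 */
	return 0;
}

Compile with -lpthread; whichever path wins the race, the teardown runs exactly once.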
- */ + /* Make sure device has seen the write before pages are unpinned */ hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); +done: + mutex_unlock(&node->invalidate_mutex); +} + +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + __clear_tid_node(fd, node); node->grp->used--; node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); @@ -854,17 +901,22 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, if (node->freed) return true; + /* take action only if unmapping */ + if (range->event != MMU_NOTIFY_UNMAP) + return true; + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->notifier.interval_tree.start, node->rcventry, node->npages, node->dma_addr); - node->freed = true; + + /* clear the hardware rcvarray entry */ + __clear_tid_node(fdata, node); spin_lock(&fdata->invalid_lock); if (fdata->invalid_tid_idx < uctxt->expected_count) { fdata->invalid_tids[fdata->invalid_tid_idx] = - rcventry2tidinfo(node->rcventry - uctxt->expected_base); - fdata->invalid_tids[fdata->invalid_tid_idx] |= - EXP_TID_SET(LEN, node->npages); + create_tid(node->rcventry - uctxt->expected_base, + node->npages); if (!fdata->invalid_tid_idx) { unsigned long *ev; @@ -887,6 +939,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, return true; } +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct tid_user_buf *tidbuf = + container_of(mni, struct tid_user_buf, notifier); + + /* take action only if unmapping */ + if (range->event == MMU_NOTIFY_UNMAP) { + mutex_lock(&tidbuf->cover_mutex); + mmu_interval_set_seq(mni, cur_seq); + mutex_unlock(&tidbuf->cover_mutex); + } + + return true; +} + static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, struct tid_rb_node *tnode) { diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index 8c53e416bf84..f8ee997d0050 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -16,6 +16,8 @@ struct tid_pageset { }; struct tid_user_buf { + struct mmu_interval_notifier notifier; + struct mutex cover_mutex; unsigned long vaddr; unsigned long length; unsigned int npages; @@ -27,6 +29,7 @@ struct tid_user_buf { struct tid_rb_node { struct mmu_interval_notifier notifier; struct hfi1_filedata *fdata; + struct mutex invalidate_mutex; /* covers hw removal */ unsigned long phys; struct tid_group *grp; u32 rcventry; diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 7bce963e2ae6..36aaedc65145 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -29,33 +29,52 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, u32 nlocked, u32 npages) { - unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, - size = (cache_size * (1UL << 20)); /* convert to bytes */ - unsigned int usr_ctxts = - dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; - bool can_lock = capable(CAP_IPC_LOCK); + unsigned long ulimit_pages; + unsigned long cache_limit_pages; + unsigned int usr_ctxts; /* - * Calculate per-cache size. The calculation below uses only a quarter - * of the available per-context limit. 
This leaves space for other - * pinning. Should we worry about shared ctxts? + * Perform RLIMIT_MEMLOCK based checks unless CAP_IPC_LOCK is present. */ - cache_limit = (ulimit / usr_ctxts) / 4; - - /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */ - if (ulimit != (-1UL) && size > cache_limit) - size = cache_limit; - - /* Convert to number of pages */ - size = DIV_ROUND_UP(size, PAGE_SIZE); - - pinned = atomic64_read(&mm->pinned_vm); + if (!capable(CAP_IPC_LOCK)) { + ulimit_pages = + DIV_ROUND_DOWN_ULL(rlimit(RLIMIT_MEMLOCK), PAGE_SIZE); + + /* + * Pinning these pages would exceed this process's locked memory + * limit. + */ + if (atomic64_read(&mm->pinned_vm) + npages > ulimit_pages) + return false; + + /* + * Only allow 1/4 of the user's RLIMIT_MEMLOCK to be used for HFI + * caches. This fraction is then equally distributed among all + * existing user contexts. Note that if RLIMIT_MEMLOCK is + * 'unlimited' (-1), the value of this limit will be > 2^42 pages + * (2^64 / 2^12 / 2^8 / 2^2). + * + * The effectiveness of this check may be reduced if I/O occurs on + * some user contexts before all user contexts are created. This + * check assumes that this process is the only one using this + * context (e.g., the corresponding fd was not passed to another + * process for concurrent access) as there is no per-context, + * per-process tracking of pinned pages. It also assumes that each + * user context has only one cache to limit. + */ + usr_ctxts = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt; + if (nlocked + npages > (ulimit_pages / usr_ctxts / 4)) + return false; + } - /* First, check the absolute limit against all pinned pages. */ - if (pinned + npages >= ulimit && !can_lock) + /* + * Pinning these pages would exceed the size limit for this cache. 
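Numerically, the rewritten hfi1_can_pin_pages() reduces to three page-count comparisons (the third, the cache_size ceiling, follows just below). A standalone model of those checks, with assumed example inputs and the CAP_IPC_LOCK bypass left out:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ		4096UL
#define CACHE_SIZE_MB	256UL		/* stand-in for the cache_size module parameter */

static bool can_pin(unsigned long memlock_limit_bytes, unsigned long usr_ctxts,
		    unsigned long already_pinned, unsigned long cache_locked,
		    unsigned long npages)
{
	unsigned long ulimit_pages = memlock_limit_bytes / PAGE_SZ;
	unsigned long cache_limit_pages = CACHE_SIZE_MB * 1024 * 1024 / PAGE_SZ;

	/* 1) the process-wide RLIMIT_MEMLOCK budget */
	if (already_pinned + npages > ulimit_pages)
		return false;
	/* 2) this context's quarter-share of that budget */
	if (cache_locked + npages > ulimit_pages / usr_ctxts / 4)
		return false;
	/* 3) the driver's own cache_size ceiling */
	if (cache_locked + npages > cache_limit_pages)
		return false;
	return true;
}

int main(void)
{
	/* e.g. 64 MiB RLIMIT_MEMLOCK, 8 user contexts, nothing pinned yet */
	printf("%d\n", can_pin(64UL << 20, 8, 0, 0, 256));	/* prints 1 */
	return 0;
}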
+ */ + cache_limit_pages = cache_size * (1024 * 1024) / PAGE_SIZE; + if (nlocked + npages > cache_limit_pages) return false; - return ((nlocked + npages) <= size) || can_lock; + return true; } int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index e6e17984553c..7f6d7fc7951d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1598,13 +1598,11 @@ static const char * const driver_cntr_names[] = { "DRIVER_EgrHdrFull" }; -static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */ static struct rdma_stat_desc *dev_cntr_descs; static struct rdma_stat_desc *port_cntr_descs; int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names); static int num_dev_cntrs; static int num_port_cntrs; -static int cntr_names_initialized; /* * Convert a list of names separated by '\n' into an array of NULL terminated @@ -1615,8 +1613,8 @@ static int init_cntr_names(const char *names_in, const size_t names_len, int num_extra_names, int *num_cntrs, struct rdma_stat_desc **cntr_descs) { - struct rdma_stat_desc *q; - char *names_out, *p; + struct rdma_stat_desc *names_out; + char *p; int i, n; n = 0; @@ -1624,65 +1622,45 @@ static int init_cntr_names(const char *names_in, const size_t names_len, if (names_in[i] == '\n') n++; - names_out = - kzalloc((n + num_extra_names) * sizeof(*q) + names_len, - GFP_KERNEL); + names_out = kzalloc((n + num_extra_names) * sizeof(*names_out) + + names_len, + GFP_KERNEL); if (!names_out) { *num_cntrs = 0; *cntr_descs = NULL; return -ENOMEM; } - p = names_out + (n + num_extra_names) * sizeof(*q); + p = (char *)&names_out[n + num_extra_names]; memcpy(p, names_in, names_len); - q = (struct rdma_stat_desc *)names_out; for (i = 0; i < n; i++) { - q[i].name = p; + names_out[i].name = p; p = strchr(p, '\n'); *p++ = '\0'; } *num_cntrs = n; - *cntr_descs = (struct rdma_stat_desc *)names_out; + *cntr_descs = names_out; return 0; } -static int init_counters(struct ib_device *ibdev) -{ - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - int i, err = 0; - - mutex_lock(&cntr_names_lock); - if (cntr_names_initialized) - goto out_unlock; - - err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, num_driver_cntrs, - &num_dev_cntrs, &dev_cntr_descs); - if (err) - goto out_unlock; - - for (i = 0; i < num_driver_cntrs; i++) - dev_cntr_descs[num_dev_cntrs + i].name = driver_cntr_names[i]; - - err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, 0, - &num_port_cntrs, &port_cntr_descs); - if (err) { - kfree(dev_cntr_descs); - dev_cntr_descs = NULL; - goto out_unlock; - } - cntr_names_initialized = 1; - -out_unlock: - mutex_unlock(&cntr_names_lock); - return err; -} - static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev) { - if (init_counters(ibdev)) - return NULL; + if (!dev_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int i, err; + + err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, + num_driver_cntrs, + &num_dev_cntrs, &dev_cntr_descs); + if (err) + return NULL; + + for (i = 0; i < num_driver_cntrs; i++) + dev_cntr_descs[num_dev_cntrs + i].name = + driver_cntr_names[i]; + } return rdma_alloc_hw_stats_struct(dev_cntr_descs, num_dev_cntrs + num_driver_cntrs, RDMA_HW_STATS_DEFAULT_LIFESPAN); @@ -1691,8 +1669,16 @@ static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev) static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev, u32 
port_num) { - if (init_counters(ibdev)) - return NULL; + if (!port_cntr_descs) { + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int err; + + err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, + 0, + &num_port_cntrs, &port_cntr_descs); + if (err) + return NULL; + } return rdma_alloc_hw_stats_struct(port_cntr_descs, num_port_cntrs, RDMA_HW_STATS_DEFAULT_LIFESPAN); } @@ -1917,13 +1903,10 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd) del_timer_sync(&dev->mem_timer); verbs_txreq_exit(dev); - mutex_lock(&cntr_names_lock); kfree(dev_cntr_descs); kfree(port_cntr_descs); dev_cntr_descs = NULL; port_cntr_descs = NULL; - cntr_names_initialized = 0; - mutex_unlock(&cntr_names_lock); } void hfi1_cnp_rcv(struct hfi1_packet *packet) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f701cc86896b..84239b907de2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -144,6 +144,7 @@ enum { HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), + HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), }; #define HNS_ROCE_DB_TYPE_COUNT 2 @@ -567,21 +568,6 @@ struct hns_roce_mbox_msg { struct hns_roce_dev; -struct hns_roce_rinl_sge { - void *addr; - u32 len; -}; - -struct hns_roce_rinl_wqe { - struct hns_roce_rinl_sge *sg_list; - u32 sge_cnt; -}; - -struct hns_roce_rinl_buf { - struct hns_roce_rinl_wqe *wqe_list; - u32 wqe_cnt; -}; - enum { HNS_ROCE_FLUSH_FLAG = 0, }; @@ -632,7 +618,6 @@ struct hns_roce_qp { /* 0: flush needed, 1: unneeded */ unsigned long flush_flag; struct hns_roce_work flush_work; - struct hns_roce_rinl_buf rq_inl_buf; struct list_head node; /* all qps are on a list */ struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ @@ -887,7 +872,7 @@ struct hns_roce_hw { u32 step_idx); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state); + enum ib_qp_state new_state, struct ib_udata *udata); int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void (*dereg_mr)(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b2421883993b..dbf97fe5948f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -821,22 +821,10 @@ static void fill_recv_sge_to_wqe(const struct ib_recv_wr *wr, void *wqe, static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr, u32 wqe_idx, u32 max_sge) { - struct hns_roce_rinl_sge *sge_list; void *wqe = NULL; - u32 i; wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx); fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge); - - /* rq support inline data */ - if (hr_qp->rq_inl_buf.wqe_cnt) { - sge_list = hr_qp->rq_inl_buf.wqe_list[wqe_idx].sg_list; - hr_qp->rq_inl_buf.wqe_list[wqe_idx].sge_cnt = (u32)wr->num_sge; - for (i = 0; i < wr->num_sge; i++) { - sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr; - sge_list[i].len = wr->sg_list[i].length; - } - } } static int hns_roce_v2_post_recv(struct ib_qp *ibqp, @@ -2849,7 +2837,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_INIT); + IB_QPS_INIT, NULL); if (ret) { 
ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2871,7 +2859,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num); ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, - IB_QPS_RTR); + IB_QPS_RTR, NULL); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2886,7 +2874,7 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, - IB_QPS_RTS); + IB_QPS_RTS, NULL); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -3730,39 +3718,6 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, return 0; } -static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, - struct hns_roce_qp *qp, - struct ib_wc *wc) -{ - struct hns_roce_rinl_sge *sge_list; - u32 wr_num, wr_cnt, sge_num; - u32 sge_cnt, data_len, size; - void *wqe_buf; - - wr_num = hr_reg_read(cqe, CQE_WQE_IDX); - wr_cnt = wr_num & (qp->rq.wqe_cnt - 1); - - sge_list = qp->rq_inl_buf.wqe_list[wr_cnt].sg_list; - sge_num = qp->rq_inl_buf.wqe_list[wr_cnt].sge_cnt; - wqe_buf = hns_roce_get_recv_wqe(qp, wr_cnt); - data_len = wc->byte_len; - - for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) { - size = min(sge_list[sge_cnt].len, data_len); - memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size); - - data_len -= size; - wqe_buf += size; - } - - if (unlikely(data_len)) { - wc->status = IB_WC_LOC_LEN_ERR; - return -EAGAIN; - } - - return 0; -} - static int sw_comp(struct hns_roce_qp *hr_qp, struct hns_roce_wq *wq, int num_entries, struct ib_wc *wc) { @@ -3974,22 +3929,10 @@ static void fill_send_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe) wc->opcode = ib_opcode; } -static inline bool is_rq_inl_enabled(struct ib_wc *wc, u32 hr_opcode, - struct hns_roce_v2_cqe *cqe) -{ - return wc->qp->qp_type != IB_QPT_UD && wc->qp->qp_type != IB_QPT_GSI && - (hr_opcode == HNS_ROCE_V2_OPCODE_SEND || - hr_opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM || - hr_opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) && - hr_reg_read(cqe, CQE_RQ_INLINE); -} - static int fill_recv_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe) { - struct hns_roce_qp *qp = to_hr_qp(wc->qp); u32 hr_opcode; int ib_opcode; - int ret; wc->byte_len = le32_to_cpu(cqe->byte_cnt); @@ -4014,12 +3957,6 @@ static int fill_recv_wc(struct ib_wc *wc, struct hns_roce_v2_cqe *cqe) else wc->opcode = ib_opcode; - if (is_rq_inl_enabled(wc, hr_opcode, cqe)) { - ret = hns_roce_handle_recv_inl_wqe(cqe, qp, wc); - if (unlikely(ret)) - return ret; - } - wc->sl = hr_reg_read(cqe, CQE_SL); wc->src_qp = hr_reg_read(cqe, CQE_RMT_QPN); wc->slid = 0; @@ -4445,10 +4382,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, hr_reg_write(context, QPC_RQ_DB_RECORD_ADDR_H, upper_32_bits(hr_qp->rdb.dma)); - if (ibqp->qp_type != IB_QPT_UD && ibqp->qp_type != IB_QPT_GSI) - hr_reg_write_bool(context, QPC_RQIE, - hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE); - hr_reg_write(context, QPC_RX_CQN, get_cqn(ibqp->recv_cq)); if (ibqp->srq) { @@ -4639,8 +4572,11 @@ static inline enum ib_mtu get_mtu(struct ib_qp *ibqp, static int modify_qp_init_to_rtr(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct ib_udata 
*udata) { + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; @@ -4760,6 +4696,26 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, hr_reg_write(context, QPC_LP_SGEN_INI, 3); hr_reg_clear(qpc_mask, QPC_LP_SGEN_INI); + if (udata && ibqp->qp_type == IB_QPT_RC && + (uctx->config & HNS_ROCE_RQ_INLINE_FLAGS)) { + hr_reg_write_bool(context, QPC_RQIE, + hr_dev->caps.flags & + HNS_ROCE_CAP_FLAG_RQ_INLINE); + hr_reg_clear(qpc_mask, QPC_RQIE); + } + + if (udata && + (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_XRC_TGT) && + (uctx->config & HNS_ROCE_CQE_INLINE_FLAGS)) { + hr_reg_write_bool(context, QPC_CQEIE, + hr_dev->caps.flags & + HNS_ROCE_CAP_FLAG_CQE_INLINE); + hr_reg_clear(qpc_mask, QPC_CQEIE); + + hr_reg_write(context, QPC_CQEIS, 0); + hr_reg_clear(qpc_mask, QPC_CQEIS); + } + return 0; } @@ -5107,7 +5063,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, enum ib_qp_state cur_state, enum ib_qp_state new_state, struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) + struct hns_roce_v2_qp_context *qpc_mask, + struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; @@ -5124,7 +5081,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, modify_qp_init_to_init(ibqp, attr, context, qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, - qpc_mask); + qpc_mask, udata); } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, qpc_mask); @@ -5329,7 +5286,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp, static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, - enum ib_qp_state new_state) + enum ib_qp_state new_state, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); @@ -5352,7 +5309,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, memset(qpc_mask, 0xff, hr_dev->caps.qpc_sz); ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state, - new_state, context, qpc_mask); + new_state, context, qpc_mask, udata); if (ret) goto out; @@ -5555,7 +5512,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (modify_qp_is_ok(hr_qp)) { /* Modify qp to reset before destroying qp */ ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, - hr_qp->state, IB_QPS_RESET); + hr_qp->state, IB_QPS_RESET, udata); if (ret) ibdev_err(ibdev, "failed to modify QP to RST, ret = %d.\n", diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index b1b3e1e0b84e..af9d00225cdf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -531,7 +531,8 @@ struct hns_roce_v2_qp_context { #define QPC_RQ_RTY_TX_ERR QPC_FIELD_LOC(607, 607) #define QPC_RX_CQN QPC_FIELD_LOC(631, 608) #define QPC_XRC_QP_TYPE QPC_FIELD_LOC(632, 632) -#define QPC_RSV3 QPC_FIELD_LOC(634, 633) +#define QPC_CQEIE QPC_FIELD_LOC(633, 633) +#define QPC_CQEIS QPC_FIELD_LOC(634, 634) #define QPC_MIN_RNR_TIME QPC_FIELD_LOC(639, 635) #define QPC_RQ_PRODUCER_IDX QPC_FIELD_LOC(655, 640) #define QPC_RQ_CONSUMER_IDX QPC_FIELD_LOC(671, 656) diff --git 
a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8ba68ac12388..485e110ca433 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -379,6 +379,18 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.max_inline_data = hr_dev->caps.max_sq_inline; } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + context->config |= ucmd.config & HNS_ROCE_RQ_INLINE_FLAGS; + if (context->config & HNS_ROCE_RQ_INLINE_FLAGS) + resp.config |= HNS_ROCE_RSP_RQ_INLINE_FLAGS; + } + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQE_INLINE) { + context->config |= ucmd.config & HNS_ROCE_CQE_INLINE_FLAGS; + if (context->config & HNS_ROCE_CQE_INLINE_FLAGS) + resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; @@ -443,14 +455,15 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma) prot = pgprot_device(vma->vm_page_prot); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } ret = rdma_user_mmap_io(uctx, vma, pfn, rdma_entry->npages * PAGE_SIZE, prot, rdma_entry); +out: rdma_user_mmap_entry_put(rdma_entry); - return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 0ae335fb205c..d855a917f4cf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -433,7 +433,6 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, if (!has_rq) { hr_qp->rq.wqe_cnt = 0; hr_qp->rq.max_gs = 0; - hr_qp->rq_inl_buf.wqe_cnt = 0; cap->max_recv_wr = 0; cap->max_recv_sge = 0; @@ -463,12 +462,6 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, hr_qp->rq.max_gs); hr_qp->rq.wqe_cnt = cnt; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE && - hr_qp->ibqp.qp_type != IB_QPT_UD && - hr_qp->ibqp.qp_type != IB_QPT_GSI) - hr_qp->rq_inl_buf.wqe_cnt = cnt; - else - hr_qp->rq_inl_buf.wqe_cnt = 0; cap->max_recv_wr = cnt; cap->max_recv_sge = hr_qp->rq.max_gs - hr_qp->rq.rsv_sge; @@ -732,49 +725,6 @@ static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) return 1; } -static int alloc_rq_inline_buf(struct hns_roce_qp *hr_qp, - struct ib_qp_init_attr *init_attr) -{ - u32 max_recv_sge = init_attr->cap.max_recv_sge; - u32 wqe_cnt = hr_qp->rq_inl_buf.wqe_cnt; - struct hns_roce_rinl_wqe *wqe_list; - int i; - - /* allocate recv inline buf */ - wqe_list = kcalloc(wqe_cnt, sizeof(struct hns_roce_rinl_wqe), - GFP_KERNEL); - if (!wqe_list) - goto err; - - /* Allocate a continuous buffer for all inline sge we need */ - wqe_list[0].sg_list = kcalloc(wqe_cnt, (max_recv_sge * - sizeof(struct hns_roce_rinl_sge)), - GFP_KERNEL); - if (!wqe_list[0].sg_list) - goto err_wqe_list; - - /* Assign buffers of sg_list to each inline wqe */ - for (i = 1; i < wqe_cnt; i++) - wqe_list[i].sg_list = &wqe_list[0].sg_list[i * max_recv_sge]; - - hr_qp->rq_inl_buf.wqe_list = wqe_list; - - return 0; - -err_wqe_list: - kfree(wqe_list); - -err: - return -ENOMEM; -} - -static void free_rq_inline_buf(struct hns_roce_qp *hr_qp) -{ - if (hr_qp->rq_inl_buf.wqe_list) - kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); - kfree(hr_qp->rq_inl_buf.wqe_list); -} - static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, unsigned long addr) @@ -783,18 +733,6 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp 
*hr_qp, struct hns_roce_buf_attr buf_attr = {}; int ret; - if (!udata && hr_qp->rq_inl_buf.wqe_cnt) { - ret = alloc_rq_inline_buf(hr_qp, init_attr); - if (ret) { - ibdev_err(ibdev, - "failed to alloc inline buf, ret = %d.\n", - ret); - return ret; - } - } else { - hr_qp->rq_inl_buf.wqe_list = NULL; - } - ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr); if (ret) { ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); @@ -814,7 +752,6 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return 0; err_inline: - free_rq_inline_buf(hr_qp); return ret; } @@ -822,7 +759,6 @@ err_inline: static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); - free_rq_inline_buf(hr_qp); } static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev, @@ -1410,7 +1346,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state, - new_state); + new_state, udata); out: mutex_unlock(&hr_qp->mutex); diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 7b086fe63a24..195aa9ea18b6 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1722,6 +1722,9 @@ static int irdma_add_mqh_4(struct irdma_device *iwdev, continue; idev = in_dev_get(ip_dev); + if (!idev) + continue; + in_dev_for_each_ifa_rtnl(ifa, idev) { ibdev_dbg(&iwdev->ibdev, "CM: Allocating child CM Listener forIP=%pI4, vlan_id=%d, MAC=%pM\n", diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index ab246447520b..2e1e2bad0401 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -483,6 +483,8 @@ static int irdma_save_msix_info(struct irdma_pci_f *rf) iw_qvlist->num_vectors = rf->msix_count; if (rf->msix_count <= num_online_cpus()) rf->msix_shared = true; + else if (rf->msix_count > num_online_cpus() + 1) + rf->msix_count = num_online_cpus() + 1; pmsix = rf->msix_entries; for (i = 0, ceq_idx = 0; i < rf->msix_count; i++, iw_qvinfo++) { diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index f6973ea55eda..1b2e3e800c9a 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2745,6 +2745,162 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, return ret; } +static int irdma_reg_user_mr_type_mem(struct irdma_mr *iwmr, int access) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + bool use_pbles; + u32 stag; + int err; + + use_pbles = iwmr->page_cnt != 1; + + err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false); + if (err) + return err; + + if (use_pbles) { + err = irdma_check_mr_contiguous(&iwpbl->pble_alloc, + iwmr->page_size); + if (err) { + irdma_free_pble(iwdev->rf->pble_rsrc, &iwpbl->pble_alloc); + iwpbl->pbl_allocated = false; + } + } + + stag = irdma_create_stag(iwdev); + if (!stag) { + err = -ENOMEM; + goto free_pble; + } + + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + err = irdma_hwreg_mr(iwdev, iwmr, access); + if (err) + goto err_hwreg; + + return 0; + +err_hwreg: + irdma_free_stag(iwdev, stag); + +free_pble: + if (iwpbl->pble_alloc.level != PBLE_LEVEL_0 && iwpbl->pbl_allocated) + irdma_free_pble(iwdev->rf->pble_rsrc, &iwpbl->pble_alloc); + + return err; +} + +static struct irdma_mr *irdma_alloc_iwmr(struct ib_umem *region, + struct ib_pd *pd, u64 
virt, + enum irdma_memreg_type reg_type) +{ + struct irdma_device *iwdev = to_iwdev(pd->device); + struct irdma_pbl *iwpbl = NULL; + struct irdma_mr *iwmr = NULL; + unsigned long pgsz_bitmap; + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) + return ERR_PTR(-ENOMEM); + + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->region = region; + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + iwmr->ibmr.iova = virt; + iwmr->type = reg_type; + + pgsz_bitmap = (reg_type == IRDMA_MEMREG_TYPE_MEM) ? + iwdev->rf->sc_dev.hw_attrs.page_size_cap : PAGE_SIZE; + + iwmr->page_size = ib_umem_find_best_pgsz(region, pgsz_bitmap, virt); + if (unlikely(!iwmr->page_size)) { + kfree(iwmr); + return ERR_PTR(-EOPNOTSUPP); + } + + iwmr->len = region->length; + iwpbl->user_base = virt; + iwmr->page_cnt = ib_umem_num_dma_blocks(region, iwmr->page_size); + + return iwmr; +} + +static void irdma_free_iwmr(struct irdma_mr *iwmr) +{ + kfree(iwmr); +} + +static int irdma_reg_user_mr_type_qp(struct irdma_mem_reg_req req, + struct ib_udata *udata, + struct irdma_mr *iwmr) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + struct irdma_ucontext *ucontext = NULL; + unsigned long flags; + bool use_pbles; + u32 total; + int err; + + total = req.sq_pages + req.rq_pages + 1; + if (total > iwmr->page_cnt) + return -EINVAL; + + total = req.sq_pages + req.rq_pages; + use_pbles = total > 2; + err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + if (err) + return err; + + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); + + return 0; +} + +static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req, + struct ib_udata *udata, + struct irdma_mr *iwmr) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + struct irdma_ucontext *ucontext = NULL; + u8 shadow_pgcnt = 1; + unsigned long flags; + bool use_pbles; + u32 total; + int err; + + if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_CQ_RESIZE) + shadow_pgcnt = 0; + total = req.cq_pages + shadow_pgcnt; + if (total > iwmr->page_cnt) + return -EINVAL; + + use_pbles = req.cq_pages > 1; + err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + if (err) + return err; + + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); + + return 0; +} + /** * irdma_reg_user_mr - Register a user memory region * @pd: ptr of pd @@ -2760,18 +2916,10 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, { #define IRDMA_MEM_REG_MIN_REQ_LEN offsetofend(struct irdma_mem_reg_req, sq_pages) struct irdma_device *iwdev = to_iwdev(pd->device); - struct irdma_ucontext *ucontext; - struct irdma_pble_alloc *palloc; - struct irdma_pbl *iwpbl; - struct irdma_mr *iwmr; - struct ib_umem *region; - struct irdma_mem_reg_req req; - u32 total, stag = 0; - u8 shadow_pgcnt = 1; - bool use_pbles = false; - unsigned long flags; - int err = -EINVAL; - int ret; + struct irdma_mem_reg_req req = {}; + struct ib_umem *region = NULL; + struct irdma_mr *iwmr = NULL; + int err; 
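irdma_alloc_iwmr() above leans on ib_umem_find_best_pgsz() to pick the largest page size the registered region can be described with. A deliberately simplified model of that idea follows; the real selection also accounts for the DMA addresses of the underlying pages, and the names here are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Pick the largest supported page size that both the VA and length honour. */
static uint64_t best_pgsz(uint64_t pgsz_bitmap, uint64_t va, uint64_t len)
{
	uint64_t align = va | len;

	for (int bit = 63; bit >= 0; bit--) {
		uint64_t pgsz = 1ULL << bit;

		if ((pgsz_bitmap & pgsz) && !(align & (pgsz - 1)))
			return pgsz;
	}
	return 0;	/* no supported size fits: caller bails out */
}

int main(void)
{
	/* device supports 4K, 2M and 1G pages; buffer is 2M-aligned, 4 MiB long */
	uint64_t caps = (1ULL << 12) | (1ULL << 21) | (1ULL << 30);

	printf("0x%llx\n",
	       (unsigned long long)best_pgsz(caps, 0x40200000, 4 << 20));	/* 0x200000 */
	return 0;
}

A return of 0 corresponds to the -EOPNOTSUPP path above, where no supported page size can describe the buffer.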
if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) return ERR_PTR(-EINVAL); @@ -2792,122 +2940,80 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, return ERR_PTR(-EFAULT); } - iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); - if (!iwmr) { + iwmr = irdma_alloc_iwmr(region, pd, virt, req.reg_type); + if (IS_ERR(iwmr)) { ib_umem_release(region); - return ERR_PTR(-ENOMEM); - } - - iwpbl = &iwmr->iwpbl; - iwpbl->iwmr = iwmr; - iwmr->region = region; - iwmr->ibmr.pd = pd; - iwmr->ibmr.device = pd->device; - iwmr->ibmr.iova = virt; - iwmr->page_size = PAGE_SIZE; - - if (req.reg_type == IRDMA_MEMREG_TYPE_MEM) { - iwmr->page_size = ib_umem_find_best_pgsz(region, - iwdev->rf->sc_dev.hw_attrs.page_size_cap, - virt); - if (unlikely(!iwmr->page_size)) { - kfree(iwmr); - ib_umem_release(region); - return ERR_PTR(-EOPNOTSUPP); - } + return (struct ib_mr *)iwmr; } - iwmr->len = region->length; - iwpbl->user_base = virt; - palloc = &iwpbl->pble_alloc; - iwmr->type = req.reg_type; - iwmr->page_cnt = ib_umem_num_dma_blocks(region, iwmr->page_size); switch (req.reg_type) { case IRDMA_MEMREG_TYPE_QP: - total = req.sq_pages + req.rq_pages + shadow_pgcnt; - if (total > iwmr->page_cnt) { - err = -EINVAL; - goto error; - } - total = req.sq_pages + req.rq_pages; - use_pbles = (total > 2); - err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + err = irdma_reg_user_mr_type_qp(req, udata, iwmr); if (err) goto error; - ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, - ibucontext); - spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); - list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list); - iwpbl->on_list = true; - spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); break; case IRDMA_MEMREG_TYPE_CQ: - if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_CQ_RESIZE) - shadow_pgcnt = 0; - total = req.cq_pages + shadow_pgcnt; - if (total > iwmr->page_cnt) { - err = -EINVAL; - goto error; - } - - use_pbles = (req.cq_pages > 1); - err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles); + err = irdma_reg_user_mr_type_cq(req, udata, iwmr); if (err) goto error; - - ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, - ibucontext); - spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); - list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list); - iwpbl->on_list = true; - spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); break; case IRDMA_MEMREG_TYPE_MEM: - use_pbles = (iwmr->page_cnt != 1); - - err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false); + err = irdma_reg_user_mr_type_mem(iwmr, access); if (err) goto error; - if (use_pbles) { - ret = irdma_check_mr_contiguous(palloc, - iwmr->page_size); - if (ret) { - irdma_free_pble(iwdev->rf->pble_rsrc, palloc); - iwpbl->pbl_allocated = false; - } - } - - stag = irdma_create_stag(iwdev); - if (!stag) { - err = -ENOMEM; - goto error; - } - - iwmr->stag = stag; - iwmr->ibmr.rkey = stag; - iwmr->ibmr.lkey = stag; - err = irdma_hwreg_mr(iwdev, iwmr, access); - if (err) { - irdma_free_stag(iwdev, stag); - goto error; - } - break; default: + err = -EINVAL; goto error; } - iwmr->type = req.reg_type; - return &iwmr->ibmr; - error: - if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated) - irdma_free_pble(iwdev->rf->pble_rsrc, palloc); ib_umem_release(region); - kfree(iwmr); + irdma_free_iwmr(iwmr); + + return ERR_PTR(err); +} + +static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, + u64 len, u64 virt, + int fd, int access, + struct 
ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(pd->device); + struct ib_umem_dmabuf *umem_dmabuf; + struct irdma_mr *iwmr; + int err; + + if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) + return ERR_PTR(-EINVAL); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access); + if (IS_ERR(umem_dmabuf)) { + err = PTR_ERR(umem_dmabuf); + ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + return ERR_PTR(err); + } + + iwmr = irdma_alloc_iwmr(&umem_dmabuf->umem, pd, virt, IRDMA_MEMREG_TYPE_MEM); + if (IS_ERR(iwmr)) { + err = PTR_ERR(iwmr); + goto err_release; + } + + err = irdma_reg_user_mr_type_mem(iwmr, access); + if (err) + goto err_iwmr; + + return &iwmr->ibmr; + +err_iwmr: + irdma_free_iwmr(iwmr); + +err_release: + ib_umem_release(&umem_dmabuf->umem); return ERR_PTR(err); } @@ -4418,6 +4524,7 @@ static const struct ib_device_ops irdma_dev_ops = { .query_port = irdma_query_port, .query_qp = irdma_query_qp, .reg_user_mr = irdma_reg_user_mr, + .reg_user_mr_dmabuf = irdma_reg_user_mr_dmabuf, .req_notify_cq = irdma_req_notify_cq, .resize_cq = irdma_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, irdma_pd, ibpd), diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 8b3bc302d6f3..7be4c3adb4e2 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -249,7 +249,8 @@ static int mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, struct gdma_create_dma_region_req *create_req, - size_t num_pages, mana_handle_t *gdma_region) + size_t num_pages, mana_handle_t *gdma_region, + u32 expected_status) { struct gdma_create_dma_region_resp create_resp = {}; unsigned int create_req_msg_size; @@ -261,7 +262,7 @@ mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, err = mana_gd_send_request(gc, create_req_msg_size, create_req, sizeof(create_resp), &create_resp); - if (err || create_resp.hdr.status) { + if (err || create_resp.hdr.status != expected_status) { ibdev_dbg(&dev->ib_dev, "Failed to create DMA region: %d, 0x%x\n", err, create_resp.hdr.status); @@ -372,14 +373,21 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, page_addr_list = create_req->page_addr_list; rdma_umem_for_each_dma_block(umem, &biter, page_sz) { + u32 expected_status = 0; + page_addr_list[tail++] = rdma_block_iter_dma_address(&biter); if (tail < num_pages_to_handle) continue; + if (num_pages_processed + num_pages_to_handle < + num_pages_total) + expected_status = GDMA_STATUS_MORE_ENTRIES; + if (!num_pages_processed) { /* First create message */ err = mana_ib_gd_first_dma_region(dev, gc, create_req, - tail, gdma_region); + tail, gdma_region, + expected_status); if (err) goto out; @@ -392,14 +400,8 @@ int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, page_addr_list = add_req->page_addr_list; } else { /* Subsequent create messages */ - u32 expected_s = 0; - - if (num_pages_processed + num_pages_to_handle < - num_pages_total) - expected_s = GDMA_STATUS_MORE_ENTRIES; - err = mana_ib_gd_add_dma_region(dev, gc, add_req, tail, - expected_s); + expected_status); if (err) break; } diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index ea15ec77e321..54b61930a7fd 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -289,7 +289,7 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, /* IB ports start with 1, MANA Ethernet ports start with 0 */ port = ucmd.port; - if (ucmd.port 
> mc->num_ports) + if (port < 1 || port > mc->num_ports) return -EINVAL; if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index dceebcd885bb..b18e9f2adc82 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3303,6 +3303,10 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; + err = mlx4_ib_qp_event_init(); + if (err) + goto clean_qp_event; + err = mlx4_ib_cm_init(); if (err) goto clean_wq; @@ -3324,6 +3328,9 @@ clean_cm: mlx4_ib_cm_destroy(); clean_wq: + mlx4_ib_qp_event_cleanup(); + +clean_qp_event: destroy_workqueue(wq); return err; } @@ -3333,6 +3340,7 @@ static void __exit mlx4_ib_cleanup(void) mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); mlx4_ib_cm_destroy(); + mlx4_ib_qp_event_cleanup(); destroy_workqueue(wq); } diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 6a3b0f121045..17fee1e73a45 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -940,4 +940,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int mlx4_ib_cm_init(void); void mlx4_ib_cm_destroy(void); +int mlx4_ib_qp_event_init(void); +void mlx4_ib_qp_event_cleanup(void); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index b17d6ebc5b70..884825b2e5f7 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -102,6 +102,14 @@ enum mlx4_ib_source_type { MLX4_IB_RWQ_SRC = 1, }; +struct mlx4_ib_qp_event_work { + struct work_struct work; + struct mlx4_qp *qp; + enum mlx4_event type; +}; + +static struct workqueue_struct *mlx4_ib_qp_event_wq; + static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { if (!mlx4_is_master(dev->dev)) @@ -200,50 +208,77 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) } } +static void mlx4_ib_handle_qp_event(struct work_struct *_work) +{ + struct mlx4_ib_qp_event_work *qpe_work = + container_of(_work, struct mlx4_ib_qp_event_work, work); + struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp; + struct ib_event event = {}; + + event.device = ibqp->device; + event.element.qp = ibqp; + + switch (qpe_work->type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("Unexpected event type %d on QP %06x\n", + qpe_work->type, qpe_work->qp->qpn); + goto out; + } + + ibqp->event_handler(&event, ibqp->qp_context); + +out: + mlx4_put_qp(qpe_work->qp); + kfree(qpe_work); +} + static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { - struct ib_event event; struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct mlx4_ib_qp_event_work *qpe_work; if (type == MLX4_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; - if (ibqp->event_handler) { - event.device = 
ibqp->device; - event.element.qp = ibqp; - switch (type) { - case MLX4_EVENT_TYPE_PATH_MIG: - event.event = IB_EVENT_PATH_MIG; - break; - case MLX4_EVENT_TYPE_COMM_EST: - event.event = IB_EVENT_COMM_EST; - break; - case MLX4_EVENT_TYPE_SQ_DRAINED: - event.event = IB_EVENT_SQ_DRAINED; - break; - case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: - event.event = IB_EVENT_QP_LAST_WQE_REACHED; - break; - case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: - event.event = IB_EVENT_QP_FATAL; - break; - case MLX4_EVENT_TYPE_PATH_MIG_FAILED: - event.event = IB_EVENT_PATH_MIG_ERR; - break; - case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: - event.event = IB_EVENT_QP_REQ_ERR; - break; - case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: - event.event = IB_EVENT_QP_ACCESS_ERR; - break; - default: - pr_warn("Unexpected event type %d " - "on QP %06x\n", type, qp->qpn); - return; - } + if (!ibqp->event_handler) + goto out_no_handler; - ibqp->event_handler(&event, ibqp->qp_context); - } + qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC); + if (!qpe_work) + goto out_no_handler; + + qpe_work->qp = qp; + qpe_work->type = type; + INIT_WORK(&qpe_work->work, mlx4_ib_handle_qp_event); + queue_work(mlx4_ib_qp_event_wq, &qpe_work->work); + return; + +out_no_handler: + mlx4_put_qp(qp); } static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) @@ -4468,3 +4503,17 @@ void mlx4_ib_drain_rq(struct ib_qp *qp) handle_drain_completion(cq, &rdrain, dev); } + +int mlx4_ib_qp_event_init(void) +{ + mlx4_ib_qp_event_wq = alloc_ordered_workqueue("mlx4_ib_qp_event_wq", 0); + if (!mlx4_ib_qp_event_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_qp_event_cleanup(void) +{ + destroy_workqueue(mlx4_ib_qp_event_wq); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index ff3742b0460a..1d0c8d5e745b 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -5,34 +5,41 @@ #include "cmd.h" -int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) +int mlx5r_cmd_query_special_mkeys(struct mlx5_ib_dev *dev) { u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + bool is_terminate, is_dump, is_null; int err; - MLX5_SET(query_special_contexts_in, in, opcode, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (!err) - *mkey = MLX5_GET(query_special_contexts_out, out, - dump_fill_mkey); - return err; -} + is_terminate = MLX5_CAP_GEN(dev->mdev, terminate_scatter_list_mkey); + is_dump = MLX5_CAP_GEN(dev->mdev, dump_fill_mkey); + is_null = MLX5_CAP_GEN(dev->mdev, null_mkey); -int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) -{ - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; - int err; + dev->mkeys.terminate_scatter_list_mkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; + if (!is_terminate && !is_dump && !is_null) + return 0; MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (!err) - *null_mkey = MLX5_GET(query_special_contexts_out, out, - null_mkey); - return err; + err = mlx5_cmd_exec_inout(dev->mdev, query_special_contexts, in, out); + if (err) + return err; + + if (is_dump) + dev->mkeys.dump_fill_mkey = MLX5_GET(query_special_contexts_out, + out, dump_fill_mkey); + + if (is_null) + dev->mkeys.null_mkey = cpu_to_be32( + MLX5_GET(query_special_contexts_out, out, null_mkey)); + + if (is_terminate) + 
dev->mkeys.terminate_scatter_list_mkey = + cpu_to_be32(MLX5_GET(query_special_contexts_out, out, + terminate_scatter_list_mkey)); + + return 0; } int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index ee46638db5de..93a971a40d11 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -37,8 +37,7 @@ #include <linux/kernel.h> #include <linux/mlx5/driver.h> -int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); -int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +int mlx5r_cmd_query_special_mkeys(struct mlx5_ib_dev *dev); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out); int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 290ea8ac3838..f87531318feb 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -38,6 +38,7 @@ enum mlx5_ib_cong_node_type { MLX5_IB_RROCE_ECN_RP = 1, MLX5_IB_RROCE_ECN_NP = 2, + MLX5_IB_RROCE_GENERAL = 3, }; static const char * const mlx5_ib_dbg_cc_name[] = { @@ -61,6 +62,8 @@ static const char * const mlx5_ib_dbg_cc_name[] = { "np_cnp_dscp", "np_cnp_prio_mode", "np_cnp_prio", + "rtt_resp_dscp_valid", + "rtt_resp_dscp", }; #define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) @@ -84,14 +87,18 @@ static const char * const mlx5_ib_dbg_cc_name[] = { #define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) #define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) +#define MLX5_IB_GENERAL_RTT_RESP_DSCP_ATTR BIT(0) + static enum mlx5_ib_cong_node_type mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) { - if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && - param_offset <= MLX5_IB_DBG_CC_RP_GD) + if (param_offset <= MLX5_IB_DBG_CC_RP_GD) return MLX5_IB_RROCE_ECN_RP; - else + + if (param_offset <= MLX5_IB_DBG_CC_NP_CNP_PRIO) return MLX5_IB_RROCE_ECN_NP; + + return MLX5_IB_RROCE_GENERAL; } static u32 mlx5_get_cc_param_val(void *field, int offset) @@ -157,6 +164,12 @@ static u32 mlx5_get_cc_param_val(void *field, int offset) case MLX5_IB_DBG_CC_NP_CNP_PRIO: return MLX5_GET(cong_control_r_roce_ecn_np, field, cnp_802p_prio); + case MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP_VALID: + return MLX5_GET(cong_control_r_roce_general, field, + rtt_resp_dscp_valid); + case MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP: + return MLX5_GET(cong_control_r_roce_general, field, + rtt_resp_dscp); default: return 0; } @@ -264,6 +277,15 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); break; + case MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP_VALID: + *attr_mask |= MLX5_IB_GENERAL_RTT_RESP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_general, field, rtt_resp_dscp_valid, var); + break; + case MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP: + *attr_mask |= MLX5_IB_GENERAL_RTT_RESP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_general, field, rtt_resp_dscp_valid, 1); + MLX5_SET(cong_control_r_roce_general, field, rtt_resp_dscp, var); + break; } } diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 945758f39523..3e1272695d99 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -278,7 +278,6 @@ static int do_get_hw_stats(struct ib_device *ibdev, const struct mlx5_ib_counters *cnts = 
get_counters(dev, port_num - 1); struct mlx5_core_dev *mdev; int ret, num_counters; - u32 mdev_port_num; if (!stats) return -EINVAL; @@ -299,8 +298,9 @@ static int do_get_hw_stats(struct ib_device *ibdev, } if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { - mdev = mlx5_ib_get_native_port_mdev(dev, port_num, - &mdev_port_num); + if (!port_num) + port_num = 1; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); if (!mdev) { /* If port is not affiliated yet, its in down state * which doesn't have any counters yet, so it would be diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 52821485371a..ddcfc116b19a 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -37,6 +37,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) const struct mlx5_ib_profile *profile; struct mlx5_core_dev *peer_dev; struct mlx5_ib_dev *ibdev; + int second_uplink = false; u32 peer_num_ports; int vport_index; int ret; @@ -47,17 +48,24 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) peer_dev = mlx5_lag_get_peer_mdev(dev); peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev); if (mlx5_lag_is_master(dev)) { - /* Only 1 ib port is the representor for both uplinks */ - num_ports += peer_num_ports - 1; + if (mlx5_lag_is_mpesw(dev)) + num_ports += peer_num_ports; + else + num_ports += peer_num_ports - 1; + } else { - if (rep->vport == MLX5_VPORT_UPLINK) - return 0; + if (rep->vport == MLX5_VPORT_UPLINK) { + if (!mlx5_lag_is_mpesw(dev)) + return 0; + second_uplink = true; + } + vport_index += peer_num_ports; dev = peer_dev; } } - if (rep->vport == MLX5_VPORT_UPLINK) + if (rep->vport == MLX5_VPORT_UPLINK && !second_uplink) profile = &raw_eth_profile; else return mlx5_ib_set_vport_rep(dev, rep, vport_index); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c669ef6e47e7..5b988db66b8f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1756,13 +1756,9 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_ucontext *context = to_mucontext(uctx); struct mlx5_bfreg_info *bfregi = &context->bfregi; - int err; if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { - err = mlx5_cmd_dump_fill_mkey(dev->mdev, - &resp->dump_fill_mkey); - if (err) - return err; + resp->dump_fill_mkey = dev->mkeys.dump_fill_mkey; resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; } @@ -2087,7 +2083,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (!dev->mdev->clock_info) return -EOPNOTSUPP; @@ -2311,7 +2307,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) @@ -3012,26 +3008,63 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) } } -static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +static void mlx5_netdev_notifier_register(struct mlx5_roce *roce, + struct net_device *netdev) { int err; - dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event; - err = register_netdevice_notifier(&dev->port[port_num].roce.nb); - if (err) { - 
dev->port[port_num].roce.nb.notifier_call = NULL; - return err; - } + if (roce->tracking_netdev) + return; + roce->tracking_netdev = netdev; + roce->nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn); + WARN_ON(err); +} - return 0; +static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce) +{ + if (!roce->tracking_netdev) + return; + unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb, + &roce->nn); + roce->tracking_netdev = NULL; } -static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +static int mlx5e_mdev_notifier_event(struct notifier_block *nb, + unsigned long event, void *data) { - if (dev->port[port_num].roce.nb.notifier_call) { - unregister_netdevice_notifier(&dev->port[port_num].roce.nb); - dev->port[port_num].roce.nb.notifier_call = NULL; + struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb); + struct net_device *netdev = data; + + switch (event) { + case MLX5_DRIVER_EVENT_UPLINK_NETDEV: + if (netdev) + mlx5_netdev_notifier_register(roce, netdev); + else + mlx5_netdev_notifier_unregister(roce); + break; + default: + return NOTIFY_DONE; } + + return NOTIFY_OK; +} + +static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event; + mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb); + mlx5_core_uplink_netdev_event_replay(dev->mdev); +} + +static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb); + mlx5_netdev_notifier_unregister(roce); } static int mlx5_enable_eth(struct mlx5_ib_dev *dev) @@ -3138,7 +3171,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, if (mpi->mdev_events.notifier_call) mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events); mpi->mdev_events.notifier_call = NULL; - mlx5_remove_netdev_notifier(ibdev, port_num); + mlx5_mdev_netdev_untrack(ibdev, port_num); spin_lock(&port->mp.mpi_lock); comps = mpi->mdev_refcnt; @@ -3196,12 +3229,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, if (err) goto unbind; - err = mlx5_add_netdev_notifier(ibdev, port_num); - if (err) { - mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", - port_num + 1); - goto unbind; - } + mlx5_mdev_netdev_track(ibdev, port_num); mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port; mlx5_notifier_register(mpi->mdev, &mpi->mdev_events); @@ -3634,6 +3662,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->port[i].roce.last_port_state = IB_PORT_DOWN; } + err = mlx5r_cmd_query_special_mkeys(dev); + if (err) + return err; + err = mlx5_ib_init_multiport_master(dev); if (err) return err; @@ -3909,9 +3941,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) port_num = mlx5_core_native_port_num(dev->mdev) - 1; /* Register only for native ports */ - err = mlx5_add_netdev_notifier(dev, port_num); - if (err) - return err; + mlx5_mdev_netdev_track(dev, port_num); err = mlx5_enable_eth(dev); if (err) @@ -3920,7 +3950,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) return 0; cleanup: - mlx5_remove_netdev_notifier(dev, port_num); + mlx5_mdev_netdev_untrack(dev, port_num); return err; } @@ -3938,7 +3968,7 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev) mlx5_disable_eth(dev); port_num = 
mlx5_core_native_port_num(dev->mdev) - 1; - mlx5_remove_netdev_notifier(dev, port_num); + mlx5_mdev_netdev_untrack(dev, port_num); } } @@ -4000,12 +4030,7 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { - int err; - - err = mlx5_mkey_cache_cleanup(dev); - if (err) - mlx5_ib_warn(dev, "mr cache cleanup failed\n"); - + mlx5_mkey_cache_cleanup(dev); mlx5r_umr_resource_cleanup(dev); } @@ -4403,6 +4428,10 @@ static int __init mlx5_ib_init(void) return -ENOMEM; } + ret = mlx5_ib_qp_event_init(); + if (ret) + goto qp_event_err; + mlx5_ib_odp_init(); ret = mlx5r_rep_init(); if (ret) @@ -4420,6 +4449,8 @@ drv_err: mp_err: mlx5r_rep_cleanup(); rep_err: + mlx5_ib_qp_event_cleanup(); +qp_event_err: destroy_workqueue(mlx5_ib_event_wq); free_page((unsigned long)xlt_emergency_page); return ret; @@ -4431,6 +4462,7 @@ static void __exit mlx5_ib_cleanup(void) auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); + mlx5_ib_qp_event_cleanup(); destroy_workqueue(mlx5_ib_event_wq); free_page((unsigned long)xlt_emergency_page); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8b91babdd4c0..efa4dc6e7dee 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -617,12 +617,21 @@ enum mlx5_mkey_type { MLX5_MKEY_INDIRECT_DEVX, }; +struct mlx5r_cache_rb_key { + u8 ats:1; + unsigned int access_mode; + unsigned int access_flags; + unsigned int ndescs; +}; + struct mlx5_ib_mkey { u32 key; enum mlx5_mkey_type type; unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; + /* User Mkey must hold either a rb_key or a cache_ent. */ + struct mlx5r_cache_rb_key rb_key; struct mlx5_cache_ent *cache_ent; }; @@ -737,11 +746,11 @@ struct mlx5_cache_ent { unsigned long reserved; char name[4]; - u32 order; - u32 access_mode; - u32 page; - unsigned int ndescs; + struct rb_node node; + struct mlx5r_cache_rb_key rb_key; + + u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; @@ -771,9 +780,11 @@ struct mlx5r_async_create_mkey { struct mlx5_mkey_cache { struct workqueue_struct *wq; - struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES]; - struct dentry *root; + struct rb_root rb_root; + struct mutex rb_lock; + struct dentry *fs_root; unsigned long last_add; + struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { @@ -832,6 +843,9 @@ struct mlx5_roce { rwlock_t netdev_lock; struct net_device *netdev; struct notifier_block nb; + struct netdev_net_notifier nn; + struct notifier_block mdev_nb; + struct net_device *tracking_netdev; atomic_t tx_port_affinity; enum ib_port_state last_port_state; struct mlx5_ib_dev *dev; @@ -874,6 +888,8 @@ enum mlx5_ib_dbg_cc_types { MLX5_IB_DBG_CC_NP_CNP_DSCP, MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP_VALID, + MLX5_IB_DBG_CC_GENERAL_RTT_RESP_DSCP, MLX5_IB_DBG_CC_MAX, }; @@ -1051,6 +1067,13 @@ struct mlx5_port_caps { u8 ext_port_cap; }; + +struct mlx5_special_mkeys { + u32 dump_fill_mkey; + __be32 null_mkey; + __be32 terminate_scatter_list_mkey; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -1081,7 +1104,6 @@ struct mlx5_ib_dev { struct xarray odp_mkeys; - u32 null_mkey; struct mlx5_ib_flow_db *flow_db; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; @@ -1110,6 +1132,7 @@ struct mlx5_ib_dev { struct mlx5_port_caps port_caps[MLX5_MAX_PORTS]; u16 
pkey_table_len; u8 lag_ports; + struct mlx5_special_mkeys mkeys; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1316,11 +1339,15 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); -int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); +void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); +struct mlx5_cache_ent * +mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + struct mlx5r_cache_rb_key rb_key, + bool persistent_entry); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent, - int access_flags); + int access_flags, int access_mode, + int ndescs); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); @@ -1344,7 +1371,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq); void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent); +int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev); void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags); @@ -1363,7 +1390,10 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} -static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {} +static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) +{ + return 0; +} static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) {} diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 053fe946e45a..67356f515261 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -140,19 +140,16 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); } - -static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, - void *to_store) +static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings, + void *to_store) { XA_STATE(xas, &ent->mkeys, 0); void *curr; - xa_lock_irq(&ent->mkeys); if (limit_pendings && - (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { - xa_unlock_irq(&ent->mkeys); + (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) return -EAGAIN; - } + while (1) { /* * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version @@ -191,6 +188,7 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, break; xa_lock_irq(&ent->mkeys); } + xa_lock_irq(&ent->mkeys); if (xas_error(&xas)) return xas_error(&xas); if (WARN_ON(curr)) @@ -198,6 +196,17 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, return 0; } +static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, + void *to_store) +{ + int ret; + + xa_lock_irq(&ent->mkeys); + ret = push_mkey_locked(ent, limit_pendings, to_store); + xa_unlock_irq(&ent->mkeys); + return ret; +} + static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) { void *old; @@ -292,12 +301,14 @@ static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) set_mkc_access_pd_addr_fields(mkc, 0, 0, 
ent->dev->umrc.pd); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); - MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, + (ent->rb_key.access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, translations_octword_size, - get_mkc_octo_size(ent->access_mode, ent->ndescs)); - MLX5_SET(mkc, mkc, log_page_size, ent->page); + get_mkc_octo_size(ent->rb_key.access_mode, + ent->rb_key.ndescs)); + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); } /* Asynchronously schedule new MRs to be populated in the cache. */ @@ -515,18 +526,22 @@ static const struct file_operations limit_fops = { static bool someone_adding(struct mlx5_mkey_cache *cache) { - unsigned int i; - - for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { - struct mlx5_cache_ent *ent = &cache->ent[i]; - bool ret; + struct mlx5_cache_ent *ent; + struct rb_node *node; + bool ret; + mutex_lock(&cache->rb_lock); + for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); xa_lock_irq(&ent->mkeys); ret = ent->stored < ent->limit; xa_unlock_irq(&ent->mkeys); - if (ret) + if (ret) { + mutex_unlock(&cache->rb_lock); return true; + } } + mutex_unlock(&cache->rb_lock); return false; } @@ -539,7 +554,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) { lockdep_assert_held(&ent->mkeys.xa_lock); - if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) + if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) return; if (ent->stored < ent->limit) { ent->fill_to_high_water = true; @@ -590,8 +605,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) if (err != -EAGAIN) { mlx5_ib_warn( dev, - "command failed order %d, err %d\n", - ent->order, err); + "add keys command failed, err %d\n", + err); queue_delayed_work(cache->wq, &ent->dwork, msecs_to_jiffies(1000)); } @@ -637,16 +652,99 @@ static void delayed_cache_work_func(struct work_struct *work) __cache_work_func(ent); } -struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - struct mlx5_cache_ent *ent, - int access_flags) +static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, + struct mlx5r_cache_rb_key key2) +{ + int res; + + res = key1.ats - key2.ats; + if (res) + return res; + + res = key1.access_mode - key2.access_mode; + if (res) + return res; + + res = key1.access_flags - key2.access_flags; + if (res) + return res; + + /* + * keep ndescs the last in the compare table since the find function + * searches for an exact match on all properties and only closest + * match in size. + */ + return key1.ndescs - key2.ndescs; +} + +static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, + struct mlx5_cache_ent *ent) +{ + struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; + struct mlx5_cache_ent *cur; + int cmp; + + /* Figure out where to put new node */ + while (*new) { + cur = rb_entry(*new, struct mlx5_cache_ent, node); + parent = *new; + cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); + if (cmp > 0) + new = &((*new)->rb_left); + if (cmp < 0) + new = &((*new)->rb_right); + if (cmp == 0) { + mutex_unlock(&cache->rb_lock); + return -EEXIST; + } + } + + /* Add new node and rebalance tree. 
*/ + rb_link_node(&ent->node, parent, new); + rb_insert_color(&ent->node, &cache->rb_root); + + return 0; +} + +static struct mlx5_cache_ent * +mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, + struct mlx5r_cache_rb_key rb_key) +{ + struct rb_node *node = dev->cache.rb_root.rb_node; + struct mlx5_cache_ent *cur, *smallest = NULL; + int cmp; + + /* + * Find the smallest ent with order >= requested_order. + */ + while (node) { + cur = rb_entry(node, struct mlx5_cache_ent, node); + cmp = cache_ent_key_cmp(cur->rb_key, rb_key); + if (cmp > 0) { + smallest = cur; + node = node->rb_left; + } + if (cmp < 0) + node = node->rb_right; + if (cmp == 0) + return cur; + } + + return (smallest && + smallest->rb_key.access_mode == rb_key.access_mode && + smallest->rb_key.access_flags == rb_key.access_flags && + smallest->rb_key.ats == rb_key.ats) ? + smallest : + NULL; +} + +static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + struct mlx5_cache_ent *ent, + int access_flags) { struct mlx5_ib_mr *mr; int err; - if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) - return ERR_PTR(-EOPNOTSUPP); - mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -677,10 +775,48 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, return mr; } -static void clean_keys(struct mlx5_ib_dev *dev, int c) +static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, + int access_flags) +{ + int ret = 0; + + if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && + MLX5_CAP_GEN(dev->mdev, atomic) && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + ret |= IB_ACCESS_REMOTE_ATOMIC; + + if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + ret |= IB_ACCESS_RELAXED_ORDERING; + + if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + ret |= IB_ACCESS_RELAXED_ORDERING; + + return ret; +} + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + int access_flags, int access_mode, + int ndescs) +{ + struct mlx5r_cache_rb_key rb_key = { + .ndescs = ndescs, + .access_mode = access_mode, + .access_flags = get_unchangeable_access_flags(dev, access_flags) + }; + struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); + + if (!ent) + return ERR_PTR(-EOPNOTSUPP); + + return _mlx5_mr_cache_alloc(dev, ent, access_flags); +} + +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) { - struct mlx5_mkey_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[c]; u32 mkey; cancel_delayed_work(&ent->dwork); @@ -699,31 +835,39 @@ static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) if (!mlx5_debugfs_root || dev->is_rep) return; - debugfs_remove_recursive(dev->cache.root); - dev->cache.root = NULL; + debugfs_remove_recursive(dev->cache.fs_root); + dev->cache.fs_root = NULL; } -static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) +static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, + struct mlx5_cache_ent *ent) { - struct mlx5_mkey_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent; + int order = order_base_2(ent->rb_key.ndescs); struct dentry *dir; - int i; if (!mlx5_debugfs_root || dev->is_rep) return; - cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); + if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) + order = 
MLX5_IMR_KSM_CACHE_ENTRY + 2; - for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { - ent = &cache->ent[i]; - sprintf(ent->name, "%d", ent->order); - dir = debugfs_create_dir(ent->name, cache->root); - debugfs_create_file("size", 0600, dir, ent, &size_fops); - debugfs_create_file("limit", 0600, dir, ent, &limit_fops); - debugfs_create_ulong("cur", 0400, dir, &ent->stored); - debugfs_create_u32("miss", 0600, dir, &ent->miss); - } + sprintf(ent->name, "%d", order); + dir = debugfs_create_dir(ent->name, dev->cache.fs_root); + debugfs_create_file("size", 0600, dir, ent, &size_fops); + debugfs_create_file("limit", 0600, dir, ent, &limit_fops); + debugfs_create_ulong("cur", 0400, dir, &ent->stored); + debugfs_create_u32("miss", 0600, dir, &ent->miss); +} + +static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) +{ + struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev); + struct mlx5_mkey_cache *cache = &dev->cache; + + if (!mlx5_debugfs_root || dev->is_rep) + return; + + cache->fs_root = debugfs_create_dir("mr_cache", dbg_root); } static void delay_time_func(struct timer_list *t) @@ -733,13 +877,100 @@ static void delay_time_func(struct timer_list *t) WRITE_ONCE(dev->fill_delay, 0); } +struct mlx5_cache_ent * +mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, + struct mlx5r_cache_rb_key rb_key, + bool persistent_entry) +{ + struct mlx5_cache_ent *ent; + int order; + int ret; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return ERR_PTR(-ENOMEM); + + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); + ent->rb_key = rb_key; + ent->dev = dev; + ent->is_tmp = !persistent_entry; + + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + + ret = mlx5_cache_ent_insert(&dev->cache, ent); + if (ret) { + kfree(ent); + return ERR_PTR(ret); + } + + if (persistent_entry) { + if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) + order = MLX5_IMR_KSM_CACHE_ENTRY; + else + order = order_base_2(rb_key.ndescs) - 2; + + if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && + !dev->is_rep && mlx5_core_is_pf(dev->mdev) && + mlx5r_umr_can_load_pas(dev, 0)) + ent->limit = dev->mdev->profile.mr_cache[order].limit; + else + ent->limit = 0; + + mlx5_mkey_cache_debugfs_add_ent(dev, ent); + } else { + mod_delayed_work(ent->dev->cache.wq, + &ent->dev->cache.remove_ent_dwork, + msecs_to_jiffies(30 * 1000)); + } + + return ent; +} + +static void remove_ent_work_func(struct work_struct *work) +{ + struct mlx5_mkey_cache *cache; + struct mlx5_cache_ent *ent; + struct rb_node *cur; + + cache = container_of(work, struct mlx5_mkey_cache, + remove_ent_dwork.work); + mutex_lock(&cache->rb_lock); + cur = rb_last(&cache->rb_root); + while (cur) { + ent = rb_entry(cur, struct mlx5_cache_ent, node); + cur = rb_prev(cur); + mutex_unlock(&cache->rb_lock); + + xa_lock_irq(&ent->mkeys); + if (!ent->is_tmp) { + xa_unlock_irq(&ent->mkeys); + mutex_lock(&cache->rb_lock); + continue; + } + xa_unlock_irq(&ent->mkeys); + + clean_keys(ent->dev, ent); + mutex_lock(&cache->rb_lock); + } + mutex_unlock(&cache->rb_lock); +} + int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; + struct rb_root *root = &dev->cache.rb_root; + struct mlx5r_cache_rb_key rb_key = { + .access_mode = MLX5_MKC_ACCESS_MODE_MTT, + }; struct mlx5_cache_ent *ent; + struct rb_node *node; + int ret; int i; mutex_init(&dev->slow_path_mutex); + mutex_init(&dev->cache.rb_lock); + dev->cache.rb_root = RB_ROOT; + INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = 
alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -748,52 +979,51 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); timer_setup(&dev->delay_timer, delay_time_func, 0); - for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { - ent = &cache->ent[i]; - xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); - ent->order = i + 2; - ent->dev = dev; - ent->limit = 0; - - INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - - if (i > MKEY_CACHE_LAST_STD_ENTRY) { - mlx5_odp_init_mkey_cache_entry(ent); - continue; + mlx5_mkey_cache_debugfs_init(dev); + mutex_lock(&cache->rb_lock); + for (i = 0; i <= mkey_cache_max_order(dev); i++) { + rb_key.ndescs = 1 << (i + 2); + ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); + if (IS_ERR(ent)) { + ret = PTR_ERR(ent); + goto err; } + } - if (ent->order > mkey_cache_max_order(dev)) - continue; + ret = mlx5_odp_init_mkey_cache(dev); + if (ret) + goto err; - ent->page = PAGE_SHIFT; - ent->ndescs = 1 << ent->order; - ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; - if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && - !dev->is_rep && mlx5_core_is_pf(dev->mdev) && - mlx5r_umr_can_load_pas(dev, 0)) - ent->limit = dev->mdev->profile.mr_cache[i].limit; - else - ent->limit = 0; + mutex_unlock(&cache->rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); xa_lock_irq(&ent->mkeys); queue_adjust_cache_locked(ent); xa_unlock_irq(&ent->mkeys); } - mlx5_mkey_cache_debugfs_init(dev); - return 0; + +err: + mutex_unlock(&cache->rb_lock); + mlx5_mkey_cache_debugfs_cleanup(dev); + mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); + return ret; } -int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) +void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) { - unsigned int i; + struct rb_root *root = &dev->cache.rb_root; + struct mlx5_cache_ent *ent; + struct rb_node *node; if (!dev->cache.wq) - return 0; - - for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { - struct mlx5_cache_ent *ent = &dev->cache.ent[i]; + return; + cancel_delayed_work_sync(&dev->cache.remove_ent_dwork); + mutex_lock(&dev->cache.rb_lock); + for (node = rb_first(root); node; node = rb_next(node)) { + ent = rb_entry(node, struct mlx5_cache_ent, node); xa_lock_irq(&ent->mkeys); ent->disabled = true; xa_unlock_irq(&ent->mkeys); @@ -803,13 +1033,18 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_cleanup(dev); mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); - for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) - clean_keys(dev, i); + node = rb_first(root); + while (node) { + ent = rb_entry(node, struct mlx5_cache_ent, node); + node = rb_next(node); + clean_keys(dev, ent); + rb_erase(&ent->node, root); + kfree(ent); + } + mutex_unlock(&dev->cache.rb_lock); destroy_workqueue(dev->cache.wq); del_timer_sync(&dev->delay_timer); - - return 0; } struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) @@ -873,23 +1108,10 @@ static int get_octo_len(u64 addr, u64 len, int page_shift) static int mkey_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MKEY_CACHE_LAST_STD_ENTRY + 2; + return MKEY_CACHE_LAST_STD_ENTRY; return MLX5_MAX_UMR_SHIFT; } -static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, - unsigned int order) -{ - struct mlx5_mkey_cache *cache = &dev->cache; - - if (order < cache->ent[0].order) - 
return &cache->ent[0]; - order = order - cache->ent[0].order; - if (order > MKEY_CACHE_LAST_STD_ENTRY) - return NULL; - return &cache->ent[order]; -} - static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 length, int access_flags, u64 iova) { @@ -916,6 +1138,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags) { + struct mlx5r_cache_rb_key rb_key = { + .access_mode = MLX5_MKC_ACCESS_MODE_MTT, + }; struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; @@ -928,22 +1153,26 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 0, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); - ent = mkey_cache_ent_from_order( - dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); + + rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); + rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); + rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); + ent = mkey_cache_ent_from_rb_key(dev, rb_key); /* - * Matches access in alloc_cache_mr(). If the MR can't come from the - * cache then synchronously create an uncached one. + * If the MR can't come from the cache then synchronously create an uncached + * one. */ - if (!ent || ent->limit == 0 || - !mlx5r_umr_can_reconfig(dev, 0, access_flags) || - mlx5_umem_needs_ats(dev, umem, access_flags)) { + if (!ent) { mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, false); mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(mr)) + return mr; + mr->mmkey.rb_key = rb_key; return mr; } - mr = mlx5_mr_cache_alloc(dev, ent, access_flags); + mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); if (IS_ERR(mr)) return mr; @@ -1030,6 +1259,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, goto err_2; } mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift); mr->umem = umem; set_mr_fields(dev, mr, umem->length, access_flags, iova); kvfree(in); @@ -1372,7 +1602,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); if (WARN_ON(!*page_size)) return false; - return (1ULL << mr->mmkey.cache_ent->order) >= + return (mr->mmkey.cache_ent->rb_key.ndescs) >= ib_umem_num_dma_blocks(new_umem, *page_size); } @@ -1567,6 +1797,49 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } +static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_mkey_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int ret; + + if (mr->mmkey.cache_ent) { + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + mr->mmkey.cache_ent->in_use--; + goto end; + } + + mutex_lock(&cache->rb_lock); + ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); + if (ent) { + if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { + if (ent->disabled) { + mutex_unlock(&cache->rb_lock); + return -EOPNOTSUPP; + } + mr->mmkey.cache_ent = ent; + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + mutex_unlock(&cache->rb_lock); + goto end; + } + } + + ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false); + mutex_unlock(&cache->rb_lock); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + mr->mmkey.cache_ent = ent; + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + +end: + ret = push_mkey_locked(mr->mmkey.cache_ent, false, + xa_mk_value(mr->mmkey.key)); + xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); + return ret; +} + int 
mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct mlx5_ib_mr *mr = to_mmr(ibmr); @@ -1612,16 +1885,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->mmkey.cache_ent) { - xa_lock_irq(&mr->mmkey.cache_ent->mkeys); - mr->mmkey.cache_ent->in_use--; - xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); - + if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) if (mlx5r_umr_revoke_mr(mr) || - push_mkey(mr->mmkey.cache_ent, false, - xa_mk_value(mr->mmkey.key))) + cache_ent_find_and_store(dev, mr)) mr->mmkey.cache_ent = NULL; - } + if (!mr->mmkey.cache_ent) { rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); if (rc) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e6e021af6aa9..4a04cbc5b78a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -104,7 +104,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, if (flags & MLX5_IB_UPD_XLT_ZAP) { for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; pklm->va = 0; } return; @@ -137,7 +137,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; pklm->va = 0; } } @@ -417,8 +417,9 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY], - imr->access_flags); + mr = mlx5_mr_cache_alloc(dev, imr->access_flags, + MLX5_MKC_ACCESS_MODE_MTT, + MLX5_IMR_MTT_ENTRIES); if (IS_ERR(mr)) { ib_umem_odp_release(odp); return mr; @@ -492,9 +493,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); - imr = mlx5_mr_cache_alloc(dev, - &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY], - access_flags); + imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM, + mlx5_imr_ksm_entries); if (IS_ERR(imr)) { ib_umem_odp_release(umem_odp); return imr; @@ -986,7 +986,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, { int ret = 0, npages = 0; u64 io_virt; - u32 key; + __be32 key; u32 byte_count; size_t bcnt; int inline_segment; @@ -1000,7 +1000,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_wqe_data_seg *dseg = wqe; io_virt = be64_to_cpu(dseg->addr); - key = be32_to_cpu(dseg->lkey); + key = dseg->lkey; byte_count = be32_to_cpu(dseg->byte_count); inline_segment = !!(byte_count & MLX5_INLINE_SEG); bcnt = byte_count & ~MLX5_INLINE_SEG; @@ -1014,7 +1014,8 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, } /* receive WQE end of sg list. 
*/ - if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + if (receive_queue && bcnt == 0 && + key == dev->mkeys.terminate_scatter_list_mkey && io_virt == 0) break; @@ -1034,7 +1035,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, continue; } - ret = pagefault_single_data_segment(dev, NULL, key, + ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key), io_virt, bcnt, &pfault->bytes_committed, bytes_mapped); @@ -1587,26 +1588,22 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) return err; } -void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) +int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev) { - if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) - return; + struct mlx5r_cache_rb_key rb_key = { + .access_mode = MLX5_MKC_ACCESS_MODE_KSM, + .ndescs = mlx5_imr_ksm_entries, + }; + struct mlx5_cache_ent *ent; - switch (ent->order - 2) { - case MLX5_IMR_MTT_CACHE_ENTRY: - ent->page = PAGE_SHIFT; - ent->ndescs = MLX5_IMR_MTT_ENTRIES; - ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; - ent->limit = 0; - break; + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return 0; - case MLX5_IMR_KSM_CACHE_ENTRY: - ent->page = MLX5_KSM_PAGE_SHIFT; - ent->ndescs = mlx5_imr_ksm_entries; - ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; - ent->limit = 0; - break; - } + ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + return 0; } static const struct ib_device_ops mlx5_ib_dev_odp_ops = { @@ -1615,25 +1612,15 @@ static const struct ib_device_ops mlx5_ib_dev_odp_ops = { int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { - int ret = 0; - internal_fill_odp_caps(dev); if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) - return ret; + return 0; ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); - if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { - ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); - if (ret) { - mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); - return ret; - } - } - mutex_init(&dev->odp_eq_mutex); - return ret; + return 0; } void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 40d9410ec303..7cc3b973dec7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -71,6 +71,14 @@ struct mlx5_modify_raw_qp_param { u32 port; }; +struct mlx5_ib_qp_event_work { + struct work_struct work; + struct mlx5_core_qp *qp; + int type; +}; + +static struct workqueue_struct *mlx5_ib_qp_event_wq; + static void get_cqs(enum ib_qp_type qp_type, struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); @@ -302,51 +310,120 @@ int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc); } +static void mlx5_ib_qp_err_syndrome(struct ib_qp *ibqp) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + void *pas_ext_union, *err_syn; + u32 *outb; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, qpc_extension) || + !MLX5_CAP_GEN(dev->mdev, qp_error_syndrome)) + return; + + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) + return; + + err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen, + true); + if (err) + goto out; + + pas_ext_union = + MLX5_ADDR_OF(query_qp_out, outb, qp_pas_or_qpc_ext_and_pas); + err_syn 
= MLX5_ADDR_OF(qpc_extension_and_pas_list_in, pas_ext_union, + qpc_data_extension.error_syndrome); + + pr_err("%s/%d: QP %d error: %s (0x%x 0x%x 0x%x)\n", + ibqp->device->name, ibqp->port, ibqp->qp_num, + ib_wc_status_msg( + MLX5_GET(cqe_error_syndrome, err_syn, syndrome)), + MLX5_GET(cqe_error_syndrome, err_syn, vendor_error_syndrome), + MLX5_GET(cqe_error_syndrome, err_syn, hw_syndrome_type), + MLX5_GET(cqe_error_syndrome, err_syn, hw_error_syndrome)); +out: + kfree(outb); +} + +static void mlx5_ib_handle_qp_event(struct work_struct *_work) +{ + struct mlx5_ib_qp_event_work *qpe_work = + container_of(_work, struct mlx5_ib_qp_event_work, work); + struct ib_qp *ibqp = &to_mibqp(qpe_work->qp)->ibqp; + struct ib_event event = {}; + + event.device = ibqp->device; + event.element.qp = ibqp; + switch (qpe_work->type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", + qpe_work->type, qpe_work->qp->qpn); + goto out; + } + + if ((event.event == IB_EVENT_QP_FATAL) || + (event.event == IB_EVENT_QP_ACCESS_ERR)) + mlx5_ib_qp_err_syndrome(ibqp); + + ibqp->event_handler(&event, ibqp->qp_context); + +out: + mlx5_core_res_put(&qpe_work->qp->common); + kfree(qpe_work); +} + static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) { struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; - struct ib_event event; + struct mlx5_ib_qp_event_work *qpe_work; if (type == MLX5_EVENT_TYPE_PATH_MIG) { /* This event is only valid for trans_qps */ to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; } - if (ibqp->event_handler) { - event.device = ibqp->device; - event.element.qp = ibqp; - switch (type) { - case MLX5_EVENT_TYPE_PATH_MIG: - event.event = IB_EVENT_PATH_MIG; - break; - case MLX5_EVENT_TYPE_COMM_EST: - event.event = IB_EVENT_COMM_EST; - break; - case MLX5_EVENT_TYPE_SQ_DRAINED: - event.event = IB_EVENT_SQ_DRAINED; - break; - case MLX5_EVENT_TYPE_SRQ_LAST_WQE: - event.event = IB_EVENT_QP_LAST_WQE_REACHED; - break; - case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: - event.event = IB_EVENT_QP_FATAL; - break; - case MLX5_EVENT_TYPE_PATH_MIG_FAILED: - event.event = IB_EVENT_PATH_MIG_ERR; - break; - case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: - event.event = IB_EVENT_QP_REQ_ERR; - break; - case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: - event.event = IB_EVENT_QP_ACCESS_ERR; - break; - default: - pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); - return; - } + if (!ibqp->event_handler) + goto out_no_handler; - ibqp->event_handler(&event, ibqp->qp_context); - } + qpe_work = kzalloc(sizeof(*qpe_work), GFP_ATOMIC); + if (!qpe_work) + goto out_no_handler; + + qpe_work->qp = qp; + qpe_work->type = type; + INIT_WORK(&qpe_work->work, mlx5_ib_handle_qp_event); + queue_work(mlx5_ib_qp_event_wq, &qpe_work->work); + return; + +out_no_handler: + mlx5_core_res_put(&qp->common); } static int set_rq_size(struct mlx5_ib_dev *dev, struct 
ib_qp_cap *cap, @@ -4502,6 +4579,40 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, return false; } +static int validate_rd_atomic(struct mlx5_ib_dev *dev, struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_type qp_type) +{ + int log_max_ra_res; + int log_max_ra_req; + + if (qp_type == MLX5_IB_QPT_DCI) { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_dc); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_dc); + } else { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_qp); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_qp); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > log_max_ra_res) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); + return false; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > log_max_ra_req) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); + return false; + } + return true; +} + int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -4589,21 +4700,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && - attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { - mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", - attr->max_rd_atomic); - goto out; - } - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && - attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { - mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", - attr->max_dest_rd_atomic); + if (!validate_rd_atomic(dev, attr, attr_mask, qp_type)) goto out; - } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; @@ -4806,7 +4904,8 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!outb) return -ENOMEM; - err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen); + err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen, + false); if (err) goto out; @@ -5699,3 +5798,17 @@ out: mutex_unlock(&mqp->mutex); return err; } + +int mlx5_ib_qp_event_init(void) +{ + mlx5_ib_qp_event_wq = alloc_ordered_workqueue("mlx5_ib_qp_event_wq", 0); + if (!mlx5_ib_qp_event_wq) + return -ENOMEM; + + return 0; +} + +void mlx5_ib_qp_event_cleanup(void) +{ + destroy_workqueue(mlx5_ib_qp_event_wq); +} diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index 5d4e140db99c..77f9b4a54816 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -20,7 +20,7 @@ int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen); + u32 *out, int outlen, bool qpc_ext); int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, u32 *out, int outlen); @@ -44,4 +44,6 @@ void mlx5_core_res_put(struct mlx5_core_rsc_common *res); int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); +int mlx5_ib_qp_event_init(void); +void mlx5_ib_qp_event_cleanup(void); #endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/qpc.c 
b/drivers/infiniband/hw/mlx5/qpc.c index 542e4c63a8de..bae0334d6e7f 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -135,7 +135,8 @@ static int rsc_event_notifier(struct notifier_block *nb, case MLX5_RES_SQ: qp = (struct mlx5_core_qp *)common; qp->event(qp, event_type); - break; + /* Need to put resource in event handler */ + return NOTIFY_OK; case MLX5_RES_DCT: dct = (struct mlx5_core_dct *)common; if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) @@ -504,12 +505,14 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) } int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *out, int outlen) + u32 *out, int outlen, bool qpc_ext) { u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); MLX5_SET(query_qp_in, in, qpn, qp->qpn); + MLX5_SET(query_qp_in, in, qpc_ext, qpc_ext); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen); } diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 09b365a98bbf..a056ea835da5 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -447,7 +447,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, if (i < srq->msrq.max_avail_gather) { scat[i].byte_count = 0; - scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].lkey = dev->mkeys.terminate_scatter_list_mkey; scat[i].addr = 0; } } diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 029e9536ec28..55f4e048d947 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -636,9 +636,7 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); cur_mtt = mtt; - rdma_for_each_block(mr->umem->sgt_append.sgt.sgl, &biter, - mr->umem->sgt_append.sgt.nents, - BIT(mr->page_shift)) { + rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { if (cur_mtt == (void *)mtt + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index 855f3f4fefad..df1d1b0a3ef7 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -1252,7 +1252,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (i < qp->rq.max_gs) { scat[i].byte_count = 0; - scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].lkey = dev->mkeys.terminate_scatter_list_mkey; scat[i].addr = 0; } diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 3937144b2ae5..80fe92a21f96 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -733,7 +733,7 @@ static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, } /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; @@ -769,7 +769,7 @@ static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, phys = dd->physaddr + ureg; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, vma->vm_end - vma->vm_start, @@ -810,8 +810,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, * don't allow them to later change to readable 
with mprotect (for when * not initially mapped readable, as is normally the case) */ - vma->vm_flags &= ~VM_MAYREAD; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND, VM_MAYREAD); /* We used PAT if wc_cookie == 0 */ if (!dd->wc_cookie) @@ -852,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, goto bail; } /* don't allow them to later change to writable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); start = vma->vm_start; @@ -944,7 +943,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, * Don't allow permission to later change to writable * with mprotect. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } else goto bail; len = vma->vm_end - vma->vm_start; @@ -955,7 +954,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); ret = 1; bail: diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6e8c4fbb8083..6289238cc5af 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -672,7 +672,7 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_dbg("\n"); us_ibdev = to_usdev(context->device); - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vfid = vma->vm_pgoff; usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index c301b3be9f30..2a5cac2658ec 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -277,7 +277,7 @@ iter_chunk: usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x", va_start, &pa_start, size, flags); err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + size, flags, GFP_ATOMIC); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", va_start, &pa_start, size, err); @@ -294,7 +294,7 @@ iter_chunk: usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n", va_start, &pa_start, size, flags); err = iommu_map(pd->domain, va_start, pa_start, - size, flags); + size, flags, GFP_ATOMIC); if (err) { usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n", va_start, &pa_start, size, err); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 19176583dbde..9f54aa90a35a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -408,7 +408,7 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) } /* Map UAR to kernel space, VM_LOCKED? */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, start, context->uar.pfn, size, vma->vm_page_prot)) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index ab334900fcc3..2415f3704f57 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -57,6 +57,44 @@ #define rxe_dbg_mw(mw, fmt, ...) 
ibdev_dbg((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) +/* responder states */ +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_INVALIDATE_RKEY, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 948ce4902b10..1bb0cb479eb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -64,12 +64,16 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr); int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr); -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length); -int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, - enum rxe_mr_copy_dir dir); +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length); +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir); int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); -void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length); +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 072eac4b65d2..b10aa1580a64 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -26,22 +26,22 @@ u8 rxe_get_next_key(u32 last_key) int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { - - switch (mr->ibmr.type) { case IB_MR_TYPE_DMA: return 0; case IB_MR_TYPE_USER: case IB_MR_TYPE_MEM_REG: - if (iova < mr->ibmr.iova || length > mr->ibmr.length || - iova > mr->ibmr.iova + mr->ibmr.length - length) - return -EFAULT; + if (iova < mr->ibmr.iova || + iova + length > mr->ibmr.iova + mr->ibmr.length) { + rxe_dbg_mr(mr, "iova/length out of range"); + return -EINVAL; + } return 0; default: - rxe_dbg_mr(mr, "type (%d) not supported\n", mr->ibmr.type); - return -EFAULT; + rxe_dbg_mr(mr, "mr type not supported\n"); + return -EINVAL; } } @@ -62,57 +62,31 @@ static void rxe_mr_init(int access, struct rxe_mr *mr) mr->lkey = mr->ibmr.lkey = lkey; mr->rkey = mr->ibmr.rkey = rkey; + mr->access = 
access; + mr->ibmr.page_size = PAGE_SIZE; + mr->page_mask = PAGE_MASK; + mr->page_shift = PAGE_SHIFT; mr->state = RXE_MR_STATE_INVALID; } -static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) -{ - int i; - int num_map; - struct rxe_map **map = mr->map; - - num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; - - mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); - if (!mr->map) - goto err1; - - for (i = 0; i < num_map; i++) { - mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); - if (!mr->map[i]) - goto err2; - } - - BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); - - mr->map_shift = ilog2(RXE_BUF_PER_MAP); - mr->map_mask = RXE_BUF_PER_MAP - 1; - - mr->num_buf = num_buf; - mr->num_map = num_map; - mr->max_buf = num_map * RXE_BUF_PER_MAP; - - return 0; - -err2: - for (i--; i >= 0; i--) - kfree(mr->map[i]); - - kfree(mr->map); - mr->map = NULL; -err1: - return -ENOMEM; -} - void rxe_mr_init_dma(int access, struct rxe_mr *mr) { rxe_mr_init(access, mr); - mr->access = access; mr->state = RXE_MR_STATE_VALID; mr->ibmr.type = IB_MR_TYPE_DMA; } +static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) +{ + return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); +} + +static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) +{ + return iova & (mr_page_size(mr) - 1); +} + static bool is_pmem_page(struct page *pg) { unsigned long paddr = page_to_phys(pg); @@ -122,86 +96,98 @@ static bool is_pmem_page(struct page *pg) IORES_DESC_PERSISTENT_MEMORY); } +static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) +{ + XA_STATE(xas, &mr->page_list, 0); + struct sg_page_iter sg_iter; + struct page *page; + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + + __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); + if (!__sg_page_iter_next(&sg_iter)) + return 0; + + do { + xas_lock(&xas); + while (true) { + page = sg_page_iter_page(&sg_iter); + + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page can't be persistent\n"); + xas_set_err(&xas, -EINVAL); + break; + } + + xas_store(&xas, page); + if (xas_error(&xas)) + break; + xas_next(&xas); + if (!__sg_page_iter_next(&sg_iter)) + break; + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + return xas_error(&xas); +} + int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr) { - struct rxe_map **map; - struct rxe_phys_buf *buf = NULL; - struct ib_umem *umem; - struct sg_page_iter sg_iter; - int num_buf; - void *vaddr; + struct ib_umem *umem; int err; + rxe_mr_init(access, mr); + + xa_init(&mr->page_list); + umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", (int)PTR_ERR(umem)); - err = PTR_ERR(umem); - goto err_out; + return PTR_ERR(umem); } - num_buf = ib_umem_num_pages(umem); - - rxe_mr_init(access, mr); - - err = rxe_mr_alloc(mr, num_buf); + err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); if (err) { - rxe_dbg_mr(mr, "Unable to allocate memory for map\n"); - goto err_release_umem; + ib_umem_release(umem); + return err; } - mr->page_shift = PAGE_SHIFT; - mr->page_mask = PAGE_SIZE - 1; - - num_buf = 0; - map = mr->map; - if (length > 0) { - bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT; - - buf = map[0]->buf; - for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) { - struct page *pg = sg_page_iter_page(&sg_iter); + mr->umem = umem; + mr->ibmr.type = IB_MR_TYPE_USER; + 
mr->state = RXE_MR_STATE_VALID; - if (persistent_access && !is_pmem_page(pg)) { - rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n"); - err = -EINVAL; - goto err_release_umem; - } + return 0; +} - if (num_buf >= RXE_BUF_PER_MAP) { - map++; - buf = map[0]->buf; - num_buf = 0; - } +static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) +{ + XA_STATE(xas, &mr->page_list, 0); + int i = 0; + int err; - vaddr = page_address(pg); - if (!vaddr) { - rxe_dbg_mr(mr, "Unable to get virtual address\n"); - err = -ENOMEM; - goto err_release_umem; - } - buf->addr = (uintptr_t)vaddr; - buf->size = PAGE_SIZE; - num_buf++; - buf++; + xa_init(&mr->page_list); + do { + xas_lock(&xas); + while (i != num_buf) { + xas_store(&xas, XA_ZERO_ENTRY); + if (xas_error(&xas)) + break; + xas_next(&xas); + i++; } - } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); - mr->umem = umem; - mr->access = access; - mr->offset = ib_umem_offset(umem); - mr->state = RXE_MR_STATE_VALID; - mr->ibmr.type = IB_MR_TYPE_USER; - mr->ibmr.page_size = PAGE_SIZE; + err = xas_error(&xas); + if (err) + return err; - return 0; + mr->num_buf = num_buf; -err_release_umem: - ib_umem_release(umem); -err_out: - return err; + return 0; } int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) @@ -215,7 +201,6 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) if (err) goto err1; - mr->max_buf = max_pages; mr->state = RXE_MR_STATE_FREE; mr->ibmr.type = IB_MR_TYPE_MEM_REG; @@ -225,187 +210,125 @@ err1: return err; } -static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out, - size_t *offset_out) +static int rxe_set_page(struct ib_mr *ibmr, u64 iova) { - size_t offset = iova - mr->ibmr.iova + mr->offset; - int map_index; - int buf_index; - u64 length; - - if (likely(mr->page_shift)) { - *offset_out = offset & mr->page_mask; - offset >>= mr->page_shift; - *n_out = offset & mr->map_mask; - *m_out = offset >> mr->map_shift; - } else { - map_index = 0; - buf_index = 0; + struct rxe_mr *mr = to_rmr(ibmr); + struct page *page = virt_to_page(iova & mr->page_mask); + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + int err; - length = mr->map[map_index]->buf[buf_index].size; + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page cannot be persistent\n"); + return -EINVAL; + } - while (offset >= length) { - offset -= length; - buf_index++; + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; - if (buf_index == RXE_BUF_PER_MAP) { - map_index++; - buf_index = 0; - } - length = mr->map[map_index]->buf[buf_index].size; - } + err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); + if (err) + return err; - *m_out = map_index; - *n_out = buf_index; - *offset_out = offset; - } + mr->nbuf++; + return 0; } -void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, + int sg_nents, unsigned int *sg_offset) { - size_t offset; - int m, n; - void *addr; - - if (mr->state != RXE_MR_STATE_VALID) { - rxe_dbg_mr(mr, "Not in valid state\n"); - addr = NULL; - goto out; - } - - if (!mr->map) { - addr = (void *)(uintptr_t)iova; - goto out; - } - - if (mr_check_range(mr, iova, length)) { - rxe_dbg_mr(mr, "Range violation\n"); - addr = NULL; - goto out; - } - - lookup_iova(mr, iova, &m, &n, &offset); - - if (offset + length > mr->map[m]->buf[n].size) { - rxe_dbg_mr(mr, "Crosses page boundary\n"); - addr = NULL; - goto out; - } + struct rxe_mr *mr = to_rmr(ibmr); + unsigned int page_size = mr_page_size(mr); - addr 
= (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset; + mr->nbuf = 0; + mr->page_shift = ilog2(page_size); + mr->page_mask = ~((u64)page_size - 1); + mr->page_offset = mr->ibmr.iova & (page_size - 1); -out: - return addr; + return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); } -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length) +static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) { - size_t offset; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); + unsigned long index = rxe_mr_iova_to_index(mr, iova); + unsigned int bytes; + struct page *page; + void *va; - offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask; - while (length > 0) { - u8 *va; - int bytes; - - bytes = mr->ibmr.page_size - offset; - if (bytes > length) - bytes = length; - - va = iova_to_vaddr(mr, iova, length); - if (!va) + while (length) { + page = xa_load(&mr->page_list, index); + if (!page) return -EFAULT; - arch_wb_cache_pmem(va, bytes); - + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + va = kmap_local_page(page); + if (dir == RXE_FROM_MR_OBJ) + memcpy(addr, va + page_offset, bytes); + else + memcpy(va + page_offset, addr, bytes); + kunmap_local(va); + + page_offset = 0; + addr += bytes; length -= bytes; - iova += bytes; - offset = 0; + index++; } return 0; } -/* copy data from a range (vaddr, vaddr+length-1) to or from - * a mr object starting at iova. - */ -int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, - enum rxe_mr_copy_dir dir) +static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) { - int err; - int bytes; - u8 *va; - struct rxe_map **map; - struct rxe_phys_buf *buf; - int m; - int i; - size_t offset; + unsigned int page_offset = iova & (PAGE_SIZE - 1); + unsigned int bytes; + struct page *page; + u8 *va; - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) { - u8 *src, *dest; + while (length) { + page = virt_to_page(iova & mr->page_mask); + bytes = min_t(unsigned int, length, + PAGE_SIZE - page_offset); + va = kmap_local_page(page); + + if (dir == RXE_TO_MR_OBJ) + memcpy(va + page_offset, addr, bytes); + else + memcpy(addr, va + page_offset, bytes); + + kunmap_local(va); + page_offset = 0; + iova += bytes; + addr += bytes; + length -= bytes; + } +} - src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova); +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) +{ + int err; - dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr; + if (length == 0) + return 0; - memcpy(dest, src, length); + if (WARN_ON(!mr)) + return -EINVAL; + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + rxe_mr_copy_dma(mr, iova, addr, length, dir); return 0; } - WARN_ON_ONCE(!mr->map); - err = mr_check_range(mr, iova, length); - if (err) { - err = -EFAULT; - goto err1; - } - - lookup_iova(mr, iova, &m, &i, &offset); - - map = mr->map + m; - buf = map[0]->buf + i; - - while (length > 0) { - u8 *src, *dest; - - va = (u8 *)(uintptr_t)buf->addr + offset; - src = (dir == RXE_TO_MR_OBJ) ? addr : va; - dest = (dir == RXE_TO_MR_OBJ) ? 
va : addr; - - bytes = buf->size - offset; - - if (bytes > length) - bytes = length; - - memcpy(dest, src, bytes); - - length -= bytes; - addr += bytes; - - offset = 0; - buf++; - i++; - - if (i == RXE_BUF_PER_MAP) { - i = 0; - map++; - buf = map[0]->buf; - } + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return err; } - return 0; - -err1: - return err; + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. sg list @@ -477,7 +400,6 @@ int copy_data( if (bytes > 0) { iova = sge->addr + offset; - err = rxe_mr_copy(mr, iova, addr, bytes, dir); if (err) goto err2; @@ -504,6 +426,165 @@ err1: return err; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +{ + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + err = mr_check_range(mr, iova, length); + if (err) + return err; + + while (length > 0) { + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + if (!page) + return -EFAULT; + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + return 0; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + unsigned int page_offset; + struct page *page; + u64 value; + u64 *va; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = virt_to_page(iova & PAGE_MASK); + } else { + unsigned long index; + int err; + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return 0; +} + +#if defined CONFIG_64BIT +/* only implemented or called for 64 bit architectures */ +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + unsigned int page_offset; + struct page *page; + u64 *va; + + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = virt_to_page(iova & PAGE_MASK); + } else { + unsigned long index; + int err; + + /* See IBA oA19-28 */ + err 
= mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "misaligned address"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + + kunmap_local(va); + + return 0; +} +#else +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif + int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { struct rxe_sge *sge = &dma->sge[dma->cur_sge]; @@ -537,12 +618,6 @@ int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) return 0; } -/* (1) find the mr corresponding to lkey/rkey - * depending on lookup_type - * (2) verify that the (qp) pd matches the mr pd - * (3) verify that the mr can support the requested access - * (4) verify that mr state is valid - */ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type) { @@ -656,22 +731,17 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return -EINVAL; rxe_cleanup(mr); - + kfree_rcu(mr); return 0; } void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); - int i; rxe_put(mr_pd(mr)); ib_umem_release(mr->umem); - if (mr->map) { - for (i = 0; i < mr->num_map; i++) - kfree(mr->map[i]); - - kfree(mr->map); - } + if (mr->ibmr.type != IB_MR_TYPE_DMA) + xa_destroy(&mr->page_list); } diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index a754fc902e3d..7b41d79e72b2 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -98,11 +98,11 @@ enum rxe_device_param { RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX, RXE_MIN_MR_INDEX = 0x00000001, - RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE, - RXE_MAX_MR = DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX, - RXE_MIN_MW_INDEX = 0x00010001, - RXE_MAX_MW_INDEX = 0x00020000, - RXE_MAX_MW = 0x00001000, + RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1, + RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX, + RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1, + RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE, + RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX, RXE_MAX_PKT_PER_ACK = 64, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index f50620f5a0a1..6215c6de3a84 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -23,16 +23,16 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_UCONTEXT, + .max_elem = RXE_MAX_UCONTEXT, }, [RXE_TYPE_PD] = { .name = "pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_PD, + .max_elem = RXE_MAX_PD, }, [RXE_TYPE_AH] = { .name = "ah", @@ -40,7 +40,7 @@ static const struct rxe_type_info { .elem_offset = offsetof(struct rxe_ah, elem), .min_index = RXE_MIN_AH_INDEX, .max_index = RXE_MAX_AH_INDEX, - .max_elem = 
RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, + .max_elem = RXE_MAX_AH, }, [RXE_TYPE_SRQ] = { .name = "srq", @@ -49,7 +49,7 @@ static const struct rxe_type_info { .cleanup = rxe_srq_cleanup, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, - .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, + .max_elem = RXE_MAX_SRQ, }, [RXE_TYPE_QP] = { .name = "qp", @@ -58,7 +58,7 @@ static const struct rxe_type_info { .cleanup = rxe_qp_cleanup, .min_index = RXE_MIN_QP_INDEX, .max_index = RXE_MAX_QP_INDEX, - .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, + .max_elem = RXE_MAX_QP, }, [RXE_TYPE_CQ] = { .name = "cq", @@ -66,8 +66,8 @@ static const struct rxe_type_info { .elem_offset = offsetof(struct rxe_cq, elem), .cleanup = rxe_cq_cleanup, .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_CQ, + .max_elem = RXE_MAX_CQ, }, [RXE_TYPE_MR] = { .name = "mr", @@ -76,7 +76,7 @@ static const struct rxe_type_info { .cleanup = rxe_mr_cleanup, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, - .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, + .max_elem = RXE_MAX_MR, }, [RXE_TYPE_MW] = { .name = "mw", @@ -85,7 +85,7 @@ static const struct rxe_type_info { .cleanup = rxe_mw_cleanup, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, - .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, + .max_elem = RXE_MAX_MW, }, }; @@ -116,55 +116,12 @@ void rxe_pool_cleanup(struct rxe_pool *pool) WARN_ON(!xa_empty(&pool->xa)); } -void *rxe_alloc(struct rxe_pool *pool) -{ - struct rxe_pool_elem *elem; - void *obj; - int err; - - if (WARN_ON(!(pool->type == RXE_TYPE_MR))) - return NULL; - - if (atomic_inc_return(&pool->num_elem) > pool->max_elem) - goto err_cnt; - - obj = kzalloc(pool->elem_size, GFP_KERNEL); - if (!obj) - goto err_cnt; - - elem = (struct rxe_pool_elem *)((u8 *)obj + pool->elem_offset); - - elem->pool = pool; - elem->obj = obj; - kref_init(&elem->ref_cnt); - init_completion(&elem->complete); - - /* allocate index in array but leave pointer as NULL so it - * can't be looked up until rxe_finalize() is called - */ - err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, - &pool->next, GFP_KERNEL); - if (err < 0) - goto err_free; - - return obj; - -err_free: - kfree(obj); -err_cnt: - atomic_dec(&pool->num_elem); - return NULL; -} - int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable) { int err; gfp_t gfp_flags; - if (WARN_ON(pool->type == RXE_TYPE_MR)) - return -EINVAL; - if (atomic_inc_return(&pool->num_elem) > pool->max_elem) goto err_cnt; @@ -275,9 +232,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) if (pool->cleanup) pool->cleanup(elem); - if (pool->type == RXE_TYPE_MR) - kfree_rcu(elem->obj); - atomic_dec(&pool->num_elem); return err; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 9d83cb32092f..b42e26427a70 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -54,9 +54,6 @@ void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, /* free resources from object pool */ void rxe_pool_cleanup(struct rxe_pool *pool); -/* allocate an object from pool */ -void *rxe_alloc(struct rxe_pool *pool); - /* connect already allocated object to pool */ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index ed44042782fa..c711cb98b949 100644 --- 
a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -35,19 +35,26 @@ /** * enum queue_type - type of queue * @QUEUE_TYPE_TO_CLIENT: Queue is written by rxe driver and - * read by client. Used by rxe driver only. + * read by client which may be a user space + * application or a kernel ulp. + * Used by rxe internals only. * @QUEUE_TYPE_FROM_CLIENT: Queue is written by client and - * read by rxe driver. Used by rxe driver only. - * @QUEUE_TYPE_TO_DRIVER: Queue is written by client and - * read by rxe driver. Used by kernel client only. - * @QUEUE_TYPE_FROM_DRIVER: Queue is written by rxe driver and - * read by client. Used by kernel client only. + * read by rxe driver. + * Used by rxe internals only. + * @QUEUE_TYPE_FROM_ULP: Queue is written by kernel ulp and + * read by rxe driver. + * Used by kernel verbs APIs only on + * behalf of ulps. + * @QUEUE_TYPE_TO_ULP: Queue is written by rxe driver and + * read by kernel ulp. + * Used by kernel verbs APIs only on + * behalf of ulps. */ enum queue_type { QUEUE_TYPE_TO_CLIENT, QUEUE_TYPE_FROM_CLIENT, - QUEUE_TYPE_TO_DRIVER, - QUEUE_TYPE_FROM_DRIVER, + QUEUE_TYPE_FROM_ULP, + QUEUE_TYPE_TO_ULP, }; struct rxe_queue_buf; @@ -62,9 +69,9 @@ struct rxe_queue { u32 index_mask; enum queue_type type; /* private copy of index for shared queues between - * kernel space and user space. Kernel reads and writes + * driver and clients. Driver reads and writes * this copy and then replicates to rxe_queue_buf - * for read access by user space. + * for read access by clients. */ u32 index; }; @@ -97,19 +104,21 @@ static inline u32 queue_get_producer(const struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - /* protect user index */ + /* used by rxe, client owns the index */ prod = smp_load_acquire(&q->buf->producer_index); break; case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ prod = q->index; break; - case QUEUE_TYPE_FROM_DRIVER: - /* protect driver index */ - prod = smp_load_acquire(&q->buf->producer_index); - break; - case QUEUE_TYPE_TO_DRIVER: + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ prod = q->buf->producer_index; break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + prod = smp_load_acquire(&q->buf->producer_index); + break; } return prod; @@ -122,19 +131,21 @@ static inline u32 queue_get_consumer(const struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe which owns the index */ cons = q->index; break; case QUEUE_TYPE_TO_CLIENT: - /* protect user index */ + /* used by rxe, client owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; - case QUEUE_TYPE_FROM_DRIVER: - cons = q->buf->consumer_index; - break; - case QUEUE_TYPE_TO_DRIVER: - /* protect driver index */ + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ + cons = q->buf->consumer_index; + break; } return cons; @@ -172,24 +183,31 @@ static inline void queue_advance_producer(struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - pr_warn("%s: attempt to advance client index\n", - __func__); + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); break; case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ prod = q->index; prod = (prod + 1) & q->index_mask; q->index = prod; - /* protect user index */ + /* 
release so client can read it safely */ smp_store_release(&q->buf->producer_index, prod); break; - case QUEUE_TYPE_FROM_DRIVER: - pr_warn("%s: attempt to advance driver index\n", - __func__); - break; - case QUEUE_TYPE_TO_DRIVER: + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ prod = q->buf->producer_index; prod = (prod + 1) & q->index_mask; - q->buf->producer_index = prod; + /* release so rxe can read it safely */ + smp_store_release(&q->buf->producer_index, prod); + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); break; } } @@ -201,24 +219,30 @@ static inline void queue_advance_consumer(struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - cons = q->index; - cons = (cons + 1) & q->index_mask; + /* used by rxe which owns the index */ + cons = (q->index + 1) & q->index_mask; q->index = cons; - /* protect user index */ + /* release so client can read it safely */ smp_store_release(&q->buf->consumer_index, cons); break; case QUEUE_TYPE_TO_CLIENT: - pr_warn("%s: attempt to advance client index\n", - __func__); + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); break; - case QUEUE_TYPE_FROM_DRIVER: + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ cons = q->buf->consumer_index; cons = (cons + 1) & q->index_mask; - q->buf->consumer_index = cons; - break; - case QUEUE_TYPE_TO_DRIVER: - pr_warn("%s: attempt to advance driver index\n", - __func__); + /* release so rxe can read it safely */ + smp_store_release(&q->buf->consumer_index, cons); break; } } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c74972244f08..0cc1ba91d48c 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -10,43 +10,6 @@ #include "rxe_loc.h" #include "rxe_queue.h" -enum resp_states { - RESPST_NONE, - RESPST_GET_REQ, - RESPST_CHK_PSN, - RESPST_CHK_OP_SEQ, - RESPST_CHK_OP_VALID, - RESPST_CHK_RESOURCE, - RESPST_CHK_LENGTH, - RESPST_CHK_RKEY, - RESPST_EXECUTE, - RESPST_READ_REPLY, - RESPST_ATOMIC_REPLY, - RESPST_ATOMIC_WRITE_REPLY, - RESPST_PROCESS_FLUSH, - RESPST_COMPLETE, - RESPST_ACKNOWLEDGE, - RESPST_CLEANUP, - RESPST_DUPLICATE_REQUEST, - RESPST_ERR_MALFORMED_WQE, - RESPST_ERR_UNSUPPORTED_OPCODE, - RESPST_ERR_MISALIGNED_ATOMIC, - RESPST_ERR_PSN_OUT_OF_SEQ, - RESPST_ERR_MISSING_OPCODE_FIRST, - RESPST_ERR_MISSING_OPCODE_LAST_C, - RESPST_ERR_MISSING_OPCODE_LAST_D1E, - RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, - RESPST_ERR_RNR, - RESPST_ERR_RKEY_VIOLATION, - RESPST_ERR_INVALIDATE_RKEY, - RESPST_ERR_LENGTH, - RESPST_ERR_CQ_OVERFLOW, - RESPST_ERROR, - RESPST_RESET, - RESPST_DONE, - RESPST_EXIT, -}; - static char *resp_state_name[] = { [RESPST_NONE] = "NONE", [RESPST_GET_REQ] = "GET_REQ", @@ -457,13 +420,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, return RESPST_CHK_RKEY; } +/* if the reth length field is zero we can assume nothing + * about the rkey value and should not validate or use it. + * Instead set qp->resp.rkey to 0 which is an invalid rkey + * value since the minimum index part is 1. 
+ */ static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { + unsigned int length = reth_len(pkt); + qp->resp.va = reth_va(pkt); qp->resp.offset = 0; - qp->resp.rkey = reth_rkey(pkt); - qp->resp.resid = reth_len(pkt); - qp->resp.length = reth_len(pkt); + qp->resp.resid = length; + qp->resp.length = length; + if (pkt->mask & RXE_READ_OR_WRITE_MASK && length == 0) + qp->resp.rkey = 0; + else + qp->resp.rkey = reth_rkey(pkt); } static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) @@ -474,6 +447,10 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.resid = sizeof(u64); } +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL + * if an invalid rkey is received or the rdma length is zero. For middle + * or last packets use the stored value of mr. + */ static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -510,10 +487,12 @@ static enum resp_states check_rkey(struct rxe_qp *qp, return RESPST_EXECUTE; } - /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ + /* A zero-byte read or write op is not required to + * set an addr or rkey. See C9-88 + */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && - (pkt->mask & RXE_RETH_MASK) && - reth_len(pkt) == 0) { + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { + qp->resp.mr = NULL; return RESPST_EXECUTE; } @@ -592,6 +571,7 @@ skip_check_range: return RESPST_EXECUTE; err: + qp->resp.mr = NULL; if (mr) rxe_put(mr); if (mw) @@ -725,17 +705,12 @@ static enum resp_states process_flush(struct rxe_qp *qp, return RESPST_ACKNOWLEDGE; } -/* Guarantee atomicity of atomic operations at the machine level. */ -static DEFINE_SPINLOCK(atomic_ops_lock); - static enum resp_states atomic_reply(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) + struct rxe_pkt_info *pkt) { - u64 *vaddr; - enum resp_states ret; struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; - u64 value; + int err; if (!res) { res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); @@ -743,32 +718,14 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, } if (!res->replay) { - if (mr->state != RXE_MR_STATE_VALID) { - ret = RESPST_ERR_RKEY_VIOLATION; - goto out; - } - - vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, - sizeof(u64)); - - /* check vaddr is 8 bytes aligned. 
*/ - if (!vaddr || (uintptr_t)vaddr & 7) { - ret = RESPST_ERR_MISALIGNED_ATOMIC; - goto out; - } + u64 iova = qp->resp.va + qp->resp.offset; - spin_lock_bh(&atomic_ops_lock); - res->atomic.orig_val = value = *vaddr; - - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { - if (value == atmeth_comp(pkt)) - value = atmeth_swap_add(pkt); - } else { - value += atmeth_swap_add(pkt); - } - - *vaddr = value; - spin_unlock_bh(&atomic_ops_lock); + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); + if (err) + return err; qp->resp.msn++; @@ -780,35 +737,35 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, qp->resp.status = IB_WC_SUCCESS; } - ret = RESPST_ACKNOWLEDGE; -out: - return ret; + return RESPST_ACKNOWLEDGE; } -#ifdef CONFIG_64BIT -static enum resp_states do_atomic_write(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states atomic_write_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { - struct rxe_mr *mr = qp->resp.mr; - int payload = payload_size(pkt); - u64 src, *dst; - - if (mr->state != RXE_MR_STATE_VALID) - return RESPST_ERR_RKEY_VIOLATION; + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr; + u64 value; + u64 iova; + int err; - memcpy(&src, payload_addr(pkt), payload); + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); + qp->resp.res = res; + } - dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload); - /* check vaddr is 8 bytes aligned. */ - if (!dst || (uintptr_t)dst & 7) - return RESPST_ERR_MISALIGNED_ATOMIC; + if (res->replay) + return RESPST_ACKNOWLEDGE; - /* Do atomic write after all prior operations have completed */ - smp_store_release(dst, src); + mr = qp->resp.mr; + value = *(u64 *)payload_addr(pkt); + iova = qp->resp.va + qp->resp.offset; - /* decrease resp.resid to zero */ - qp->resp.resid -= sizeof(payload); + err = rxe_mr_do_atomic_write(mr, iova, value); + if (err) + return err; + qp->resp.resid = 0; qp->resp.msn++; /* next expected psn, read handles this separately */ @@ -817,29 +774,8 @@ static enum resp_states do_atomic_write(struct rxe_qp *qp, qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; - return RESPST_ACKNOWLEDGE; -} -#else -static enum resp_states do_atomic_write(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; -} -#endif /* CONFIG_64BIT */ -static enum resp_states atomic_write_reply(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - struct resp_res *res = qp->resp.res; - - if (!res) { - res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); - qp->resp.res = res; - } - - if (res->replay) - return RESPST_ACKNOWLEDGE; - return do_atomic_write(qp, pkt); + return RESPST_ACKNOWLEDGE; } static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, @@ -966,7 +902,11 @@ static enum resp_states read_reply(struct rxe_qp *qp, } if (res->state == rdatm_res_state_new) { - if (!res->replay) { + if (!res->replay || qp->resp.length == 0) { + /* if length == 0 mr will be NULL (is ok) + * otherwise qp->resp.mr holds a ref on mr + * which we transfer to mr and drop below. + */ mr = qp->resp.mr; qp->resp.mr = NULL; } else { @@ -980,6 +920,10 @@ static enum resp_states read_reply(struct rxe_qp *qp, else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; } else { + /* re-lookup mr from rkey on all later packets. + * length will be non-zero. This can fail if someone + * modifies or destroys the mr since the first packet. 
+ */ mr = rxe_recheck_mr(qp, res->read.rkey); if (!mr) return RESPST_ERR_RKEY_VIOLATION; @@ -997,18 +941,16 @@ static enum resp_states read_reply(struct rxe_qp *qp, skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { - if (mr) - rxe_put(mr); - return RESPST_ERR_RNR; + state = RESPST_ERR_RNR; + goto err_out; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - if (mr) - rxe_put(mr); if (err) { kfree_skb(skb); - return RESPST_ERR_RKEY_VIOLATION; + state = RESPST_ERR_RKEY_VIOLATION; + goto err_out; } if (bth_pad(&ack_pkt)) { @@ -1017,9 +959,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, memset(pad, 0, bth_pad(&ack_pkt)); } + /* rxe_xmit_packet always consumes the skb */ err = rxe_xmit_packet(qp, &ack_pkt, skb); - if (err) - return RESPST_ERR_RNR; + if (err) { + state = RESPST_ERR_RNR; + goto err_out; + } res->read.va += payload; res->read.resid -= payload; @@ -1036,6 +981,9 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_CLEANUP; } +err_out: + if (mr) + rxe_put(mr); return state; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 025b35bf014e..e14050a69276 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -245,7 +245,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) int num_sge = ibwr->num_sge; int full; - full = queue_full(rq->queue, QUEUE_TYPE_TO_DRIVER); + full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) return -ENOMEM; @@ -256,7 +256,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; - recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_TO_DRIVER); + recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; memcpy(recv_wqe->dma.sge, ibwr->sg_list, @@ -268,7 +268,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; - queue_advance_producer(rq->queue, QUEUE_TYPE_TO_DRIVER); + queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; } @@ -623,17 +623,17 @@ static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, spin_lock_irqsave(&qp->sq.sq_lock, flags); - full = queue_full(sq->queue, QUEUE_TYPE_TO_DRIVER); + full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { spin_unlock_irqrestore(&qp->sq.sq_lock, flags); return -ENOMEM; } - send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_TO_DRIVER); + send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); init_send_wqe(qp, ibwr, mask, length, send_wqe); - queue_advance_producer(sq->queue, QUEUE_TYPE_TO_DRIVER); + queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); spin_unlock_irqrestore(&qp->sq.sq_lock, flags); @@ -821,12 +821,12 @@ static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { - cqe = queue_head(cq->queue, QUEUE_TYPE_FROM_DRIVER); + cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; memcpy(wc++, &cqe->ibwc, sizeof(*wc)); - queue_advance_consumer(cq->queue, QUEUE_TYPE_FROM_DRIVER); + queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); @@ -838,7 +838,7 @@ static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) struct rxe_cq *cq = to_rcq(ibcq); 
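The rxe_queue.h and rxe_verbs.c hunks above boil down to one ownership rule: each side of a shared queue advances only the index it owns, publishes that index with smp_store_release(), and reads the peer-owned index with smp_load_acquire(). The sketch below illustrates that pairing on a minimal single-producer/single-consumer ring; it is not rxe code, and every name in it (demo_ring, demo_produce, demo_consume) is invented for the example.

/* Illustrative sketch only: a two-party ring in which each side owns one
 * index, mirroring the convention documented in the rxe queue rework.
 * None of these identifiers exist in the kernel tree.
 */
#include <linux/types.h>
#include <asm/barrier.h>

struct demo_ring {
	u32 producer_index;	/* written only by the producer side */
	u32 consumer_index;	/* written only by the consumer side */
	u32 index_mask;		/* ring size - 1, power of two */
	void *slot[];
};

/* Producer: read the peer-owned consumer index with acquire semantics,
 * publish the locally owned producer index with release semantics so the
 * consumer observes the slot contents before it observes the new index.
 */
static bool demo_produce(struct demo_ring *r, void *entry)
{
	u32 prod = r->producer_index;
	u32 cons = smp_load_acquire(&r->consumer_index);

	if (((prod + 1) & r->index_mask) == cons)
		return false;			/* ring full */

	r->slot[prod] = entry;
	smp_store_release(&r->producer_index, (prod + 1) & r->index_mask);
	return true;
}

/* Consumer: the mirror image of the above. */
static void *demo_consume(struct demo_ring *r)
{
	u32 cons = r->consumer_index;
	u32 prod = smp_load_acquire(&r->producer_index);
	void *entry;

	if (cons == prod)
		return NULL;			/* ring empty */

	entry = r->slot[cons];
	smp_store_release(&r->consumer_index, (cons + 1) & r->index_mask);
	return entry;
}

The release on the owner's store guarantees the slot data is visible before the updated index, which mirrors what the QUEUE_TYPE_TO_ULP/QUEUE_TYPE_FROM_ULP comments in the hunk describe for the ulp and driver sides.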
int count; - count = queue_count(cq->queue, QUEUE_TYPE_FROM_DRIVER); + count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } @@ -854,7 +854,7 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = flags & IB_CQ_SOLICITED_MASK; - empty = queue_empty(cq->queue, QUEUE_TYPE_FROM_DRIVER); + empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; @@ -869,10 +869,17 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; + int err; - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -880,8 +887,12 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_mr_init_dma(access, mr); rxe_finalize(mr); - return &mr->ibmr; + +err_free: + kfree(mr); +err_out: + return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, @@ -895,9 +906,15 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -905,14 +922,16 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) - goto err1; + goto err_cleanup; rxe_finalize(mr); - return &mr->ibmr; -err1: +err_cleanup: rxe_cleanup(mr); +err_free: + kfree(mr); +err_out: return ERR_PTR(err); } @@ -927,9 +946,15 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, if (mr_type != IB_MR_TYPE_MEM_REG) return ERR_PTR(-EINVAL); - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -937,53 +962,19 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, err = rxe_mr_init_fast(max_num_sg, mr); if (err) - goto err1; + goto err_cleanup; rxe_finalize(mr); - return &mr->ibmr; -err1: +err_cleanup: rxe_cleanup(mr); +err_free: + kfree(mr); +err_out: return ERR_PTR(err); } -static int rxe_set_page(struct ib_mr *ibmr, u64 addr) -{ - struct rxe_mr *mr = to_rmr(ibmr); - struct rxe_map *map; - struct rxe_phys_buf *buf; - - if (unlikely(mr->nbuf == mr->num_buf)) - return -ENOMEM; - - map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; - buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; - - buf->addr = addr; - buf->size = ibmr->page_size; - mr->nbuf++; - - return 0; -} - -static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, - int sg_nents, unsigned int *sg_offset) -{ - struct rxe_mr *mr = to_rmr(ibmr); - int n; - - mr->nbuf = 0; - - n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); - - mr->page_shift = ilog2(ibmr->page_size); - mr->page_mask = ibmr->page_size - 1; - mr->offset = ibmr->iova & mr->page_mask; - - return n; -} - static ssize_t parent_show(struct device *device, struct 
device_attribute *attr, char *buf) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 19ddfa890480..c269ae2a3224 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -283,17 +283,6 @@ enum rxe_mr_lookup_type { RXE_LOOKUP_REMOTE, }; -#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) - -struct rxe_phys_buf { - u64 addr; - u64 size; -}; - -struct rxe_map { - struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; -}; - static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; @@ -310,25 +299,24 @@ struct rxe_mr { u32 lkey; u32 rkey; enum rxe_mr_state state; - u32 offset; int access; + atomic_t num_mw; - int page_shift; - int page_mask; - int map_shift; - int map_mask; + unsigned int page_offset; + unsigned int page_shift; + u64 page_mask; u32 num_buf; u32 nbuf; - u32 max_buf; - u32 num_map; - - atomic_t num_mw; - - struct rxe_map **map; + struct xarray page_list; }; +static inline unsigned int mr_page_size(struct rxe_mr *mr) +{ + return mr ? mr->ibmr.page_size : PAGE_SIZE; +} + enum rxe_mw_state { RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID, RXE_MW_STATE_FREE = RXE_MR_STATE_FREE, diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index f88d2971c2c6..da530c0404da 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -16,6 +16,7 @@ #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> +#include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> @@ -109,6 +110,8 @@ static void siw_rtr_data_ready(struct sock *sk) struct siw_qp *qp = NULL; read_descriptor_t rd_desc; + trace_sk_data_ready(sk); + read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); @@ -1216,6 +1219,8 @@ static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; + trace_sk_data_ready(sk); + read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index b2b33dd3b4fa..f51ab2ccf151 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -398,7 +398,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) { + if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) { rv = -ENOMEM; goto out_sem_up; } @@ -411,30 +411,27 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) goto out_sem_up; } for (i = 0; num_pages; i++) { - int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK); - - umem->page_chunk[i].plist = + int nents = min_t(int, num_pages, PAGES_PER_CHUNK); + struct page **plist = kcalloc(nents, sizeof(struct page *), GFP_KERNEL); - if (!umem->page_chunk[i].plist) { + + if (!plist) { rv = -ENOMEM; goto out_sem_up; } - got = 0; + umem->page_chunk[i].plist = plist; while (nents) { - struct page **plist = &umem->page_chunk[i].plist[got]; - rv = pin_user_pages(first_page_va, nents, foll_flags, plist, NULL); if (rv < 0) goto out_sem_up; umem->num_pages += rv; - atomic64_add(rv, &mm_s->pinned_vm); first_page_va += rv * PAGE_SIZE; + plist += rv; nents -= rv; - got += rv; + num_pages -= rv; } - num_pages -= got; } out_sem_up: mmap_read_unlock(mm_s); @@ -442,6 +439,10 @@ out_sem_up: if (rv > 0) return umem; + /* Adjust accounting for pages not pinned */ + if (num_pages) + atomic64_sub(num_pages, &mm_s->pinned_vm); + siw_umem_release(umem, false); 
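The siw_umem_get() hunk above switches the pinned-page accounting to a reserve-first scheme: the whole request is charged against pinned_vm with atomic64_add_return() before any page is pinned, and whatever ends up not pinned is subtracted again on the way out. The condensed sketch below shows that pattern with invented helper names (demo_charge_pages, demo_uncharge_unpinned); it folds the refund of a rejected request into the charge helper rather than deferring it to a common exit path as the hunk does.

/* Sketch of reserve-then-refund accounting against a shared counter.
 * Only atomic64_add_return()/atomic64_sub() are real kernel primitives;
 * the helpers themselves are invented for this example.
 */
#include <linux/atomic.h>
#include <linux/errno.h>

static int demo_charge_pages(atomic64_t *pinned_vm, s64 limit,
			     s64 requested, s64 *charged)
{
	/* Reserve the whole request before pinning anything. */
	if (atomic64_add_return(requested, pinned_vm) > limit) {
		atomic64_sub(requested, pinned_vm);
		return -ENOMEM;
	}
	*charged = requested;
	return 0;
}

static void demo_uncharge_unpinned(atomic64_t *pinned_vm, s64 charged,
				   s64 actually_pinned)
{
	/* Give back the part of the reservation that was never pinned. */
	if (actually_pinned < charged)
		atomic64_sub(charged - actually_pinned, pinned_vm);
}

One benefit of charging up front with a single atomic add-and-test is that two concurrent callers can no longer both pass a separate read-then-add check while together exceeding the limit.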
return ERR_PTR(rv); diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c index e6f634971228..81e9bbd9ebda 100644 --- a/drivers/infiniband/sw/siw/siw_qp.c +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -10,6 +10,7 @@ #include <linux/llist.h> #include <asm/barrier.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include "siw.h" #include "siw_verbs.h" @@ -94,6 +95,8 @@ void siw_qp_llp_data_ready(struct sock *sk) { struct siw_qp *qp; + trace_sk_data_ready(sk); + read_lock(&sk->sk_callback_lock); if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ac25fc80fb33..cf8b0822f5c8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -742,7 +742,7 @@ void ipoib_flush_paths(struct net_device *dev) static void path_rec_completion(int status, struct sa_path_rec *pathrec, - int num_prs, void *path_ptr) + unsigned int num_prs, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; @@ -2200,6 +2200,14 @@ int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; + + rc = netif_set_real_num_tx_queues(dev, 1); + if (rc) + goto out; + + rc = netif_set_real_num_rx_queues(dev, 1); + if (rc) + goto out; } priv->rn_ops = dev->netdev_ops; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 620ae5b2d80d..6b7603765383 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -446,7 +446,7 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, * @is_leading: indicate if this is the session leading connection (MCS) * * Return: zero on success, $error if iscsi_conn_bind fails and - * -EINVAL in case end-point doesn't exsits anymore or iser connection + * -EINVAL in case end-point doesn't exists anymore or iser connection * state is not UP (teardown already started). 
*/ static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index c76ba29da1e2..5adba0f754b6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -312,9 +312,8 @@ void rtrs_srv_destroy_path_files(struct rtrs_srv_path *srv_path) if (srv_path->kobj.state_in_sysfs) { sysfs_remove_group(&srv_path->kobj, &rtrs_srv_path_attr_group); - kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); + rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } - rtrs_srv_destroy_once_sysfs_root_folders(srv_path); } diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index b4d6a4a5ae81..df21b30b7735 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -699,7 +699,7 @@ static void srp_free_ch_ib(struct srp_target_port *target, static void srp_path_rec_completion(int status, struct sa_path_rec *pathrec, - int num_paths, void *ch_ptr) + unsigned int num_paths, void *ch_ptr) { struct srp_rdma_ch *ch = ch_ptr; struct srp_target_port *target = ch->target; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 00b0068fda20..5d94db453df3 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -62,9 +62,6 @@ enum { SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, - SRP_TAG_NO_REQ = ~0U, - SRP_TAG_TSK_MGMT = 1U << 31, - SRP_MAX_PAGES_PER_MR = 512, SRP_MAX_ADD_CDB_LEN = 16, @@ -79,6 +76,11 @@ enum { sizeof(struct srp_imm_buf), }; +enum { + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = BIT(31), +}; + enum srp_target_state { SRP_TARGET_SCANNING, SRP_TARGET_LIVE, |
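The ib_srp.h hunk above moves SRP_TAG_NO_REQ and SRP_TAG_TSK_MGMT into their own enum and spells the task-management tag as BIT(31). The hunk does not state its rationale; a plausible reading is that values such as ~0U and BIT(31) do not fit in a plain signed int, so keeping them apart from the small size constants avoids mixing enumerator ranges, while BIT() makes the single-high-bit intent explicit. A hedged sketch of the same layout, with invented names:

#include <linux/bits.h>

/* Small, int-sized resource limits stay grouped together ... */
enum {
	DEMO_QUEUE_SIZE = 64,
	DEMO_MAX_SGES   = 16,
};

/* ... while sentinel tags that need the full 32-bit range get their own
 * enum, written with BIT() so the single flag bit is obvious at a glance.
 */
enum {
	DEMO_TAG_NO_REQ   = ~0U,
	DEMO_TAG_TSK_MGMT = BIT(31),
};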