Diffstat (limited to 'drivers/infiniband/core')
28 files changed, 1804 insertions, 1298 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 24cb71a16a28..ccf2670ef45e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -17,7 +17,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o -ib_cm-y := cm.o +ib_cm-y := cm.o cm_trace.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 3a98439bba83..0abce004a959 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -647,13 +647,12 @@ static void process_one_req(struct work_struct *_work) req->callback = NULL; spin_lock_bh(&lock); + /* + * Although the work will normally have been canceled by the workqueue, + * it can still be requeued as long as it is on the req_list. + */ + cancel_delayed_work(&req->work); if (!list_empty(&req->list)) { - /* - * Although the work will normally have been canceled by the - * workqueue, it can still be requeued as long as it is on the - * req_list. - */ - cancel_delayed_work(&req->work); list_del_init(&req->list); kfree(req); } diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 5a76611e684a..8017c40dd110 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -133,7 +133,11 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) } static const char * const gid_type_str[] = { + /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for + * user space compatibility reasons. + */ [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; @@ -1220,7 +1224,7 @@ EXPORT_SYMBOL(ib_get_cached_port_state); const struct ib_gid_attr * rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index) { - const struct ib_gid_attr *attr = ERR_PTR(-EINVAL); + const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; @@ -1244,6 +1248,67 @@ done: EXPORT_SYMBOL(rdma_get_gid_attr); /** + * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. + * @device: The device to query. + * @entries: Entries where GID entries are returned. + * @max_entries: Maximum number of entries that can be returned. + * Entries array must be allocated to hold max_entries number of entries. + * @num_entries: Updated to the number of entries that were successfully read. + * + * Returns number of entries on success or appropriate error code. 
+ */ +ssize_t rdma_query_gid_table(struct ib_device *device, + struct ib_uverbs_gid_entry *entries, + size_t max_entries) +{ + const struct ib_gid_attr *gid_attr; + ssize_t num_entries = 0, ret; + struct ib_gid_table *table; + unsigned int port_num, i; + struct net_device *ndev; + unsigned long flags; + + rdma_for_each_port(device, port_num) { + if (!rdma_ib_or_roce(device, port_num)) + continue; + + table = rdma_gid_table(device, port_num); + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + if (!is_gid_entry_valid(table->data_vec[i])) + continue; + if (num_entries >= max_entries) { + ret = -EINVAL; + goto err; + } + + gid_attr = &table->data_vec[i]->attr; + + memcpy(&entries->gid, &gid_attr->gid, + sizeof(gid_attr->gid)); + entries->gid_index = gid_attr->index; + entries->port_num = gid_attr->port_num; + entries->gid_type = gid_attr->gid_type; + ndev = rcu_dereference_protected( + gid_attr->ndev, + lockdep_is_held(&table->rwlock)); + if (ndev) + entries->netdev_ifindex = ndev->ifindex; + + num_entries++; + entries++; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return num_entries; +err: + read_unlock_irqrestore(&table->rwlock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_query_gid_table); + +/** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. @@ -1299,7 +1364,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; - struct net_device *ndev = ERR_PTR(-ENODEV); + struct net_device *ndev = ERR_PTR(-EINVAL); u8 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; @@ -1311,8 +1376,7 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); - if (!ndev || - (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0))) + if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index fbc28f1a8b92..5740d1ba3568 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -27,6 +27,7 @@ #include <rdma/ib_cm.h> #include "cm_msgs.h" #include "core_priv.h" +#include "cm_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); @@ -201,7 +202,6 @@ static struct attribute *cm_counter_default_attrs[] = { struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; - struct kobject port_obj; u8 port_num; struct list_head cm_priv_prim_list; struct list_head cm_priv_altr_list; @@ -1563,6 +1563,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + trace_icm_send_req(&cm_id_priv->id); spin_lock_irqsave(&cm_id_priv->lock, flags); ret = ib_post_send_mad(cm_id_priv->msg, NULL); if (ret) { @@ -1610,6 +1611,9 @@ static int cm_issue_rej(struct cm_port *port, IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } + trace_icm_issue_rej( + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); @@ -1961,6 +1965,7 @@ static void cm_dup_req_handler(struct cm_work *work, } 
spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; @@ -2124,8 +2129,7 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { - pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id)); + trace_icm_no_listener_err(&cm_id_priv->id); cm_id_priv->id.state = IB_CM_IDLE; ret = -EINVAL; goto destroy; @@ -2274,8 +2278,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ret = -EINVAL; goto out; } @@ -2289,6 +2292,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -2348,8 +2352,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_send_cm_rtu_err(cm_id); ret = -EINVAL; goto error; } @@ -2361,6 +2364,7 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_rtu(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -2442,6 +2446,7 @@ static void cm_dup_rep_handler(struct cm_work *work) goto unlock; spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; @@ -2465,7 +2470,7 @@ static int cm_rep_handler(struct cm_work *work) cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + trace_icm_remote_no_priv_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); return -EINVAL; } @@ -2479,11 +2484,10 @@ static int cm_rep_handler(struct cm_work *work) break; default: ret = -EINVAL; - pr_debug( - "%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", - __func__, cm_id_priv->id.state, + trace_icm_rep_unknown_err( IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), - IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); spin_unlock_irq(&cm_id_priv->lock); goto error; } @@ -2500,7 +2504,7 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("%s: Failed to insert remote id %d\n", __func__, + trace_icm_insert_failed_err( IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); goto error; } @@ -2517,9 +2521,8 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug( - "%s: Stale connection. 
local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + trace_icm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { @@ -2646,9 +2649,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, return -EINVAL; if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_dreq_skipped(&cm_id_priv->id); return -EINVAL; } @@ -2667,6 +2668,7 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); @@ -2722,10 +2724,7 @@ static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, return -EINVAL; if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { - pr_debug( - "%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_send_drep_err(&cm_id_priv->id); kfree(private_data); return -EINVAL; } @@ -2740,6 +2739,7 @@ static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_drep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); @@ -2789,6 +2789,9 @@ static int cm_issue_drep(struct cm_port *port, IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + trace_icm_issue_drep( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); @@ -2810,9 +2813,8 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug( - "%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", - __func__, IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + trace_icm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); return -EINVAL; } @@ -2858,9 +2860,7 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_dreq_unknown_err(&cm_id_priv->id); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2945,12 +2945,11 @@ static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, state); break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_send_unknown_rej_err(&cm_id_priv->id); return -EINVAL; } + trace_icm_send_rej(&cm_id_priv->id, reason); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); @@ -3060,9 +3059,7 @@ static int cm_rej_handler(struct cm_work *work) } fallthrough; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_rej_unknown_err(&cm_id_priv->id); spin_unlock_irq(&cm_id_priv->lock); goto out; } @@ -3118,9 +3115,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, } fallthrough; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_send_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto error1; } @@ -3133,6 +3128,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, msg_response, service_timeout, private_data, private_data_len); + trace_icm_send_mra(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) goto error2; @@ -3229,9 +3225,7 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); fallthrough; default: - pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_mra_unknown_err(&cm_id_priv->id); goto out; } @@ -3505,10 +3499,12 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state == IB_CM_IDLE) + if (cm_id->state == IB_CM_IDLE) { + trace_icm_send_sidr_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); - else + } else { ret = -EINVAL; + } if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -3670,6 +3666,7 @@ static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); + trace_icm_send_sidr_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_free_msg(msg); @@ -3767,8 +3764,7 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) goto discard; - pr_debug_ratelimited("CM: failed sending MAD in state %d. 
(%s)\n", - state, ib_wc_status_msg(wc_status)); + trace_icm_mad_send_err(state, wc_status); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -3891,7 +3887,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: - pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); + trace_icm_handler_err(work->cm_event.event); ret = -EINVAL; break; } @@ -3927,8 +3923,7 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_establish_err(cm_id); ret = -EINVAL; break; } @@ -4125,9 +4120,7 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_init_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4175,9 +4168,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rtr_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4237,9 +4228,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rts_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4295,20 +4284,6 @@ static struct kobj_type cm_counter_obj_type = { .default_attrs = cm_counter_default_attrs }; -static char *cm_devnode(struct device *dev, umode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - -struct class cm_class = { - .owner = THIS_MODULE, - .name = "infiniband_cm", - .devnode = cm_devnode, -}; -EXPORT_SYMBOL(cm_class); - static int cm_create_port_fs(struct cm_port *port) { int i, ret; @@ -4511,12 +4486,6 @@ static int __init ib_cm_init(void) get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); - ret = class_register(&cm_class); - if (ret) { - ret = -ENOMEM; - goto error1; - } - cm.wq = alloc_workqueue("ib_cm", 0, 1); if (!cm.wq) { ret = -ENOMEM; @@ -4531,8 +4500,6 @@ static int __init ib_cm_init(void) error3: destroy_workqueue(cm.wq); error2: - class_unregister(&cm_class); -error1: return ret; } @@ -4553,7 +4520,6 @@ static void __exit ib_cm_cleanup(void) kfree(timewait_info); } - class_unregister(&cm_class); WARN_ON(!xa_empty(&cm.local_id_table)); } diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c new file mode 100644 index 000000000000..8f3482f66338 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the IB Connection Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. 
+ */ + +#include <rdma/rdma_cm.h> +#include "cma_priv.h" + +#define CREATE_TRACE_POINTS + +#include "cm_trace.h" diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h new file mode 100644 index 000000000000..e9d282679ef1 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.h @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ib_cma + +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_IB_CMA_H + +#include <linux/tracepoint.h> +#include <rdma/ib_cm.h> +#include <trace/events/rdma.h> + +/* + * enum ib_cm_state, from include/rdma/ib_cm.h + */ +#define IB_CM_STATE_LIST \ + ib_cm_state(IDLE) \ + ib_cm_state(LISTEN) \ + ib_cm_state(REQ_SENT) \ + ib_cm_state(REQ_RCVD) \ + ib_cm_state(MRA_REQ_SENT) \ + ib_cm_state(MRA_REQ_RCVD) \ + ib_cm_state(REP_SENT) \ + ib_cm_state(REP_RCVD) \ + ib_cm_state(MRA_REP_SENT) \ + ib_cm_state(MRA_REP_RCVD) \ + ib_cm_state(ESTABLISHED) \ + ib_cm_state(DREQ_SENT) \ + ib_cm_state(DREQ_RCVD) \ + ib_cm_state(TIMEWAIT) \ + ib_cm_state(SIDR_REQ_SENT) \ + ib_cm_state_end(SIDR_REQ_RCVD) + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_STATE_LIST + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) { IB_CM_##x, #x }, +#define ib_cm_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_state(x) \ + __print_symbolic(x, IB_CM_STATE_LIST) + +/* + * enum ib_cm_lap_state, from include/rdma/ib_cm.h + */ +#define IB_CM_LAP_STATE_LIST \ + ib_cm_lap_state(LAP_UNINIT) \ + ib_cm_lap_state(LAP_IDLE) \ + ib_cm_lap_state(LAP_SENT) \ + ib_cm_lap_state(LAP_RCVD) \ + ib_cm_lap_state(MRA_LAP_SENT) \ + ib_cm_lap_state_end(MRA_LAP_RCVD) + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_LAP_STATE_LIST + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_lap_state(x) \ + __print_symbolic(x, IB_CM_LAP_STATE_LIST) + +/* + * enum ib_cm_rej_reason, from include/rdma/ib_cm.h + */ +#define IB_CM_REJ_REASON_LIST \ + ib_cm_rej_reason(REJ_NO_QP) \ + ib_cm_rej_reason(REJ_NO_EEC) \ + ib_cm_rej_reason(REJ_NO_RESOURCES) \ + ib_cm_rej_reason(REJ_TIMEOUT) \ + ib_cm_rej_reason(REJ_UNSUPPORTED) \ + ib_cm_rej_reason(REJ_INVALID_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_COMM_INSTANCE) \ + ib_cm_rej_reason(REJ_INVALID_SERVICE_ID) \ + ib_cm_rej_reason(REJ_INVALID_TRANSPORT_TYPE) \ + ib_cm_rej_reason(REJ_STALE_CONN) \ + ib_cm_rej_reason(REJ_RDC_NOT_EXIST) \ + ib_cm_rej_reason(REJ_INVALID_GID) \ + ib_cm_rej_reason(REJ_INVALID_LID) \ + ib_cm_rej_reason(REJ_INVALID_SL) \ + ib_cm_rej_reason(REJ_INVALID_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_PACKET_RATE) \ + ib_cm_rej_reason(REJ_INVALID_ALT_GID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_LID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_SL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_ALT_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_ALT_PACKET_RATE) \ + ib_cm_rej_reason(REJ_PORT_CM_REDIRECT) \ + 
ib_cm_rej_reason(REJ_PORT_REDIRECT) \ + ib_cm_rej_reason(REJ_INVALID_MTU) \ + ib_cm_rej_reason(REJ_INSUFFICIENT_RESP_RESOURCES) \ + ib_cm_rej_reason(REJ_CONSUMER_DEFINED) \ + ib_cm_rej_reason(REJ_INVALID_RNR_RETRY) \ + ib_cm_rej_reason(REJ_DUPLICATE_LOCAL_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_CLASS_VERSION) \ + ib_cm_rej_reason(REJ_INVALID_FLOW_LABEL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_FLOW_LABEL) \ + ib_cm_rej_reason_end(REJ_VENDOR_OPTION_NOT_SUPPORTED) + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_rej_reason_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_REJ_REASON_LIST + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) { IB_CM_##x, #x }, +#define ib_cm_rej_reason_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_rej_reason(x) \ + __print_symbolic(x, IB_CM_REJ_REASON_LIST) + +DECLARE_EVENT_CLASS(icm_id_class, + TP_PROTO( + const struct ib_cm_id *cm_id + ), + + TP_ARGS(cm_id), + + TP_STRUCT__entry( + __field(const void *, cm_id) /* for eBPF scripts */ + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + __field(unsigned long, lap_state) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->lap_state = cm_id->lap_state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_lap_state(__entry->lap_state) + ) +); + +#define DEFINE_CM_SEND_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_send_##name, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_SEND_EVENT(req); +DEFINE_CM_SEND_EVENT(rep); +DEFINE_CM_SEND_EVENT(dup_req); +DEFINE_CM_SEND_EVENT(dup_rep); +DEFINE_CM_SEND_EVENT(rtu); +DEFINE_CM_SEND_EVENT(mra); +DEFINE_CM_SEND_EVENT(sidr_req); +DEFINE_CM_SEND_EVENT(sidr_rep); +DEFINE_CM_SEND_EVENT(dreq); +DEFINE_CM_SEND_EVENT(drep); + +TRACE_EVENT(icm_send_rej, + TP_PROTO( + const struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason + ), + + TP_ARGS(cm_id, reason), + + TP_STRUCT__entry( + __field(const void *, cm_id) + __field(u32, local_id) + __field(u32, remote_id) + __field(unsigned long, state) + __field(unsigned long, reason) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->reason = reason; + ), + + TP_printk("local_id=%u remote_id=%u state=%s reason=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_rej_reason(__entry->reason) + ) +); + +#define DEFINE_CM_ERR_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_##name##_err, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_ERR_EVENT(send_cm_rtu); +DEFINE_CM_ERR_EVENT(establish); +DEFINE_CM_ERR_EVENT(no_listener); +DEFINE_CM_ERR_EVENT(send_drep); +DEFINE_CM_ERR_EVENT(dreq_unknown); +DEFINE_CM_ERR_EVENT(send_unknown_rej); +DEFINE_CM_ERR_EVENT(rej_unknown); +DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(mra_unknown); +DEFINE_CM_ERR_EVENT(qp_init); +DEFINE_CM_ERR_EVENT(qp_rtr); +DEFINE_CM_ERR_EVENT(qp_rts); + +DEFINE_EVENT(icm_id_class, \ + icm_dreq_skipped, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id) \ +); + 
+DECLARE_EVENT_CLASS(icm_local_class, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id + ), + + TP_ARGS(local_id, remote_id), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + ), + + TP_printk("local_id=%u remote_id=%u", + __entry->local_id, __entry->remote_id + ) +); + +#define DEFINE_CM_LOCAL_EVENT(name) \ + DEFINE_EVENT(icm_local_class, \ + icm_##name, \ + TP_PROTO( \ + unsigned int local_id, \ + unsigned int remote_id \ + ), \ + TP_ARGS(local_id, remote_id)) + +DEFINE_CM_LOCAL_EVENT(issue_rej); +DEFINE_CM_LOCAL_EVENT(issue_drep); +DEFINE_CM_LOCAL_EVENT(staleconn_err); +DEFINE_CM_LOCAL_EVENT(no_priv_err); + +DECLARE_EVENT_CLASS(icm_remote_class, + TP_PROTO( + u32 remote_id + ), + + TP_ARGS(remote_id), + + TP_STRUCT__entry( + __field(u32, remote_id) + ), + + TP_fast_assign( + __entry->remote_id = remote_id; + ), + + TP_printk("remote_id=%u", + __entry->remote_id + ) +); + +#define DEFINE_CM_REMOTE_EVENT(name) \ + DEFINE_EVENT(icm_remote_class, \ + icm_##name, \ + TP_PROTO( \ + u32 remote_id \ + ), \ + TP_ARGS(remote_id)) + +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); +DEFINE_CM_REMOTE_EVENT(insert_failed_err); + +TRACE_EVENT(icm_send_rep_err, + TP_PROTO( + __be32 local_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = be32_to_cpu(local_id); + __entry->state = state; + ), + + TP_printk("local_id=%u state=%s", + __entry->local_id, show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_rep_unknown_err, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, remote_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + __entry->state = state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_handler_err, + TP_PROTO( + enum ib_cm_event_type event + ), + + TP_ARGS(event), + + TP_STRUCT__entry( + __field(unsigned long, event) + ), + + TP_fast_assign( + __entry->event = event; + ), + + TP_printk("unhandled event=%s", + rdma_show_ib_cm_event(__entry->event) + ) +); + +TRACE_EVENT(icm_mad_send_err, + TP_PROTO( + enum ib_cm_state state, + enum ib_wc_status wc_status + ), + + TP_ARGS(state, wc_status), + + TP_STRUCT__entry( + __field(unsigned long, state) + __field(unsigned long, wc_status) + ), + + TP_fast_assign( + __entry->state = state; + __entry->wc_status = wc_status; + ), + + TP_printk("state=%s completion status=%s", + show_ib_cm_state(__entry->state), + rdma_show_wc_status(__entry->wc_status) + ) +); + +#endif /* _TRACE_IB_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/infiniband/core +#define TRACE_INCLUDE_FILE cm_trace + +#include <trace/define_trace.h> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5888311b2119..7c2ab1f2fbea 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -68,6 +68,9 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; +static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr 
*addr, + union ib_gid *mgid); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -301,6 +304,10 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; + if (default_gid_type == IB_GID_TYPE_IB && + rdma_protocol_roce_eth_encap(cma_dev->device, port)) + default_gid_type = IB_GID_TYPE_ROCE; + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) @@ -345,13 +352,10 @@ struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) struct cma_multicast { struct rdma_id_private *id_priv; - union { - struct ib_sa_multicast *ib; - } multicast; + struct ib_sa_multicast *sa_mc; struct list_head list; void *context; struct sockaddr_storage addr; - struct kref mcref; u8 join_state; }; @@ -363,18 +367,6 @@ struct cma_work { struct rdma_cm_event event; }; -struct cma_ndev_work { - struct work_struct work; - struct rdma_id_private *id; - struct rdma_cm_event event; -}; - -struct iboe_mcast_work { - struct work_struct work; - struct rdma_id_private *id; - struct cma_multicast *mc; -}; - union cma_ip_addr { struct in6_addr ip6; struct { @@ -404,23 +396,21 @@ struct cma_req_info { u16 pkey; }; -static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&id_priv->lock, flags); - ret = (id_priv->state == comp); - spin_unlock_irqrestore(&id_priv->lock, flags); - return ret; -} - static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; + /* + * The FSM uses a funny double locking where state is protected by both + * the handler_mutex and the spinlock. State is not allowed to change + * away from a handler_mutex protected value without also holding + * handler_mutex. 
+ */ + if (comp == RDMA_CM_CONNECT) + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; @@ -467,10 +457,8 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); - if (id_priv->res.kern_name) - rdma_restrack_kadd(&id_priv->res); - else - rdma_restrack_uadd(&id_priv->res); + rdma_restrack_add(&id_priv->res); + trace_cm_id_attach(id_priv, cma_dev->device); } @@ -483,14 +471,6 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, rdma_start_port(cma_dev->device)]; } -static inline void release_mc(struct kref *kref) -{ - struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); - - kfree(mc->multicast.ib); - kfree(mc); -} - static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); @@ -844,10 +824,10 @@ static void cma_id_put(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -struct rdma_cm_id *__rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_ucm_port_space ps, - enum ib_qp_type qp_type, const char *caller) +static struct rdma_id_private * +__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; @@ -855,8 +835,6 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - rdma_restrack_set_task(&id_priv->res, caller); - id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; @@ -876,9 +854,45 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; - return &id_priv->id; + rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); + if (parent) + rdma_restrack_parent_name(&id_priv->res, &parent->res); + + return id_priv; +} + +struct rdma_cm_id * +__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, caller); + return &ret->id; } -EXPORT_SYMBOL(__rdma_create_id); +EXPORT_SYMBOL(__rdma_create_kernel_id); + +struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, + void *context, + enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, + ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, NULL); + return &ret->id; +} +EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -1783,19 +1797,30 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static void cma_leave_roce_mc_group(struct rdma_id_private *id_priv, - struct cma_multicast *mc) +static void destroy_mc(struct rdma_id_private *id_priv, + struct cma_multicast *mc) { - struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - 
struct net_device *ndev = NULL; + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) + ib_sa_free_multicast(mc->sa_mc); - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); - dev_put(ndev); + if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev) { + union ib_gid mgid; + + cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, + &mgid); + cma_igmp_send(ndev, &mgid, false); + dev_put(ndev); + } } - kref_put(&mc->mcref, release_mc); + kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -1803,16 +1828,10 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { - mc = container_of(id_priv->mc_list.next, - struct cma_multicast, list); + mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, + list); list_del(&mc->list); - if (rdma_cap_ib_mcast(id_priv->cma_dev->device, - id_priv->id.port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else { - cma_leave_roce_mc_group(id_priv, mc); - } + destroy_mc(id_priv, mc); } } @@ -1821,7 +1840,6 @@ static void _destroy_id(struct rdma_id_private *id_priv, { cma_cancel_operation(id_priv, state); - rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -1847,6 +1865,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); put_net(id_priv->id.route.addr.dev_addr.net); + rdma_restrack_del(&id_priv->res); kfree(id_priv); } @@ -1949,13 +1968,15 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; + enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); + state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_CONNECT) || + state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_DISCONNECT)) + state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { @@ -1965,7 +1986,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - if (cma_comp(id_priv, RDMA_CM_CONNECT) && + if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); @@ -2043,14 +2064,15 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(listen_id->route.addr.dev_addr.net, - listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, + ib_event->param.req_rcvd.qp_type, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, 
listen_id, ib_event, ss_family, service_id)) @@ -2104,13 +2126,13 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(net, listen_id->event_handler, + listen_id->context, listen_id->ps, IB_QPT_UD, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, @@ -2184,7 +2206,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) { + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } @@ -2226,8 +2248,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, goto net_dev_put; } - if (cma_comp(conn_id, RDMA_CM_CONNECT) && - (conn_id->id.qp_type != IB_QPT_UD)) { + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } @@ -2288,7 +2310,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { @@ -2346,7 +2368,6 @@ out: static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { - struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; @@ -2362,20 +2383,18 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, - listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC, - listen_id->res.kern_name); - if (IS_ERR(new_cm_id)) { + conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, RDMA_PS_TCP, + IB_QPT_RC, listen_id); + if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } - conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; @@ -2480,7 +2499,6 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { struct rdma_id_private *dev_id_priv; - struct rdma_cm_id *id; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; @@ -2489,13 +2507,12 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; - id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type, id_priv->res.kern_name); - if (IS_ERR(id)) + dev_id_priv = + __rdma_create_id(net, cma_listen_handler, id_priv, + id_priv->id.ps, id_priv->id.qp_type, id_priv); + if (IS_ERR(dev_id_priv)) return; - dev_id_priv = 
container_of(id, struct rdma_id_private, id); - dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); @@ -2508,7 +2525,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; - ret = rdma_listen(id, id_priv->backlog); + ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) dev_warn(&cma_dev->device->dev, "RDMA CMA: cma_listen_on_dev, error %d\n", ret); @@ -2647,32 +2664,14 @@ static void cma_work_handler(struct work_struct *_work) struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); - if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; - - if (cma_cm_event_handler(id_priv, &work->event)) { - cma_id_put(id_priv); - destroy_id_handler_unlock(id_priv); - goto out_free; + if (work->old_state != 0 || work->new_state != 0) { + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out_unlock; } -out_unlock: - mutex_unlock(&id_priv->handler_mutex); - cma_id_put(id_priv); -out_free: - kfree(work); -} - -static void cma_ndev_work_handler(struct work_struct *_work) -{ - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); - struct rdma_id_private *id_priv = work->id; - - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out_unlock; - if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); @@ -2683,6 +2682,8 @@ out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: + if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } @@ -3240,32 +3241,54 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return rdma_bind_addr(id, src_addr); } -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, unsigned long timeout_ms) +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. 
+ */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) { - struct rdma_id_private *id_priv; int ret; - id_priv = container_of(id, struct rdma_id_private, id); memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (id_priv->state == RDMA_CM_IDLE) { - ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) { - memset(cma_dst_addr(id_priv), 0, - rdma_addr_size(dst_addr)); - return ret; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + goto err_dst; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) { + ret = -EINVAL; + goto err_dst; } } if (cma_family(id_priv) != dst_addr->sa_family) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; + ret = -EINVAL; + goto err_state; } + return 0; - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; - } +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); +err_dst: + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); @@ -3297,7 +3320,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (reuse || id_priv->state == RDMA_CM_IDLE) { + if ((reuse && id_priv->state != RDMA_CM_LISTEN) || + id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -3491,8 +3515,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list, if (id_priv == cur_id) continue; - if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && - cur_id->reuseaddr) + if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); @@ -3533,18 +3556,6 @@ static int cma_use_port(enum rdma_ucm_port_space ps, return ret; } -static int cma_bind_listen(struct rdma_id_private *id_priv) -{ - struct rdma_bind_list *bind_list = id_priv->bind_list; - int ret = 0; - - mutex_lock(&lock); - if (bind_list->owners.first->next) - ret = cma_check_port(bind_list, id_priv, 0); - mutex_unlock(&lock); - return ret; -} - static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { @@ -3638,22 +3649,31 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, int rdma_listen(struct rdma_cm_id *id, int backlog) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ id->route.addr.src_addr.ss_family = AF_INET; ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN))) + return -EINVAL; } - if (!cma_comp_exch(id_priv, 
RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) - return -EINVAL; - + /* + * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable + * any more, and has to be unique in the bind list. + */ if (id_priv->reuseaddr) { - ret = cma_bind_listen(id_priv); + mutex_lock(&lock); + ret = cma_check_port(id_priv->bind_list, id_priv, 0); + if (!ret) + id_priv->reuseaddr = 0; + mutex_unlock(&lock); if (ret) goto err; } @@ -3678,6 +3698,10 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) return 0; err: id_priv->backlog = 0; + /* + * All the failure paths that lead here will not allow the req_handler's + * to have run. + */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } @@ -3732,7 +3756,6 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) cma_release_dev(id_priv); err1: @@ -3781,7 +3804,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, int ret; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { @@ -4017,12 +4040,15 @@ out: int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) - return -EINVAL; + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) { + ret = -EINVAL; + goto err_unlock; + } if (!id->qp) { id_priv->qp_num = conn_param->qp_num; @@ -4039,11 +4065,13 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) else ret = -ENOSYS; if (ret) - goto err; - + goto err_state; + mutex_unlock(&id_priv->handler_mutex); return 0; -err: +err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); +err_unlock: + mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); @@ -4155,17 +4183,33 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } -int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, - const char *caller) +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + * + * This function is for use by kernel ULPs and must be called from under the + * handler callback. 
+ */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - - rdma_restrack_set_task(&id_priv->res, caller); + lockdep_assert_held(&id_priv->handler_mutex); - if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { @@ -4203,10 +4247,10 @@ reject: rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } -EXPORT_SYMBOL(__rdma_accept); +EXPORT_SYMBOL(rdma_accept); -int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, - const char *caller, struct rdma_ucm_ece *ece) +int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); @@ -4214,9 +4258,27 @@ int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; - return __rdma_accept(id, conn_param, caller); + return rdma_accept(id, conn_param); +} +EXPORT_SYMBOL(rdma_accept_ece); + +void rdma_lock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); } -EXPORT_SYMBOL(__rdma_accept_ece); +EXPORT_SYMBOL(rdma_lock_handler); + +void rdma_unlock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_unlock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { @@ -4299,63 +4361,66 @@ out: } EXPORT_SYMBOL(rdma_disconnect); -static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, + struct ib_sa_multicast *multicast, + struct rdma_cm_event *event, + struct cma_multicast *mc) { - struct rdma_id_private *id_priv; - struct cma_multicast *mc = multicast->context; - struct rdma_cm_event event = {}; - int ret = 0; - - id_priv = mc->id_priv; - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_ADDR_BOUND && - id_priv->state != RDMA_CM_ADDR_RESOLVED) - goto out; + struct rdma_dev_addr *dev_addr; + enum ib_gid_type gid_type; + struct net_device *ndev; if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); else pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); - mutex_lock(&id_priv->qp_mutex); - if (!status && id_priv->id.qp) { - status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, - be16_to_cpu(multicast->rec.mlid)); - if (status) - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. 
status %d\n", - status); + + event->status = status; + event->param.ud.private_data = mc->context; + if (status) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + return; } - mutex_unlock(&id_priv->qp_mutex); - event.status = status; - event.param.ud.private_data = mc->context; - if (!status) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = - dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - enum ib_gid_type gid_type = - id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; - - event.event = RDMA_CM_EVENT_MULTICAST_JOIN; - ret = ib_init_ah_from_mcmember(id_priv->id.device, - id_priv->id.port_num, - &multicast->rec, - ndev, gid_type, - &event.param.ud.ah_attr); - if (ret) - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + dev_addr = &id_priv->id.route.addr.dev_addr; + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + gid_type = + id_priv->cma_dev + ->default_gid_type[id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + + event->event = RDMA_CM_EVENT_MULTICAST_JOIN; + if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, + &multicast->rec, ndev, gid_type, + &event->param.ud.ah_attr)) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + goto out; + } - event.param.ud.qp_num = 0xFFFFFF; - event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - if (ndev) - dev_put(ndev); - } else - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + event->param.ud.qp_num = 0xFFFFFF; + event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - ret = cma_cm_event_handler(id_priv, &event); +out: + if (ndev) + dev_put(ndev); +} +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct cma_multicast *mc = multicast->context; + struct rdma_id_private *id_priv = mc->id_priv; + struct rdma_cm_event event = {}; + int ret = 0; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || + READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) + goto out; + + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { destroy_id_handler_unlock(id_priv); @@ -4445,23 +4510,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; - mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, - id_priv->id.port_num, &rec, - comp_mask, GFP_KERNEL, - cma_ib_mc_handler, mc); - return PTR_ERR_OR_ZERO(mc->multicast.ib); -} - -static void iboe_mcast_work_handler(struct work_struct *work) -{ - struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); - struct cma_multicast *mc = mw->mc; - struct ib_sa_multicast *m = mc->multicast.ib; - - mc->multicast.ib->context = mc; - cma_ib_mc_handler(0, m); - kref_put(&mc->mcref, release_mc); - kfree(mw); + mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, comp_mask, + GFP_KERNEL, cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, @@ -4496,52 +4548,47 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { - struct iboe_mcast_work *work; + struct cma_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; 
int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + struct ib_sa_multicast ib; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (cma_zero_addr((struct sockaddr *)&mc->addr)) + if (cma_zero_addr(addr)) return -EINVAL; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; - mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); - if (!mc->multicast.ib) { - err = -ENOMEM; - goto out1; - } - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); + cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); - mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + ib.rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) - mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; - goto out2; + goto err_free; } - mc->multicast.ib->rec.rate = iboe_get_rate(ndev); - mc->multicast.ib->rec.hop_limit = 1; - mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + ib.rec.rate = iboe_get_rate(ndev); + ib.rec.hop_limit = 1; + ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } @@ -4550,24 +4597,22 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENOTSUPP; } dev_put(ndev); - if (err || !mc->multicast.ib->rec.mtu) { + if (err || !ib.rec.mtu) { if (!err) err = -EINVAL; - goto out2; + goto err_free; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &mc->multicast.ib->rec.port_gid); + &ib.rec.port_gid); work->id = id_priv; - work->mc = mc; - INIT_WORK(&work->work, iboe_mcast_work_handler); - kref_get(&mc->mcref); + INIT_WORK(&work->work, cma_work_handler); + cma_make_mc_event(0, id_priv, &ib, &work->event, mc); + /* Balances with cma_id_put() in cma_work_handler */ + cma_id_get(id_priv); queue_work(cma_wq, &work->work); - return 0; -out2: - kfree(mc->multicast.ib); -out1: +err_free: kfree(work); return err; } @@ -4575,19 +4620,21 @@ out1: int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; - if (!id->device) + /* Not supported for kernel QPs */ + if (WARN_ON(id->qp)) return -EINVAL; - id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && - !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + /* ULP is calling this wrong. 
*/ + if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && + READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; - mc = kmalloc(sizeof *mc, GFP_KERNEL); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; @@ -4597,7 +4644,6 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { - kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; @@ -4629,25 +4675,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - - if (id->qp) - ib_detach_mcast(id->qp, - &mc->multicast.ib->rec.mgid, - be16_to_cpu(mc->multicast.ib->rec.mlid)); - - BUG_ON(id_priv->cma_dev->device != id->device); - - if (rdma_cap_ib_mcast(id->device, id->port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) { - cma_leave_roce_mc_group(id_priv, mc); - } - return; - } + if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) + continue; + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + WARN_ON(id_priv->cma_dev->device != id->device); + destroy_mc(id_priv, mc); + return; } spin_unlock_irq(&id_priv->lock); } @@ -4656,7 +4691,7 @@ EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; + struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; @@ -4669,7 +4704,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if (!work) return -ENOMEM; - INIT_WORK(&work->work, cma_ndev_work_handler); + INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 3c1e2ca564fe..7ec4af2ed87a 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -123,16 +123,17 @@ static ssize_t default_roce_mode_store(struct config_item *item, { struct cma_device *cma_dev; struct cma_dev_port_group *group; - int gid_type = ib_cache_gid_parse_type_str(buf); + int gid_type; ssize_t ret; - if (gid_type < 0) - return -EINVAL; - ret = cma_configfs_params_get(item, &cma_dev, &group); if (ret) return ret; + gid_type = ib_cache_gid_parse_type_str(buf); + if (gid_type < 0) + return -EINVAL; + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); cma_configfs_params_put(cma_dev); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index e6e20c36c538..e45264267bcc 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -17,46 +17,6 @@ #include <linux/tracepoint.h> #include <trace/events/rdma.h> -/* - * enum ib_cm_event_type, from include/rdma/ib_cm.h - */ -#define IB_CM_EVENT_LIST \ - ib_cm_event(REQ_ERROR) \ - ib_cm_event(REQ_RECEIVED) \ - ib_cm_event(REP_ERROR) \ - ib_cm_event(REP_RECEIVED) \ - ib_cm_event(RTU_RECEIVED) \ - ib_cm_event(USER_ESTABLISHED) \ - ib_cm_event(DREQ_ERROR) \ - ib_cm_event(DREQ_RECEIVED) \ - ib_cm_event(DREP_RECEIVED) \ - ib_cm_event(TIMEWAIT_EXIT) \ - 
ib_cm_event(MRA_RECEIVED) \ - ib_cm_event(REJ_RECEIVED) \ - ib_cm_event(LAP_ERROR) \ - ib_cm_event(LAP_RECEIVED) \ - ib_cm_event(APR_RECEIVED) \ - ib_cm_event(SIDR_REQ_ERROR) \ - ib_cm_event(SIDR_REQ_RECEIVED) \ - ib_cm_event_end(SIDR_REP_RECEIVED) - -#undef ib_cm_event -#undef ib_cm_event_end - -#define ib_cm_event(x) TRACE_DEFINE_ENUM(IB_CM_##x); -#define ib_cm_event_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); - -IB_CM_EVENT_LIST - -#undef ib_cm_event -#undef ib_cm_event_end - -#define ib_cm_event(x) { IB_CM_##x, #x }, -#define ib_cm_event_end(x) { IB_CM_##x, #x } - -#define rdma_show_ib_cm_event(x) \ - __print_symbolic(x, IB_CM_EVENT_LIST) - DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a1e6a67b2c4a..e84b0fedaacb 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -44,6 +44,7 @@ #include <rdma/ib_mad.h> #include <rdma/restrack.h> #include "mad_priv.h" +#include "restrack.h" /* Total number of ports combined across all struct ib_devices's */ #define RDMA_MAX_PORTS 8192 @@ -352,6 +353,7 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); /* * We don't track XRC QPs for now, because they don't have PD * and more importantly they are created internaly by driver, @@ -359,14 +361,9 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, */ is_xrc = qp_type == IB_QPT_XRC_INI || qp_type == IB_QPT_XRC_TGT; if ((qp_type < IB_QPT_MAX && !is_xrc) || qp_type == IB_QPT_DRIVER) { - qp->res.type = RDMA_RESTRACK_QP; - if (uobj) - rdma_restrack_uadd(&qp->res); - else - rdma_restrack_kadd(&qp->res); - } else - qp->res.valid = false; - + rdma_restrack_parent_name(&qp->res, &pd->res); + rdma_restrack_add(&qp->res); + } return qp; } diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 636166880442..e4ff0d3328b6 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -80,8 +80,9 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, counter->device = dev; counter->port = port; - counter->res.type = RDMA_RESTRACK_COUNTER; - counter->stats = dev->ops.counter_alloc_stats(counter); + + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); + counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; @@ -107,6 +108,7 @@ err_mode: mutex_unlock(&port_counter->lock); kfree(counter->stats); err_stats: + rdma_restrack_put(&counter->res); kfree(counter); return NULL; } @@ -248,13 +250,8 @@ next: static void rdma_counter_res_add(struct rdma_counter *counter, struct ib_qp *qp) { - if (rdma_is_kernel_res(&qp->res)) { - rdma_restrack_set_task(&counter->res, qp->res.kern_name); - rdma_restrack_kadd(&counter->res); - } else { - rdma_restrack_attach_task(&counter->res, qp->res.task); - rdma_restrack_uadd(&counter->res); - } + rdma_restrack_parent_name(&counter->res, &qp->res); + rdma_restrack_add(&counter->res); } static void counter_release(struct kref *kref) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a92fc3f90bb5..12ebacf52958 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -197,24 +197,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * __ib_alloc_cq_user - allocate a completion queue + * __ib_alloc_cq allocate a completion 
queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. * @caller: module owner name. - * @udata: Valid user data or NULL for kernel object * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ -struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, - enum ib_poll_context poll_ctx, - const char *caller, struct ib_udata *udata) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, + int comp_vector, enum ib_poll_context poll_ctx, + const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -237,15 +235,13 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, if (!cq->wc) goto out_free_cq; - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); ret = dev->ops.create_cq(cq, &cq_attr, NULL); if (ret) goto out_free_wc; - rdma_restrack_kadd(&cq->res); - rdma_dim_init(cq); switch (cq->poll_ctx) { @@ -271,21 +267,22 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, goto out_destroy_cq; } + rdma_restrack_add(&cq->res); trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); return cq; out_destroy_cq: rdma_dim_destroy(cq); - rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); + cq->device->ops.destroy_cq(cq, NULL); out_free_wc: + rdma_restrack_put(&cq->res); kfree(cq->wc); out_free_cq: kfree(cq); trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); return ERR_PTR(ret); } -EXPORT_SYMBOL(__ib_alloc_cq_user); +EXPORT_SYMBOL(__ib_alloc_cq); /** * __ib_alloc_cq_any - allocate a completion queue @@ -310,18 +307,19 @@ struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, atomic_inc_return(&counter) % min_t(int, dev->num_comp_vectors, num_online_cpus()); - return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx, - caller, NULL); + return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, + caller); } EXPORT_SYMBOL(__ib_alloc_cq_any); /** - * ib_free_cq_user - free a completion queue + * ib_free_cq - free a completion queue * @cq: completion queue to free. 
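/*
 * Kernel-ULP sketch of the de-udata'd interface (not part of the patch):
 * callers keep using the ib_alloc_cq()/ib_alloc_cq_any() wrappers, which now
 * resolve to __ib_alloc_cq() with no udata argument. "dev" and "priv" are
 * assumptions for the example.
 */
	struct ib_cq *cq;

	cq = ib_alloc_cq(dev, priv, 128, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

	/* ... post WRs with wr->wr_cqe set, as the kernel-doc requires ... */

	ib_free_cq(cq);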
- * @udata: User data or NULL for kernel object */ -void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) +void ib_free_cq(struct ib_cq *cq) { + int ret; + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; if (WARN_ON_ONCE(cq->cqe_used)) @@ -343,12 +341,13 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) rdma_dim_destroy(cq); trace_cq_free(cq); + ret = cq->device->ops.destroy_cq(cq, NULL); + WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); kfree(cq->wc); kfree(cq); } -EXPORT_SYMBOL(ib_free_cq_user); +EXPORT_SYMBOL(ib_free_cq); void ib_cq_pool_init(struct ib_device *dev) { diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 23ee65a9185f..a3b1fc84cdca 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1177,58 +1177,23 @@ out: return ret; } -static void setup_dma_device(struct ib_device *device) +static void setup_dma_device(struct ib_device *device, + struct device *dma_device) { - struct device *parent = device->dev.parent; - - WARN_ON_ONCE(device->dma_device); - -#ifdef CONFIG_DMA_OPS - if (device->dev.dma_ops) { - /* - * The caller provided custom DMA operations. Copy the - * DMA-related fields that are used by e.g. dma_alloc_coherent() - * into device->dev. - */ - device->dma_device = &device->dev; - if (!device->dev.dma_mask) { - if (parent) - device->dev.dma_mask = parent->dma_mask; - else - WARN_ON_ONCE(true); - } - if (!device->dev.coherent_dma_mask) { - if (parent) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; - else - WARN_ON_ONCE(true); - } - } else -#endif /* CONFIG_DMA_OPS */ - { - /* - * The caller did not provide custom DMA operations. Use the - * DMA mapping operations of the parent device. - */ - WARN_ON_ONCE(!parent); - device->dma_device = parent; - } - - if (!device->dev.dma_parms) { - if (parent) { - /* - * The caller did not provide DMA parameters, so - * 'parent' probably represents a PCI device. The PCI - * core sets the maximum segment size to 64 - * KB. Increase this parameter to 2 GB. - */ - device->dev.dma_parms = parent->dma_parms; - dma_set_max_seg_size(device->dma_device, SZ_2G); - } else { - WARN_ON_ONCE(true); - } + /* + * If the caller does not provide a DMA capable device then the IB + * device will be used. In this case the caller should fully setup the + * ibdev for DMA. This usually means using dma_virt_ops. + */ +#ifdef CONFIG_DMA_VIRT_OPS + if (!dma_device) { + device->dev.dma_ops = &dma_virt_ops; + dma_device = &device->dev; } +#endif + WARN_ON(!dma_device); + device->dma_device = dma_device; + WARN_ON(!device->dma_device->dma_parms); } /* @@ -1241,7 +1206,6 @@ static int setup_device(struct ib_device *device) struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; - setup_dma_device(device); ib_device_check_mandatory(device); ret = setup_port_data(device); @@ -1354,7 +1318,10 @@ static void prevent_dealloc_device(struct ib_device *ib_dev) * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will - * cause a unique index to be added to the passed device name. + * cause a unique index to be added to the passed device name. + * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB + * device will be used. In this case the caller should fully + * setup the ibdev for DMA. This usually means using dma_virt_ops. 
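/*
 * Registration sketch under the new signature (not part of the patch): a
 * driver for a physical device passes its DMA-capable parent, e.g. a PCI
 * device, while a software device passes NULL so the ibdev itself is used
 * for DMA (dma_virt_ops, as in setup_dma_device() above). "hypo%d" and
 * "pdev" are illustrative assumptions.
 */
	ret = ib_register_device(ibdev, "hypo%d", &pdev->dev);
	if (ret)
		goto err_dealloc_device;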
* * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a @@ -1365,7 +1332,8 @@ static void prevent_dealloc_device(struct ib_device *ib_dev) * asynchronously then the device pointer may become freed as soon as this * function returns. */ -int ib_register_device(struct ib_device *device, const char *name) +int ib_register_device(struct ib_device *device, const char *name, + struct device *dma_device) { int ret; @@ -1373,6 +1341,7 @@ int ib_register_device(struct ib_device *device, const char *name) if (ret) return ret; + setup_dma_device(device, dma_device); ret = setup_device(device); if (ret) return ret; @@ -2697,7 +2666,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6d3ed7c6e19e..ffe11b03724c 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -130,17 +130,6 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, lockdep_assert_held(&ufile->hw_destroy_rwsem); assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); - if (reason == RDMA_REMOVE_ABORT_HWOBJ) { - reason = RDMA_REMOVE_ABORT; - ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, - attrs); - /* - * Drivers are not permitted to ignore RDMA_REMOVE_ABORT, see - * ib_is_destroy_retryable, cleanup_retryable == false here. - */ - WARN_ON(ret); - } - if (reason == RDMA_REMOVE_ABORT) { WARN_ON(!list_empty(&uobj->list)); WARN_ON(!uobj->context); @@ -674,11 +663,22 @@ void rdma_alloc_abort_uobject(struct ib_uobject *uobj, bool hw_obj_valid) { struct ib_uverbs_file *ufile = uobj->ufile; + int ret; + + if (hw_obj_valid) { + ret = uobj->uapi_object->type_class->destroy_hw( + uobj, RDMA_REMOVE_ABORT, attrs); + /* + * If the driver couldn't destroy the object then go ahead and + * commit it. Leaking objects that can't be destroyed is only + * done during FD close after the driver has a few more tries to + * destroy it. + */ + if (WARN_ON(ret)) + return rdma_alloc_commit_uobject(uobj, attrs); + } - uverbs_destroy_uobject(uobj, - hw_obj_valid ? RDMA_REMOVE_ABORT_HWOBJ : - RDMA_REMOVE_ABORT, - attrs); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); @@ -889,14 +889,14 @@ void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, if (!ufile->ucontext) goto done; - ufile->ucontext->closing = true; ufile->ucontext->cleanup_retryable = true; while (!list_empty(&ufile->uobjects)) if (__uverbs_cleanup_ufile(ufile, reason)) { /* * No entry was cleaned-up successfully during this - * iteration + * iteration. It is a driver bug to fail destruction. 
*/ + WARN_ON(!list_empty(&ufile->uobjects)); break; } diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 62fbb0ae9cb4..4aeeaaed0f17 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -123,32 +123,6 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) } EXPORT_SYMBOL(rdma_restrack_count); -static void set_kern_name(struct rdma_restrack_entry *res) -{ - struct ib_pd *pd; - - switch (res->type) { - case RDMA_RESTRACK_QP: - pd = container_of(res, struct ib_qp, res)->pd; - if (!pd) { - WARN_ONCE(true, "XRC QPs are not supported\n"); - /* Survive, despite the programmer's error */ - res->kern_name = " "; - } - break; - case RDMA_RESTRACK_MR: - pd = container_of(res, struct ib_mr, res)->pd; - break; - default: - /* Other types set kern_name directly */ - pd = NULL; - break; - } - - if (pd) - res->kern_name = pd->res.kern_name; -} - static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) { switch (res->type) { @@ -173,36 +147,77 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) } } -void rdma_restrack_set_task(struct rdma_restrack_entry *res, - const char *caller) +/** + * rdma_restrack_attach_task() - attach the task onto this resource, + * valid for user space restrack entries. + * @res: resource entry + * @task: the task to attach + */ +static void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) { - if (caller) { - res->kern_name = caller; + if (WARN_ON_ONCE(!task)) return; - } if (res->task) put_task_struct(res->task); - get_task_struct(current); - res->task = current; + get_task_struct(task); + res->task = task; + res->user = true; } -EXPORT_SYMBOL(rdma_restrack_set_task); /** - * rdma_restrack_attach_task() - attach the task onto this resource + * rdma_restrack_set_name() - set the task for this resource * @res: resource entry - * @task: the task to attach, the current task will be used if it is NULL. + * @caller: kernel name, the current task will be used if the caller is NULL. */ -void rdma_restrack_attach_task(struct rdma_restrack_entry *res, - struct task_struct *task) +void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller) { - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; + if (caller) { + res->kern_name = caller; + return; + } + + rdma_restrack_attach_task(res, current); +} +EXPORT_SYMBOL(rdma_restrack_set_name); + +/** + * rdma_restrack_parent_name() - set the restrack name properties based + * on parent restrack + * @dst: destination resource entry + * @parent: parent resource entry + */ +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent) +{ + if (rdma_is_kernel_res(parent)) + dst->kern_name = parent->kern_name; + else + rdma_restrack_attach_task(dst, parent->task); +} +EXPORT_SYMBOL(rdma_restrack_parent_name); + +/** + * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface + * to release memory in fully automatic way. 
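/*
 * Sketch of the converged restrack lifecycle (mirrors the cq.c and
 * counters.c hunks above; "obj", "caller" and create_hw_object() are
 * illustrative assumptions):
 */
	rdma_restrack_new(&obj->res, RDMA_RESTRACK_CQ);
	rdma_restrack_set_name(&obj->res, caller);  /* NULL attaches current task */

	ret = create_hw_object(obj);                /* hypothetical driver step */
	if (ret) {
		/* entry never became visible, only drop the kref */
		rdma_restrack_put(&obj->res);
		return ret;
	}

	rdma_restrack_add(&obj->res);               /* now visible via nldev */
	/* ... and on teardown ... */
	rdma_restrack_del(&obj->res);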
+ * @res - Entry to initialize + * @type - REstrack type + */ +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type) +{ + kref_init(&res->kref); + init_completion(&res->comp); + res->type = type; } +EXPORT_SYMBOL(rdma_restrack_new); -static void rdma_restrack_add(struct rdma_restrack_entry *res) +/** + * rdma_restrack_add() - add object to the reource tracking database + * @res: resource entry + */ +void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); struct rdma_restrack_root *rt; @@ -213,8 +228,6 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) rt = &dev->res[res->type]; - kref_init(&res->kref); - init_completion(&res->comp); if (res->type == RDMA_RESTRACK_QP) { /* Special case to ensure that LQPN points to right QP */ struct ib_qp *qp = container_of(res, struct ib_qp, res); @@ -236,38 +249,7 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) if (!ret) res->valid = true; } - -/** - * rdma_restrack_kadd() - add kernel object to the reource tracking database - * @res: resource entry - */ -void rdma_restrack_kadd(struct rdma_restrack_entry *res) -{ - res->task = NULL; - set_kern_name(res); - res->user = false; - rdma_restrack_add(res); -} -EXPORT_SYMBOL(rdma_restrack_kadd); - -/** - * rdma_restrack_uadd() - add user object to the reource tracking database - * @res: resource entry - */ -void rdma_restrack_uadd(struct rdma_restrack_entry *res) -{ - if ((res->type != RDMA_RESTRACK_CM_ID) && - (res->type != RDMA_RESTRACK_COUNTER)) - res->task = NULL; - - if (!res->task) - rdma_restrack_set_task(res, NULL); - res->kern_name = NULL; - - res->user = true; - rdma_restrack_add(res); -} -EXPORT_SYMBOL(rdma_restrack_uadd); +EXPORT_SYMBOL(rdma_restrack_add); int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) { @@ -305,6 +287,10 @@ static void restrack_release(struct kref *kref) struct rdma_restrack_entry *res; res = container_of(kref, struct rdma_restrack_entry, kref); + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } complete(&res->comp); } @@ -314,14 +300,23 @@ int rdma_restrack_put(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_put); +/** + * rdma_restrack_del() - delete object from the reource tracking database + * @res: resource entry + */ void rdma_restrack_del(struct rdma_restrack_entry *res) { struct rdma_restrack_entry *old; struct rdma_restrack_root *rt; struct ib_device *dev; - if (!res->valid) - goto out; + if (!res->valid) { + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } + return; + } dev = res_to_dev(res); if (WARN_ON(!dev)) @@ -330,16 +325,12 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) rt = &dev->res[res->type]; old = xa_erase(&rt->xa, res->id); + if (res->type == RDMA_RESTRACK_MR || res->type == RDMA_RESTRACK_QP) + return; WARN_ON(old != res); res->valid = false; rdma_restrack_put(res); wait_for_completion(&res->comp); - -out: - if (res->task) { - put_task_struct(res->task); - res->task = NULL; - } } EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h index d084e5f89849..6a04fc41f738 100644 --- a/drivers/infiniband/core/restrack.h +++ b/drivers/infiniband/core/restrack.h @@ -25,6 +25,12 @@ struct rdma_restrack_root { int rdma_restrack_init(struct ib_device *dev); void rdma_restrack_clean(struct ib_device *dev); -void rdma_restrack_attach_task(struct rdma_restrack_entry *res, - struct task_struct *task); +void 
rdma_restrack_add(struct rdma_restrack_entry *res); +void rdma_restrack_del(struct rdma_restrack_entry *res); +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type); +void rdma_restrack_set_name(struct rdma_restrack_entry *res, + const char *caller); +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent); #endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index c11e50510e49..914cddea525d 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -59,7 +59,7 @@ struct ib_port { struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group *pkey_group; - struct attribute_group *pma_table; + const struct attribute_group *pma_table; struct attribute_group *hw_stats_ag; struct rdma_hw_stats *hw_stats; u8 port_num; @@ -387,7 +387,8 @@ static ssize_t _show_port_gid_attr( gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); if (IS_ERR(gid_attr)) - return PTR_ERR(gid_attr); + /* -EINVAL is returned for user space compatibility reasons. */ + return -EINVAL; ret = print(gid_attr, buf); rdma_put_gid_attr(gid_attr); @@ -653,17 +654,17 @@ static struct attribute *pma_attrs_noietf[] = { NULL }; -static struct attribute_group pma_group = { +static const struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; -static struct attribute_group pma_group_ext = { +static const struct attribute_group pma_group_ext = { .name = "counters", .attrs = pma_attrs_ext }; -static struct attribute_group pma_group_noietf = { +static const struct attribute_group pma_group_noietf = { .name = "counters", .attrs = pma_attrs_noietf }; @@ -778,8 +779,8 @@ err: * Figure out which counter table to use depending on * the device capabilities. 
*/ -static struct attribute_group *get_counter_table(struct ib_device *dev, - int port_num) +static const struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) { struct ib_class_port_info cpi; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 1d184ea05eba..ffe2563ad345 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -80,7 +80,6 @@ struct ucma_file { struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; - struct workqueue_struct *close_wq; }; struct ucma_context { @@ -88,7 +87,7 @@ struct ucma_context { struct completion comp; refcount_t ref; int events_reported; - int backlog; + atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; @@ -96,11 +95,6 @@ struct ucma_context { u64 uid; struct list_head list; - struct list_head mc_list; - /* mark that device is in process of destroying the internal HW - * resources, protected by the ctx_table lock - */ - int closing; /* sync between removal event and id destroy, protected by file mut */ int destroying; struct work_struct close_work; @@ -113,23 +107,22 @@ struct ucma_multicast { u64 uid; u8 join_state; - struct list_head list; struct sockaddr_storage addr; }; struct ucma_event { struct ucma_context *ctx; + struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; - struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; - struct work_struct close_work; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; +static int __destroy_id(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) @@ -139,7 +132,7 @@ static inline struct ucma_context *_ucma_find_context(int id, ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file || !ctx->cm_id) + else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } @@ -150,12 +143,9 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) { - if (ctx->closing) - ctx = ERR_PTR(-EIO); - else - refcount_inc(&ctx->ref); - } + if (!IS_ERR(ctx)) + if (!refcount_inc_not_zero(&ctx->ref)) + ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } @@ -183,14 +173,6 @@ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) return ctx; } -static void ucma_close_event_id(struct work_struct *work) -{ - struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); - - rdma_destroy_id(uevent_close->cm_id); - kfree(uevent_close); -} - static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); @@ -203,6 +185,14 @@ static void ucma_close_id(struct work_struct *work) wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); + + /* + * At this point ctx->ref is zero so the only place the ctx can be is in + * a uevent or in __destroy_id(). Since the former doesn't touch + * ctx->cm_id and the latter sync cancels this, there is no races with + * this store. 
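/*
 * The lookup idiom the ucma rework settles on (generic sketch, names are
 * illustrative): an object found in the XArray may only be used if a
 * reference can still be taken; otherwise it is already being destroyed.
 */
	xa_lock(&obj_table);
	obj = xa_load(&obj_table, id);
	if (obj && !refcount_inc_not_zero(&obj->ref))
		obj = NULL;
	xa_unlock(&obj_table);
	if (!obj)
		return ERR_PTR(-ENXIO);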
+ */ + ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) @@ -216,39 +206,23 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_WORK(&ctx->close_work, ucma_close_id); refcount_set(&ctx->ref, 1); init_completion(&ctx->comp); - INIT_LIST_HEAD(&ctx->mc_list); + /* So list_del() will work if we don't do ucma_finish_ctx() */ + INIT_LIST_HEAD(&ctx->list); ctx->file = file; mutex_init(&ctx->mutex); - if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&ctx->list, &file->ctx_list); + if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } return ctx; - -error: - kfree(ctx); - return NULL; } -static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +static void ucma_finish_ctx(struct ucma_context *ctx) { - struct ucma_multicast *mc; - - mc = kzalloc(sizeof(*mc), GFP_KERNEL); - if (!mc) - return NULL; - - mc->ctx = ctx; - if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&mc->list, &ctx->mc_list); - return mc; - -error: - kfree(mc); - return NULL; + lockdep_assert_held(&ctx->file->mut); + list_add_tail(&ctx->list, &ctx->file->ctx_list); + xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, @@ -280,10 +254,15 @@ static void ucma_copy_ud_event(struct ib_device *device, dst->qkey = src->qkey; } -static void ucma_set_event_context(struct ucma_context *ctx, - struct rdma_cm_event *event, - struct ucma_event *uevent) +static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, + struct rdma_cm_event *event) { + struct ucma_event *uevent; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return NULL; + uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: @@ -298,44 +277,56 @@ static void ucma_set_event_context(struct ucma_context *ctx, uevent->resp.id = ctx->id; break; } + uevent->resp.event = event->event; + uevent->resp.status = event->status; + if (ctx->cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + return uevent; } -/* Called with file->mut locked for the relevant context. */ -static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) { - struct ucma_context *ctx = cm_id->context; - struct ucma_event *con_req_eve; - int event_found = 0; + struct ucma_context *listen_ctx = cm_id->context; + struct ucma_context *ctx; + struct ucma_event *uevent; - if (ctx->destroying) - return; + if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) + return -ENOMEM; + ctx = ucma_alloc_ctx(listen_ctx->file); + if (!ctx) + goto err_backlog; + ctx->cm_id = cm_id; - /* only if context is pointing to cm_id that it owns it and can be - * queued to be closed, otherwise that cm_id is an inflight one that - * is part of that context event list pending to be detached and - * reattached to its new context as part of ucma_get_event, - * handled separately below. 
- */ - if (ctx->cm_id == cm_id) { - xa_lock(&ctx_table); - ctx->closing = 1; - xa_unlock(&ctx_table); - queue_work(ctx->file->close_wq, &ctx->close_work); - return; - } + uevent = ucma_create_uevent(listen_ctx, event); + if (!uevent) + goto err_alloc; + uevent->conn_req_ctx = ctx; + uevent->resp.id = ctx->id; - list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { - if (con_req_eve->cm_id == cm_id && - con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - list_del(&con_req_eve->list); - INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); - queue_work(ctx->file->close_wq, &con_req_eve->close_work); - event_found = 1; - break; - } - } - if (!event_found) - pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); + ctx->cm_id->context = ctx; + + mutex_lock(&ctx->file->mut); + ucma_finish_ctx(ctx); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + return 0; + +err_alloc: + xa_erase(&ctx_table, ctx->id); + kfree(ctx); +err_backlog: + atomic_inc(&listen_ctx->backlog); + /* Returning error causes the new ID to be destroyed */ + return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -343,66 +334,38 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; - int ret = 0; - - uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); - if (!uevent) - return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; - mutex_lock(&ctx->file->mut); - uevent->cm_id = cm_id; - ucma_set_event_context(ctx, event, uevent); - uevent->resp.event = event->event; - uevent->resp.status = event->status; - if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, - &event->param.ud); - else - ucma_copy_conn_event(&uevent->resp.param.conn, - &event->param.conn); - - uevent->resp.ece.vendor_id = event->ece.vendor_id; - uevent->resp.ece.attr_mod = event->ece.attr_mod; - - if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { - if (!ctx->backlog) { - ret = -ENOMEM; - kfree(uevent); - goto out; - } - ctx->backlog--; - } else if (!ctx->uid || ctx->cm_id != cm_id) { - /* - * We ignore events for new connections until userspace has set - * their context. This can only happen if an error occurs on a - * new connection before the user accepts it. This is okay, - * since the accept will just fail later. However, we do need - * to release the underlying HW resources in case of a device - * removal event. - */ - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + return ucma_connect_event_handler(cm_id, event); - kfree(uevent); - goto out; + /* + * We ignore events for new connections until userspace has set their + * context. This can only happen if an error occurs on a new connection + * before the user accepts it. This is okay, since the accept will just + * fail later. However, we do need to release the underlying HW + * resources in case of a device removal event. 
+ */ + if (ctx->uid) { + uevent = ucma_create_uevent(ctx, event); + if (!uevent) + return 0; + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); } - list_add_tail(&uevent->list, &ctx->file->event_list); - wake_up_interruptible(&ctx->file->poll_wait); - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); -out: - mutex_unlock(&ctx->file->mut); - return ret; + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL && !ctx->destroying) + queue_work(system_unbound_wq, &ctx->close_work); + return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ucma_context *ctx; struct rdma_ucm_get_event cmd; struct ucma_event *uevent; - int ret = 0; /* * Old 32 bit user space does not send the 4 byte padding in the @@ -429,35 +392,25 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, mutex_lock(&file->mut); } - uevent = list_entry(file->event_list.next, struct ucma_event, list); - - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - ctx = ucma_alloc_ctx(file); - if (!ctx) { - ret = -ENOMEM; - goto done; - } - uevent->ctx->backlog++; - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } + uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { - ret = -EFAULT; - goto done; + mutex_unlock(&file->mut); + return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; - kfree(uevent); -done: + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); - return ret; + + kfree(uevent); + return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) @@ -498,58 +451,60 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, if (ret) return ret; - mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(&file->mut); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; - cm_id = __rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type, NULL); + cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } + ctx->cm_id = cm_id; resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { - ret = -EFAULT; - goto err2; + xa_erase(&ctx_table, ctx->id); + __destroy_id(ctx); + return -EFAULT; } - ctx->cm_id = cm_id; + mutex_lock(&file->mut); + ucma_finish_ctx(ctx); + mutex_unlock(&file->mut); return 0; -err2: - rdma_destroy_id(cm_id); err1: xa_erase(&ctx_table, ctx->id); - mutex_lock(&file->mut); - list_del(&ctx->list); - mutex_unlock(&file->mut); kfree(ctx); return ret; } static void ucma_cleanup_multicast(struct ucma_context *ctx) { - struct ucma_multicast *mc, *tmp; + struct ucma_multicast *mc; + unsigned long index; - mutex_lock(&ctx->file->mut); - list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { - list_del(&mc->list); - xa_erase(&multicast_table, mc->id); + xa_for_each(&multicast_table, index, mc) { + if (mc->ctx != ctx) + continue; + /* + * At this point mc->ctx->ref is 0 so the mc cannot leave the + * lock on the reader and this is enough serialization + */ + xa_erase(&multicast_table, 
index); kfree(mc); } - mutex_unlock(&ctx->file->mut); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; + rdma_lock_handler(mc->ctx->cm_id); + mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; @@ -557,6 +512,8 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) list_del(&uevent->list); kfree(uevent); } + mutex_unlock(&mc->ctx->file->mut); + rdma_unlock_handler(mc->ctx->cm_id); } /* @@ -564,10 +521,6 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) * this point, no new events will be reported from the hardware. However, we * still need to cleanup the UCMA context for this ID. Specifically, there * might be events that have not yet been consumed by the user space software. - * These might include pending connect requests which we have not completed - * processing. We cannot call rdma_destroy_id while holding the lock of the - * context (file->mut), as it might cause a deadlock. We therefore extract all - * relevant events from the context pending events list while holding the * mutex. After that we release them as needed. */ static int ucma_free_ctx(struct ucma_context *ctx) @@ -576,31 +529,57 @@ static int ucma_free_ctx(struct ucma_context *ctx) struct ucma_event *uevent, *tmp; LIST_HEAD(list); - ucma_cleanup_multicast(ctx); /* Cleanup events not yet reported to the user. */ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx == ctx) + if (uevent->ctx == ctx || uevent->conn_req_ctx == ctx) list_move_tail(&uevent->list, &list); } list_del(&ctx->list); + events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); + /* + * If this was a listening ID then any connections spawned from it + * that have not been delivered to userspace are cleaned up too. + * Must be done outside any locks. + */ list_for_each_entry_safe(uevent, tmp, &list, list) { list_del(&uevent->list); - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + uevent->conn_req_ctx != ctx) + __destroy_id(uevent->conn_req_ctx); kfree(uevent); } - events_reported = ctx->events_reported; mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } +static int __destroy_id(struct ucma_context *ctx) +{ + /* + * If the refcount is already 0 then ucma_close_id() has already + * destroyed the cm_id, otherwise holding the refcount keeps cm_id + * valid. Prevent queue_work() from being called. 
+ */ + if (refcount_inc_not_zero(&ctx->ref)) { + rdma_lock_handler(ctx->cm_id); + ctx->destroying = 1; + rdma_unlock_handler(ctx->cm_id); + ucma_put_ctx(ctx); + } + + cancel_work_sync(&ctx->close_work); + /* At this point it's guaranteed that there is no inflight closing task */ + if (ctx->cm_id) + ucma_close_id(&ctx->close_work); + return ucma_free_ctx(ctx); +} + static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { @@ -624,24 +603,7 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&ctx->file->mut); - ctx->destroying = 1; - mutex_unlock(&ctx->file->mut); - - flush_workqueue(ctx->file->close_wq); - /* At this point it's guaranteed that there is no inflight - * closing task */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } - - resp.events_reported = ucma_free_ctx(ctx); + resp.events_reported = __destroy_id(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -1124,10 +1086,12 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? - cmd.backlog : max_backlog; + if (cmd.backlog <= 0 || cmd.backlog > max_backlog) + cmd.backlog = max_backlog; + atomic_set(&ctx->backlog, cmd.backlog); + mutex_lock(&ctx->mutex); - ret = rdma_listen(ctx->cm_id, ctx->backlog); + ret = rdma_listen(ctx->cm_id, cmd.backlog); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1160,16 +1124,20 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - mutex_lock(&file->mut); mutex_lock(&ctx->mutex); - ret = __rdma_accept_ece(ctx->cm_id, &conn_param, NULL, &ece); - mutex_unlock(&ctx->mutex); - if (!ret) + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); + if (!ret) { + /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; - mutex_unlock(&file->mut); + } + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); - ret = __rdma_accept_ece(ctx->cm_id, NULL, NULL, &ece); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); + rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); @@ -1482,44 +1450,52 @@ static ssize_t ucma_process_join(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&file->mut); - mc = ucma_alloc_multicast(ctx); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; - goto err1; + goto err_put_ctx; } + + mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); + + if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_mc; + } + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); mutex_unlock(&ctx->mutex); if (ret) - goto err2; + goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err3; + goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); - mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 
0; -err3: +err_leave_multicast: + mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); -err2: +err_xa_erase: xa_erase(&multicast_table, mc->id); - list_del(&mc->list); +err_free_mc: kfree(mc); -err1: - mutex_unlock(&file->mut); +err_put_ctx: ucma_put_ctx(ctx); return ret; } @@ -1581,7 +1557,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); - else if (mc->ctx->file != file) + else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); @@ -1598,10 +1574,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); - mutex_lock(&mc->ctx->file->mut); ucma_cleanup_mc_events(mc); - list_del(&mc->list); - mutex_unlock(&mc->ctx->file->mut); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; @@ -1614,45 +1587,15 @@ out: return ret; } -static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - /* Acquire mutex's based on pointer comparison to prevent deadlock. */ - if (file1 < file2) { - mutex_lock(&file1->mut); - mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&file2->mut); - mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); - } -} - -static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - if (file1 < file2) { - mutex_unlock(&file2->mut); - mutex_unlock(&file1->mut); - } else { - mutex_unlock(&file1->mut); - mutex_unlock(&file2->mut); - } -} - -static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) - if (uevent->ctx == ctx) - list_move_tail(&uevent->list, &file->event_list); -} - static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; + struct ucma_event *uevent, *tmp; struct ucma_context *ctx; + LIST_HEAD(event_list); struct fd f; struct ucma_file *cur_file; int ret = 0; @@ -1668,40 +1611,53 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, ret = -EINVAL; goto file_put; } + cur_file = f.file->private_data; /* Validate current fd and prevent destruction of id. */ - ctx = ucma_get_ctx(f.file->private_data, cmd.id); + ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto file_put; } - cur_file = ctx->file; - if (cur_file == new_file) { - resp.events_reported = ctx->events_reported; - goto response; - } - + rdma_lock_handler(ctx->cm_id); /* - * Migrate events between fd's, maintaining order, and avoiding new - * events being added before existing events. + * ctx->file can only be changed under the handler & xa_lock. xa_load() + * must be checked again to ensure the ctx hasn't begun destruction + * since the ucma_get_ctx(). 
*/ - ucma_lock_files(cur_file, new_file); xa_lock(&ctx_table); - - list_move_tail(&ctx->list, &new_file->ctx_list); - ucma_move_events(ctx, new_file); + if (_ucma_find_context(cmd.id, cur_file) != ctx) { + xa_unlock(&ctx_table); + ret = -ENOENT; + goto err_unlock; + } ctx->file = new_file; + xa_unlock(&ctx_table); + + mutex_lock(&cur_file->mut); + list_del(&ctx->list); + /* + * At this point lock_handler() prevents addition of new uevents for + * this ctx. + */ + list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; + mutex_unlock(&cur_file->mut); - xa_unlock(&ctx_table); - ucma_unlock_files(cur_file, new_file); + mutex_lock(&new_file->mut); + list_add_tail(&ctx->list, &new_file->ctx_list); + list_splice_tail(&event_list, &new_file->event_list); + mutex_unlock(&new_file->mut); -response: if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; +err_unlock: + rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); file_put: fdput(f); @@ -1801,13 +1757,6 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; - file->close_wq = alloc_ordered_workqueue("ucma_close_id", - WQ_MEM_RECLAIM); - if (!file->close_wq) { - kfree(file); - return -ENOMEM; - } - INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1822,37 +1771,22 @@ static int ucma_open(struct inode *inode, struct file *filp) static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; - struct ucma_context *ctx, *tmp; - mutex_lock(&file->mut); - list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { - ctx->destroying = 1; - mutex_unlock(&file->mut); + /* + * All paths that touch ctx_list or ctx_list starting from write() are + * prevented by this being a FD release function. The list_add_tail() in + * ucma_connect_event_handler() can run concurrently, however it only + * adds to the list *after* a listening ID. By only reading the first of + * the list, and relying on __destroy_id() to block + * ucma_connect_event_handler(), no additional locking is needed. + */ + while (!list_empty(&file->ctx_list)) { + struct ucma_context *ctx = list_first_entry( + &file->ctx_list, struct ucma_context, list); xa_erase(&ctx_table, ctx->id); - flush_workqueue(file->close_wq); - /* At that step once ctx was marked as destroying and workqueue - * was flushed we are safe from any inflights handlers that - * might put other closing task. - */ - xa_lock(&ctx_table); - if (!ctx->closing) { - xa_unlock(&ctx_table); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - /* rdma_destroy_id ensures that no event handlers are - * inflight for that id before releasing it. 
- */ - rdma_destroy_id(ctx->cm_id); - } else { - xa_unlock(&ctx_table); - } - - ucma_free_ctx(ctx); - mutex_lock(&file->mut); + __destroy_id(ctx); } - mutex_unlock(&file->mut); - destroy_workqueue(file->close_wq); kfree(file); return 0; } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 831bff8d52e5..e9fecbdf391b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -39,6 +39,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/count_zeros.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" @@ -60,73 +61,6 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d sg_free_table(&umem->sg_head); } -/* ib_umem_add_sg_table - Add N contiguous pages to scatter table - * - * sg: current scatterlist entry - * page_list: array of npage struct page pointers - * npages: number of pages in page_list - * max_seg_sz: maximum segment size in bytes - * nents: [out] number of entries in the scatterlist - * - * Return new end of scatterlist - */ -static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, - struct page **page_list, - unsigned long npages, - unsigned int max_seg_sz, - int *nents) -{ - unsigned long first_pfn; - unsigned long i = 0; - bool update_cur_sg = false; - bool first = !sg_page(sg); - - /* Check if new page_list is contiguous with end of previous page_list. - * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. - */ - if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == - page_to_pfn(page_list[0]))) - update_cur_sg = true; - - while (i != npages) { - unsigned long len; - struct page *first_page = page_list[i]; - - first_pfn = page_to_pfn(first_page); - - /* Compute the number of contiguous pages we have starting - * at i - */ - for (len = 0; i != npages && - first_pfn + len == page_to_pfn(page_list[i]) && - len < (max_seg_sz >> PAGE_SHIFT); - len++) - i++; - - /* Squash N contiguous pages from page_list into current sge */ - if (update_cur_sg) { - if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { - sg_set_page(sg, sg_page(sg), - sg->length + (len << PAGE_SHIFT), - 0); - update_cur_sg = false; - continue; - } - update_cur_sg = false; - } - - /* Squash N contiguous pages into next sge or first sge */ - if (!first) - sg = sg_next(sg); - - (*nents)++; - sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); - first = false; - } - - return sg; -} - /** * ib_umem_find_best_pgsz - Find best HW page size to use for this MR * @@ -146,18 +80,28 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, unsigned long virt) { struct scatterlist *sg; - unsigned int best_pg_bit; unsigned long va, pgoff; dma_addr_t mask; int i; + /* rdma_for_each_block() has a bug if the page size is smaller than the + * page size used to build the umem. For now prevent smaller page sizes + * from being returned. + */ + pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT); + /* At minimum, drivers must support PAGE_SIZE or smaller */ if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0)))) return 0; - va = virt; - /* max page size not to exceed MR length */ - mask = roundup_pow_of_two(umem->length); + umem->iova = va = virt; + /* The best result is the smallest page size that results in the minimum + * number of required pages. Compute the largest page size that could + * work based on VA address bits that don't change. 
+ */ + mask = pgsz_bitmap & + GENMASK(BITS_PER_LONG - 1, + bits_per((umem->length - 1 + virt) ^ virt)); /* offset into first SGL */ pgoff = umem->address & ~PAGE_MASK; @@ -175,9 +119,14 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, mask |= va; pgoff = 0; } - best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap); - return BIT_ULL(best_pg_bit); + /* The mask accumulates 1's in each position where the VA and physical + * address differ, thus the length of trailing 0 is the largest page + * size that can pass the VA through to the physical. + */ + if (mask) + pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); + return rounddown_pow_of_two(pgsz_bitmap); } EXPORT_SYMBOL(ib_umem_find_best_pgsz); @@ -201,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, struct mm_struct *mm; unsigned long npages; int ret; - struct scatterlist *sg; + struct scatterlist *sg = NULL; unsigned int gup_flags = FOLL_WRITE; /* @@ -224,6 +173,11 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, umem->ibdev = device; umem->length = size; umem->address = addr; + /* + * Drivers should call ib_umem_find_best_pgsz() to set the iova + * correctly. + */ + umem->iova = addr; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); @@ -251,15 +205,9 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, cur_base = addr & PAGE_MASK; - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); - if (ret) - goto vma; - if (!umem->writable) gup_flags |= FOLL_FORCE; - sg = umem->sg_head.sgl; - while (npages) { cond_resched(); ret = pin_user_pages_fast(cur_base, @@ -271,15 +219,19 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, goto umem_release; cur_base += ret * PAGE_SIZE; - npages -= ret; - - sg = ib_umem_add_sg_table(sg, page_list, ret, - dma_get_max_seg_size(device->dma_device), - &umem->sg_nents); + npages -= ret; + sg = __sg_alloc_table_from_pages( + &umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT, + dma_get_max_seg_size(device->dma_device), sg, npages, + GFP_KERNEL); + umem->sg_nents = umem->sg_head.nents; + if (IS_ERR(sg)) { + unpin_user_pages_dirty_lock(page_list, ret, 0); + ret = PTR_ERR(sg); + goto umem_release; + } } - sg_mark_end(sg); - if (access & IB_ACCESS_RELAXED_ORDERING) dma_attr |= DMA_ATTR_WEAK_ORDERING; @@ -297,7 +249,6 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, umem_release: __ib_umem_release(device, umem, 0); -vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: free_page((unsigned long) page_list); @@ -329,18 +280,6 @@ void ib_umem_release(struct ib_umem *umem) } EXPORT_SYMBOL(ib_umem_release); -int ib_umem_page_count(struct ib_umem *umem) -{ - int i, n = 0; - struct scatterlist *sg; - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> PAGE_SHIFT; - - return n; -} -EXPORT_SYMBOL(ib_umem_page_count); - /* * Copy from the given ib_umem's pages to the given buffer. 
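/*
 * Driver-side sketch of the reworked helper (not part of the patch): pick
 * the best page size for an MR and walk the umem in blocks of that size.
 * The capability mask, "iova" and set_hw_pte() are assumptions for the
 * example.
 */
	struct ib_block_iter biter;
	unsigned long pgsz;

	pgsz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_64K | SZ_2M, iova);
	if (!pgsz)
		return -EINVAL;

	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, pgsz)
		set_hw_pte(mr, rdma_block_iter_dma_address(&biter));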
* diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index cc6b4befde7c..323f6cf00682 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -40,6 +40,7 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <linux/interval_tree.h> +#include <linux/hmm.h> #include <linux/pagemap.h> #include <rdma/ib_verbs.h> @@ -60,7 +61,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, size_t page_size = 1UL << umem_odp->page_shift; unsigned long start; unsigned long end; - size_t pages; + size_t ndmas, npfns; start = ALIGN_DOWN(umem_odp->umem.address, page_size); if (check_add_overflow(umem_odp->umem.address, @@ -71,20 +72,21 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, if (unlikely(end < page_size)) return -EOVERFLOW; - pages = (end - start) >> umem_odp->page_shift; - if (!pages) + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) return -EINVAL; - umem_odp->page_list = kvcalloc( - pages, sizeof(*umem_odp->page_list), GFP_KERNEL); - if (!umem_odp->page_list) + npfns = (end - start) >> PAGE_SHIFT; + umem_odp->pfn_list = kvcalloc( + npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); + if (!umem_odp->pfn_list) return -ENOMEM; umem_odp->dma_list = kvcalloc( - pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); + ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); if (!umem_odp->dma_list) { ret = -ENOMEM; - goto out_page_list; + goto out_pfn_list; } ret = mmu_interval_notifier_insert(&umem_odp->notifier, @@ -98,8 +100,8 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, out_dma_list: kvfree(umem_odp->dma_list); -out_page_list: - kvfree(umem_odp->page_list); +out_pfn_list: + kvfree(umem_odp->pfn_list); return ret; } @@ -276,7 +278,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); kvfree(umem_odp->dma_list); - kvfree(umem_odp->page_list); + kvfree(umem_odp->pfn_list); } put_pid(umem_odp->tgid); kfree(umem_odp); @@ -287,87 +289,56 @@ EXPORT_SYMBOL(ib_umem_odp_release); * Map for DMA and insert a single page into the on-demand paging page tables. * * @umem: the umem to insert the page to. - * @page_index: index in the umem to add the page to. + * @dma_index: index in the umem to add the dma to. * @page: the page struct to map and add. * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from * umem_odp->notifiers_seq. * - * The function returns -EFAULT if the DMA mapping operation fails. It returns - * -EAGAIN if a concurrent invalidation prevents us from updating the page. + * The function returns -EFAULT if the DMA mapping operation fails. * - * The page is released via put_page even if the operation failed. For on-demand - * pinning, the page is released whenever it isn't stored in the umem. 
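After this change an ODP umem carries two arrays of different granularity: pfn_list has one entry per system page (PAGE_SHIFT) while dma_list keeps one entry per umem page (umem_odp->page_shift). A small stand-alone sketch of the index arithmetic that relationship implies, assuming a hypothetical 64KB ODP page size on a 4KB-page system:

#include <stdio.h>

#define SYS_PAGE_SHIFT	12	/* assumed 4KB system pages */

int main(void)
{
	unsigned int page_shift = 16;			/* hypothetical 64KB ODP page */
	unsigned long umem_len = 1UL << 20;		/* 1MB registration */

	unsigned long npfns = umem_len >> SYS_PAGE_SHIFT;	/* 256 pfn_list slots */
	unsigned long ndmas = umem_len >> page_shift;		/* 16 dma_list slots */
	unsigned long pfn_index, dma_index = 0;

	/* Walk the range the way the new fault path does: one dma_list slot
	 * advances for every 2^(page_shift - SYS_PAGE_SHIFT) pfn_list slots.
	 */
	for (pfn_index = 0; pfn_index < npfns;
	     pfn_index += 1UL << (page_shift - SYS_PAGE_SHIFT), dma_index++)
		;	/* nothing to do; we only care about the indices */

	printf("npfns=%lu ndmas=%lu dma slots walked=%lu\n",
	       npfns, ndmas, dma_index);
	return 0;
}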
*/ static int ib_umem_odp_map_dma_single_page( struct ib_umem_odp *umem_odp, - unsigned int page_index, + unsigned int dma_index, struct page *page, - u64 access_mask, - unsigned long current_seq) + u64 access_mask) { struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t dma_addr; - int ret = 0; + dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) { - ret = -EAGAIN; - goto out; - } - if (!(umem_odp->dma_list[page_index])) { - dma_addr = - ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, dma_addr)) { - ret = -EFAULT; - goto out; - } - umem_odp->dma_list[page_index] = dma_addr | access_mask; - umem_odp->page_list[page_index] = page; - umem_odp->npages++; - } else if (umem_odp->page_list[page_index] == page) { - umem_odp->dma_list[page_index] |= access_mask; - } else { + if (*dma_addr) { /* - * This is a race here where we could have done: - * - * CPU0 CPU1 - * get_user_pages() - * invalidate() - * page_fault() - * mutex_lock(umem_mutex) - * page from GUP != page in ODP - * - * It should be prevented by the retry test above as reading - * the seq number should be reliable under the - * umem_mutex. Thus something is really not working right if - * things get here. + * If the page is already dma mapped it means it went through + * a non-invalidating trasition, like read-only to writable. + * Resync the flags. */ - WARN(true, - "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem_odp->page_list[page_index], page); - ret = -EAGAIN; + *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; + return 0; } -out: - put_page(page); - return ret; + *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, *dma_addr)) { + *dma_addr = 0; + return -EFAULT; + } + umem_odp->npages++; + *dma_addr |= access_mask; + return 0; } /** - * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * - * Pins the range of pages passed in the argument, and maps them to - * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem_odp->dma_list. + * Maps the range passed in the argument to DMA addresses. + * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. + * Upon success the ODP MR will be locked to let caller complete its device + * page table update. * * Returns the number of pages mapped in success, negative error code * for failure. - * An -EAGAIN error code is returned when a concurrent mmu notifier prevents - * the function from completing its task. - * An -ENOENT error code indicates that userspace process is being terminated - * and mm was already destroyed. * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be @@ -376,21 +347,19 @@ out: * the return value. * @access_mask: bit mask of the requested access permissions for the given * range. - * @current_seq: the MMU notifiers sequance value for synchronization with - * invalidations. 
the sequance number is read from - * umem_odp->notifiers_seq before calling this function + * @fault: is faulting required for the given range */ -int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, - u64 bcnt, u64 access_mask, - unsigned long current_seq) +int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, bool fault) + __acquires(&umem_odp->umem_mutex) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = umem_odp->umem.owning_mm; - struct page **local_page_list = NULL; - u64 page_mask, off; - int j, k, ret = 0, start_idx, npages = 0; - unsigned int flags = 0, page_shift; - phys_addr_t p = 0; + int pfn_index, dma_index, ret = 0, start_idx; + unsigned int page_shift, hmm_order, pfn_start_idx; + unsigned long num_pfns, current_seq; + struct hmm_range range = {}; + unsigned long timeout; if (access_mask == 0) return -EINVAL; @@ -399,15 +368,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; - local_page_list = (struct page **)__get_free_page(GFP_KERNEL); - if (!local_page_list) - return -ENOMEM; - page_shift = umem_odp->page_shift; - page_mask = ~(BIT(page_shift) - 1); - off = user_virt & (~page_mask); - user_virt = user_virt & page_mask; - bcnt += off; /* Charge for the first page offset as well. */ /* * owning_process is allowed to be NULL, this means somehow the mm is @@ -420,99 +381,104 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, goto out_put_task; } - if (access_mask & ODP_WRITE_ALLOWED_BIT) - flags |= FOLL_WRITE; + range.notifier = &umem_odp->notifier; + range.start = ALIGN_DOWN(user_virt, 1UL << page_shift); + range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); + pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + num_pfns = (range.end - range.start) >> PAGE_SHIFT; + if (fault) { + range.default_flags = HMM_PFN_REQ_FAULT; - start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; - k = start_idx; + if (access_mask & ODP_WRITE_ALLOWED_BIT) + range.default_flags |= HMM_PFN_REQ_WRITE; + } - while (bcnt > 0) { - const size_t gup_num_pages = min_t(size_t, - ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, - PAGE_SIZE / sizeof(struct page *)); + range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); - mmap_read_lock(owning_mm); - /* - * Note: this might result in redundent page getting. We can - * avoid this by checking dma_list to be 0 before calling - * get_user_pages. However, this make the code much more - * complex (and doesn't gain us much performance in most use - * cases). 
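ib_umem_odp_map_dma_single_page() above relies on the DMA address of a page being page aligned, so the low bits of each dma_list entry are free to carry the access permissions; that is why a read-only to writable transition only needs the mask-and-or resync shown earlier in this hunk. A stand-alone illustration of that encoding (the flag names and values below are placeholders, not the kernel's ODP_* definitions):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: a page-aligned address never uses its two low
 * bits, so they can carry per-page permissions.
 */
#define MY_READ_BIT	(1ULL << 0)
#define MY_WRITE_BIT	(1ULL << 1)
#define MY_ADDR_MASK	(~(MY_READ_BIT | MY_WRITE_BIT))

int main(void)
{
	uint64_t entry = 0xabcd000ULL | MY_READ_BIT;	/* mapped read-only */

	/* Upgrade to writable by resyncing only the flag bits, keeping the
	 * already-mapped DMA address intact.
	 */
	entry = (entry & MY_ADDR_MASK) | MY_READ_BIT | MY_WRITE_BIT;

	printf("dma addr 0x%llx readable=%d writable=%d\n",
	       (unsigned long long)(entry & MY_ADDR_MASK),
	       !!(entry & MY_READ_BIT), !!(entry & MY_WRITE_BIT));
	return 0;
}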
- */ - npages = get_user_pages_remote(owning_mm, - user_virt, gup_num_pages, - flags, local_page_list, NULL, NULL); - mmap_read_unlock(owning_mm); - - if (npages < 0) { - if (npages != -EAGAIN) - pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - else - pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - break; - } +retry: + current_seq = range.notifier_seq = + mmu_interval_read_begin(&umem_odp->notifier); - bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem_odp->umem_mutex); - for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { - if (user_virt & ~page_mask) { - p += PAGE_SIZE; - if (page_to_phys(local_page_list[j]) != p) { - ret = -EFAULT; - break; - } - put_page(local_page_list[j]); - continue; - } + mmap_read_lock(owning_mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(owning_mm); + if (unlikely(ret)) { + if (ret == -EBUSY && !time_after(jiffies, timeout)) + goto retry; + goto out_put_mm; + } - ret = ib_umem_odp_map_dma_single_page( - umem_odp, k, local_page_list[j], - access_mask, current_seq); - if (ret < 0) { - if (ret != -EAGAIN) - pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - else - pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; + dma_index = start_idx; - p = page_to_phys(local_page_list[j]); - k++; - } + mutex_lock(&umem_odp->umem_mutex); + if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) { mutex_unlock(&umem_odp->umem_mutex); + goto retry; + } - if (ret < 0) { + for (pfn_index = 0; pfn_index < num_pfns; + pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { + + if (fault) { /* - * Release pages, remembering that the first page - * to hit an error was already released by - * ib_umem_odp_map_dma_single_page(). + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. */ - if (npages - (j + 1) > 0) - release_pages(&local_page_list[j+1], - npages - (j + 1)); + WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + } else { + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { + WARN_ON(umem_odp->dma_list[dma_index]); + continue; + } + access_mask = ODP_READ_ALLOWED_BIT; + if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) + access_mask |= ODP_WRITE_ALLOWED_BIT; + } + + hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); + /* If a hugepage was detected and ODP wasn't set for, the umem + * page_shift will be used, the opposite case is an error. 
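The new fault path above follows the mmu_interval notifier retry idiom: sample the notifier sequence, run hmm_range_fault() without holding umem_mutex, then take the mutex and re-check the sequence, starting over if an invalidation ran in between and keeping the mutex held on success. A toy user-space model of that control flow, with a plain generation counter standing in for the notifier (every name below is a placeholder, not a kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong invalidate_seq;	/* bumped by "invalidations" */

static unsigned long read_begin(void)
{
	return atomic_load(&invalidate_seq);	/* mmu_interval_read_begin() */
}

static bool read_retry(unsigned long seq)
{
	return atomic_load(&invalidate_seq) != seq;	/* mmu_interval_read_retry() */
}

static void fault_pages(int attempt)
{
	/* Simulate an invalidation racing with the first attempt only. */
	if (attempt == 0)
		atomic_fetch_add(&invalidate_seq, 1);
}

int main(void)
{
	int attempt = 0;

	for (;;) {
		unsigned long seq = read_begin();

		fault_pages(attempt);		/* hmm_range_fault(), no lock held */

		/* take the result lock here ... */
		if (read_retry(seq)) {
			/* ... drop it and start over if a racer invalidated */
			attempt++;
			continue;
		}
		break;	/* success: the lock stays held for the caller */
	}

	printf("mapped after %d attempt(s)\n", attempt + 1);
	return 0;
}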
+ */ + if (hmm_order + PAGE_SHIFT < page_shift) { + ret = -EINVAL; + ibdev_dbg(umem_odp->umem.ibdev, + "%s: un-expected hmm_order %d, page_shift %d\n", + __func__, hmm_order, page_shift); break; } - } - if (ret >= 0) { - if (npages < 0 && k == start_idx) - ret = npages; - else - ret = k - start_idx; + ret = ib_umem_odp_map_dma_single_page( + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), + access_mask); + if (ret < 0) { + ibdev_dbg(umem_odp->umem.ibdev, + "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); + break; + } } + /* upon sucesss lock should stay on hold for the callee */ + if (!ret) + ret = dma_index - start_idx; + else + mutex_unlock(&umem_odp->umem_mutex); +out_put_mm: mmput(owning_mm); out_put_task: if (owning_process) put_task_struct(owning_process); - free_page((unsigned long)local_page_list); return ret; } -EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); +EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + dma_addr_t dma_addr; + dma_addr_t dma; int idx; u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; @@ -521,20 +487,16 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); - /* Note that during the run of this function, the - * notifiers_count of the MR is > 0, preventing any racing - * faults from completion. We might be racing with other - * invalidations, so we must make sure we free each page only - * once. */ for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - if (umem_odp->page_list[idx]) { - struct page *page = umem_odp->page_list[idx]; - dma_addr_t dma = umem_odp->dma_list[idx]; - dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; + dma = umem_odp->dma_list[idx]; - WARN_ON(!dma_addr); + /* The access flags guaranteed a valid DMA address in case was NULL */ + if (dma) { + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + dma_addr = dma & ODP_DMA_ADDR_MASK; ib_dma_unmap_page(dev, dma_addr, BIT(umem_odp->page_shift), DMA_BIDIRECTIONAL); @@ -551,7 +513,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, */ set_page_dirty(head_page); } - umem_odp->page_list[idx] = NULL; umem_odp->dma_list[idx] = 0; umem_odp->npages--; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2fbc583d5bdd..418d133a8fb0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -218,10 +218,12 @@ int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs) if (!ucontext) return -ENOMEM; - ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->ufile = ufile; xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC); + + rdma_restrack_new(&ucontext->res, RDMA_RESTRACK_CTX); + rdma_restrack_set_name(&ucontext->res, NULL); attrs->context = ucontext; return 0; } @@ -250,7 +252,7 @@ int ib_init_ucontext(struct uverbs_attr_bundle *attrs) if (ret) goto err_uncharge; - rdma_restrack_uadd(&ucontext->res); + rdma_restrack_add(&ucontext->res); /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update @@ -313,6 +315,7 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) err_uobj: rdma_alloc_abort_uobject(uobj, attrs, false); err_ucontext: + 
rdma_restrack_put(&attrs->context->res); kfree(attrs->context); attrs->context = NULL; return ret; @@ -439,12 +442,14 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->device = ib_dev; pd->uobject = uobj; atomic_set(&pd->usecnt, 0); - pd->res.type = RDMA_RESTRACK_PD; + + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, NULL); ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata); if (ret) goto err_alloc; - rdma_restrack_uadd(&pd->res); + rdma_restrack_add(&pd->res); uobj->object = pd; uobj_finalize_uobj_create(uobj, attrs); @@ -453,6 +458,7 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) return uverbs_response(attrs, &resp, sizeof(resp)); err_alloc: + rdma_restrack_put(&pd->res); kfree(pd); err: uobj_alloc_abort(uobj, attrs); @@ -742,9 +748,11 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->sig_attrs = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); - mr->res.type = RDMA_RESTRACK_MR; mr->iova = cmd.hca_va; - rdma_restrack_uadd(&mr->res); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); uobj->object = mr; uobj_put_obj_read(pd); @@ -858,7 +866,7 @@ static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs) static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_alloc_mw cmd; - struct ib_uverbs_alloc_mw_resp resp; + struct ib_uverbs_alloc_mw_resp resp = {}; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; @@ -884,15 +892,21 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) goto err_put; } - mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata); - if (IS_ERR(mw)) { - ret = PTR_ERR(mw); + mw = rdma_zalloc_drv_obj(ib_dev, ib_mw); + if (!mw) { + ret = -ENOMEM; goto err_put; } - mw->device = pd->device; - mw->pd = pd; + mw->device = ib_dev; + mw->pd = pd; mw->uobject = uobj; + mw->type = cmd.mw_type; + + ret = pd->device->ops.alloc_mw(mw, &attrs->driver_udata); + if (ret) + goto err_alloc; + atomic_inc(&pd->usecnt); uobj->object = mw; @@ -903,6 +917,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) resp.mw_handle = uobj->id; return uverbs_response(attrs, &resp, sizeof(resp)); +err_alloc: + kfree(mw); err_put: uobj_put_obj_read(pd); err_free: @@ -994,12 +1010,14 @@ static int create_cq(struct uverbs_attr_bundle *attrs, cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); if (ret) goto err_free; - rdma_restrack_uadd(&cq->res); + rdma_restrack_add(&cq->res); obj->uevent.uobject.object = cq; obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); @@ -1013,6 +1031,7 @@ static int create_cq(struct uverbs_attr_bundle *attrs, return uverbs_response(attrs, &resp, sizeof(resp)); err_free: + rdma_restrack_put(&cq->res); kfree(cq); err_file: if (ev_file) @@ -1237,8 +1256,21 @@ static int create_qp(struct uverbs_attr_bundle *attrs, bool has_sq = true; struct ib_device *ib_dev; - if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) - return -EPERM; + switch (cmd->qp_type) { + case IB_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + break; + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + case IB_QPT_DRIVER: + break; + default: + return -EINVAL; + } obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs, &ib_dev); @@ -2985,11 +3017,11 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_ex_create_rwq_ind_table cmd; struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; - struct ib_uobject *uobj; + struct ib_uobject *uobj; int err; struct ib_rwq_ind_table_init_attr init_attr = {}; struct ib_rwq_ind_table *rwq_ind_tbl; - struct ib_wq **wqs = NULL; + struct ib_wq **wqs = NULL; u32 *wqs_handles = NULL; struct ib_wq *wq = NULL; int i, num_read_wqs; @@ -3047,17 +3079,15 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) goto put_wqs; } - init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; - init_attr.ind_tbl = wqs; - - rwq_ind_tbl = ib_dev->ops.create_rwq_ind_table(ib_dev, &init_attr, - &attrs->driver_udata); - - if (IS_ERR(rwq_ind_tbl)) { - err = PTR_ERR(rwq_ind_tbl); + rwq_ind_tbl = rdma_zalloc_drv_obj(ib_dev, ib_rwq_ind_table); + if (!rwq_ind_tbl) { + err = -ENOMEM; goto err_uobj; } + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl->ind_tbl = wqs; rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; rwq_ind_tbl->uobject = uobj; @@ -3065,6 +3095,11 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) rwq_ind_tbl->device = ib_dev; atomic_set(&rwq_ind_tbl->usecnt, 0); + err = ib_dev->ops.create_rwq_ind_table(rwq_ind_tbl, &init_attr, + &attrs->driver_udata); + if (err) + goto err_create; + for (i = 0; i < num_wq_handles; i++) rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, UVERBS_LOOKUP_READ); @@ -3076,6 +3111,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) resp.response_length = uverbs_response_length(attrs, sizeof(resp)); return uverbs_response(attrs, &resp, sizeof(resp)); +err_create: + kfree(rwq_ind_tbl); err_uobj: uobj_alloc_abort(uobj, attrs); put_wqs: @@ -3232,8 +3269,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) goto err_free; } - flow_id = qp->device->ops.create_flow( - qp, flow_attr, IB_FLOW_DOMAIN_USER, &attrs->driver_udata); + flow_id = qp->device->ops.create_flow(qp, flow_attr, + &attrs->driver_udata); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index a4ba0b87d6de..4bb7c642f80c 100644 --- 
a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -108,8 +108,11 @@ int uverbs_dealloc_mw(struct ib_mw *mw) int ret; ret = mw->device->ops.dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + kfree(mw); return ret; } diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 08c39cfb1bd9..0658101fca00 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -81,12 +81,20 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, { struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - int ret; + u32 table_size = (1 << rwq_ind_tbl->log_ind_tbl_size); + int ret, i; + + if (atomic_read(&rwq_ind_tbl->usecnt)) + return -EBUSY; - ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + ret = rwq_ind_tbl->device->ops.destroy_rwq_ind_table(rwq_ind_tbl); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + + kfree(rwq_ind_tbl); kfree(ind_tbl); return ret; } @@ -122,8 +130,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd_user(pd, &attrs->driver_udata); - return 0; + return ib_dealloc_pd_user(pd, &attrs->driver_udata); } void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index c7e7438752bc..b3c6c066b601 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -46,7 +46,9 @@ static int uverbs_free_counters(struct ib_uobject *uobject, if (ret) return ret; - counters->device->ops.destroy_counters(counters); + ret = counters->device->ops.destroy_counters(counters); + if (ret) + return ret; kfree(counters); return 0; } diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index b1c7dacc02de..8dabd05988b2 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -33,6 +33,7 @@ #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" +#include "restrack.h" static int uverbs_free_cq(struct ib_uobject *uobject, enum rdma_remove_reason why, @@ -123,7 +124,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); if (ret) @@ -131,7 +134,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( obj->uevent.uobject.object = cq; obj->uevent.uobject.user_handle = user_handle; - rdma_restrack_uadd(&cq->res); + rdma_restrack_add(&cq->res); uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, @@ -139,6 +142,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return ret; err_free: + rdma_restrack_put(&cq->res); kfree(cq); err_event_file: if (obj->uevent.event_file) diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index 75df2094a010..f367d523a46b 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -3,11 +3,13 @@ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. */ +#include <linux/overflow.h> #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_ioctl.h> #include <rdma/opa_addr.h> +#include <rdma/ib_cache.h> /* * This ioctl method allows calling any defined write or write_ex @@ -165,7 +167,8 @@ void copy_port_attr_to_resp(struct ib_port_attr *attr, resp->subnet_timeout = attr->subnet_timeout; resp->init_type_reply = attr->init_type_reply; resp->active_width = attr->active_width; - resp->active_speed = attr->active_speed; + /* This ABI needs to be extended to provide any speed more than IB_SPEED_NDR */ + resp->active_speed = min_t(u16, attr->active_speed, IB_SPEED_NDR); resp->phys_state = attr->phys_state; resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num); } @@ -265,6 +268,172 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( return ucontext->device->ops.query_ucontext(ucontext, attrs); } +static int copy_gid_entries_to_user(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_gid_entry *entries, + size_t num_entries, size_t user_entry_size) +{ + const struct uverbs_attr *attr; + void __user *user_entries; + size_t copy_len; + int ret; + int i; + + if (user_entry_size == sizeof(*entries)) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + entries, sizeof(*entries) * num_entries); + return ret; + } + + copy_len = min_t(size_t, user_entry_size, sizeof(*entries)); + attr = uverbs_attr_get(attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + user_entries = u64_to_user_ptr(attr->ptr_attr.data); + for (i = 0; i < num_entries; i++) { + if (copy_to_user(user_entries, entries, copy_len)) + return -EFAULT; + + if (user_entry_size > sizeof(*entries)) { + if (clear_user(user_entries + sizeof(*entries), + user_entry_size - sizeof(*entries))) + return -EFAULT; + } + + entries++; + user_entries += user_entry_size; + } + + return uverbs_output_written(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_TABLE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry *entries; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + size_t user_entry_size; + ssize_t num_entries; + size_t max_entries; + size_t num_bytes; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, 0); + if (ret) + 
return ret; + + ret = uverbs_get_const(&user_entry_size, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE); + if (ret) + return ret; + + max_entries = uverbs_attr_ptr_get_array_size( + attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + user_entry_size); + if (max_entries <= 0) + return -EINVAL; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (check_mul_overflow(max_entries, sizeof(*entries), &num_bytes)) + return -EINVAL; + + entries = uverbs_zalloc(attrs, num_bytes); + if (!entries) + return -ENOMEM; + + num_entries = rdma_query_gid_table(ib_dev, entries, max_entries); + if (num_entries < 0) + return -EINVAL; + + ret = copy_gid_entries_to_user(attrs, entries, num_entries, + user_entry_size); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + &num_entries, sizeof(num_entries)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_ENTRY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry entry = {}; + const struct ib_gid_attr *gid_attr; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + struct net_device *ndev; + u32 gid_index; + u32 port_num; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_PORT); + if (ret) + return ret; + + ret = uverbs_get_const(&gid_index, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX); + if (ret) + return ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + if (!rdma_ib_or_roce(ib_dev, port_num)) + return -EOPNOTSUPP; + + gid_attr = rdma_get_gid_attr(ib_dev, port_num, gid_index); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + memcpy(&entry.gid, &gid_attr->gid, sizeof(gid_attr->gid)); + entry.gid_index = gid_attr->index; + entry.port_num = gid_attr->port_num; + entry.gid_type = gid_attr->gid_type; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(gid_attr); + if (IS_ERR(ndev)) { + if (PTR_ERR(ndev) != -ENODEV) { + ret = PTR_ERR(ndev); + rcu_read_unlock(); + goto out; + } + } else { + entry.netdev_ifindex = ndev->ifindex; + } + rcu_read_unlock(); + + ret = uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, &entry, + sizeof(entry)); +out: + rdma_put_gid_attr(gid_attr); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_GET_CONTEXT, UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, @@ -299,12 +468,38 @@ DECLARE_UVERBS_NAMED_METHOD( reserved), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_TABLE, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, u32, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + UVERBS_ATTR_MIN_SIZE(0), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_PORT, u32, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, u32, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, u32, + UA_MANDATORY), + 
UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, + UVERBS_ATTR_STRUCT(struct ib_uverbs_gid_entry, + netdev_ifindex), + UA_MANDATORY)); + DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), - &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); const struct uapi_definition uverbs_def_obj_device[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), diff --git a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c index cad842ede077..f2e6a625724a 100644 --- a/drivers/infiniband/core/uverbs_std_types_wq.c +++ b/drivers/infiniband/core/uverbs_std_types_wq.c @@ -16,7 +16,7 @@ static int uverbs_free_wq(struct ib_uobject *uobject, container_of(uobject, struct ib_uwq_object, uevent.uobject); int ret; - ret = ib_destroy_wq(wq, &attrs->driver_udata); + ret = ib_destroy_wq_user(wq, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 307886737646..740f8454b6b4 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -272,15 +272,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, atomic_set(&pd->usecnt, 0); pd->flags = flags; - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, caller); ret = device->ops.alloc_pd(pd, NULL); if (ret) { + rdma_restrack_put(&pd->res); kfree(pd); return ERR_PTR(ret); } - rdma_restrack_kadd(&pd->res); + rdma_restrack_add(&pd->res); if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; @@ -329,7 +330,7 @@ EXPORT_SYMBOL(__ib_alloc_pd); * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ -void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) +int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; @@ -343,9 +344,13 @@ void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) requires the caller to guarantee we can't race here. 
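__ib_alloc_pd() just above shows the new restrack ordering used throughout this patch: the restrack entry is initialised and named before the driver callback runs, released with rdma_restrack_put() on the error path, and only published with rdma_restrack_add() once the callback succeeds. A toy user-space model of that ordering, with placeholder names (it only illustrates the call sequence, not restrack internals):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tracked { char name[32]; bool published; };
struct obj { struct tracked res; };

static void track_new(struct tracked *t, const char *name)
{
	snprintf(t->name, sizeof(t->name), "%s", name);	/* _new() + _set_name() */
}

static void track_add(struct tracked *t) { t->published = true; }	/* publish */
static void track_put(struct tracked *t) { (void)t; }			/* undo _new() */

static int driver_alloc(struct obj *o) { (void)o; return 0; }	/* ops.alloc_pd() */

static struct obj *alloc_obj(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return NULL;
	track_new(&o->res, "demo");
	if (driver_alloc(o)) {
		track_put(&o->res);	/* symmetric cleanup before freeing */
		free(o);
		return NULL;
	}
	track_add(&o->res);		/* visible only after success */
	return o;
}

int main(void)
{
	struct obj *o = alloc_obj();

	printf("alloc %s\n", o ? "succeeded" : "failed");
	free(o);
	return 0;
}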
*/ WARN_ON(atomic_read(&pd->usecnt)); + ret = pd->device->ops.dealloc_pd(pd, udata); + if (ret) + return ret; + rdma_restrack_del(&pd->res); - pd->device->ops.dealloc_pd(pd, udata); kfree(pd); + return ret; } EXPORT_SYMBOL(ib_dealloc_pd_user); @@ -728,7 +733,7 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || - net_type == RDMA_NETWORK_IB) { + net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; @@ -964,18 +969,22 @@ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; + int ret; might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; - ah->device->ops.destroy_ah(ah, flags); + ret = ah->device->ops.destroy_ah(ah, flags); + if (ret) + return ret; + atomic_dec(&pd->usecnt); if (sgid_attr) rdma_put_gid_attr(sgid_attr); kfree(ah); - return 0; + return ret; } EXPORT_SYMBOL(rdma_destroy_ah_user); @@ -1060,10 +1069,14 @@ EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { + int ret; + if (atomic_read(&srq->usecnt)) return -EBUSY; - srq->device->ops.destroy_srq(srq, udata); + ret = srq->device->ops.destroy_srq(srq, udata); + if (ret) + return ret; atomic_dec(&srq->pd->usecnt); if (srq->srq_type == IB_SRQT_XRC) @@ -1072,7 +1085,7 @@ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) atomic_dec(&srq->ext.cq->usecnt); kfree(srq); - return 0; + return ret; } EXPORT_SYMBOL(ib_destroy_srq_user); @@ -1781,7 +1794,7 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); -int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u16 *speed, u8 *width) { int rc; u32 netdev_speed; @@ -1984,16 +1997,18 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); ret = device->ops.create_cq(cq, cq_attr, NULL); if (ret) { + rdma_restrack_put(&cq->res); kfree(cq); return ERR_PTR(ret); } - rdma_restrack_kadd(&cq->res); + rdma_restrack_add(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); @@ -2011,16 +2026,21 @@ EXPORT_SYMBOL(rdma_set_cq_moderation); int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { + int ret; + if (WARN_ON_ONCE(cq->shared)) return -EOPNOTSUPP; if (atomic_read(&cq->usecnt)) return -EBUSY; + ret = cq->device->ops.destroy_cq(cq, udata); + if (ret) + return ret; + rdma_restrack_del(&cq->res); - cq->device->ops.destroy_cq(cq, udata); kfree(cq); - return 0; + return ret; } EXPORT_SYMBOL(ib_destroy_cq_user); @@ -2059,8 +2079,10 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->pd = pd; mr->dm = NULL; atomic_inc(&pd->usecnt); - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); return mr; } @@ -2139,11 +2161,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; - mr->res.type = RDMA_RESTRACK_MR; - 
rdma_restrack_kadd(&mr->res); mr->type = mr_type; mr->sig_attrs = NULL; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); out: trace_mr_alloc(pd, mr_type, max_num_sg, mr); return mr; @@ -2199,11 +2222,12 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); mr->type = IB_MR_TYPE_INTEGRITY; mr->sig_attrs = sig_attrs; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); out: trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); return mr; @@ -2328,13 +2352,17 @@ EXPORT_SYMBOL(ib_alloc_xrcd_user); */ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) { + int ret; + if (atomic_read(&xrcd->usecnt)) return -EBUSY; WARN_ON(!xa_empty(&xrcd->tgt_qps)); - xrcd->device->ops.dealloc_xrcd(xrcd, udata); + ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); + if (ret) + return ret; kfree(xrcd); - return 0; + return ret; } EXPORT_SYMBOL(ib_dealloc_xrcd_user); @@ -2378,25 +2406,28 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd, EXPORT_SYMBOL(ib_create_wq); /** - * ib_destroy_wq - Destroys the specified user WQ. + * ib_destroy_wq_user - Destroys the specified user WQ. * @wq: The WQ to destroy. * @udata: Valid user data */ -int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) +int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) { struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; + int ret; if (atomic_read(&wq->usecnt)) return -EBUSY; - wq->device->ops.destroy_wq(wq, udata); + ret = wq->device->ops.destroy_wq(wq, udata); + if (ret) + return ret; + atomic_dec(&pd->usecnt); atomic_dec(&cq->usecnt); - - return 0; + return ret; } -EXPORT_SYMBOL(ib_destroy_wq); +EXPORT_SYMBOL(ib_destroy_wq_user); /** * ib_modify_wq - Modifies the specified WQ. @@ -2419,29 +2450,6 @@ int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, } EXPORT_SYMBOL(ib_modify_wq); -/* - * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. - * @wq_ind_table: The Indirection Table to destroy. -*/ -int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) -{ - int err, i; - u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); - struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; - - if (atomic_read(&rwq_ind_table->usecnt)) - return -EBUSY; - - err = rwq_ind_table->device->ops.destroy_rwq_ind_table(rwq_ind_table); - if (!err) { - for (i = 0; i < table_size; i++) - atomic_dec(&ind_tbl[i]->usecnt); - } - - return err; -} -EXPORT_SYMBOL(ib_destroy_rwq_ind_table); - int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { |
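A theme running through the destroy paths above (ib_dealloc_pd_user(), ib_destroy_cq_user(), ib_dealloc_xrcd_user(), ib_destroy_wq_user()) is that the driver's destroy callback now returns an error code, and the core removes restrack state and frees the kernel object only when the callback reports success. A toy user-space model of that contract, with placeholder names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_cq { int hw_busy; };

/* Stand-in for a driver's ops.destroy_cq(): returns 0 only once the HW
 * object is really gone.
 */
static int driver_destroy_cq(struct toy_cq *cq)
{
	return cq->hw_busy ? -EBUSY : 0;
}

/* Stand-in for the core's destroy path after this patch. */
static int core_destroy_cq(struct toy_cq *cq)
{
	int ret = driver_destroy_cq(cq);

	if (ret)
		return ret;	/* keep the object intact; the caller may retry */

	/* restrack removal would happen here, then the struct is freed */
	free(cq);
	return 0;
}

int main(void)
{
	struct toy_cq *cq = calloc(1, sizeof(*cq));

	if (!cq)
		return 1;
	cq->hw_busy = 1;
	printf("first try: %d\n", core_destroy_cq(cq));	/* -EBUSY, cq still valid */
	cq->hw_busy = 0;
	printf("second try: %d\n", core_destroy_cq(cq));	/* 0, cq freed */
	return 0;
}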