Diffstat (limited to 'drivers/infiniband/core')
26 files changed, 1184 insertions, 429 deletions
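The headline change in cm.c below retires ib_send_cm_mra() in favour of ib_prepare_cm_mra(): callers no longer choose a service timeout or attach private data, and the CM core only formats an MRA itself when a duplicate REQ/REP (or a LAP) arrives, always encoding the fixed CM_MRA_SETTING (24, i.e. 4.096us * 2^24, roughly 68.7 seconds). A minimal migration sketch for a ULP call site, modelled on the cma.c hunks further down; the surrounding handler is hypothetical and error handling is omitted:

	#include <rdma/ib_cm.h>

	/* Hypothetical ULP event-handler fragment: we are not ready to
	 * answer a received REP yet, so arm the MRA machinery. */
	static int ulp_delay_rep(struct ib_cm_id *cm_id)
	{
		/* Old API: the caller picked the service timeout and could
		 * send the MRA right away; rdma_cm passed
		 * IB_CM_MRA_FLAG_DELAY | 24, so the MRA was only recorded,
		 * never sent eagerly:
		 *
		 *	ib_send_cm_mra(cm_id, IB_CM_MRA_FLAG_DELAY | 24,
		 *		       NULL, 0);
		 */

		/* New API: just transition the id into the MRA-sent state;
		 * cm_dup_req_handler()/cm_dup_rep_handler() emit the MRA
		 * with CM_MRA_SETTING if the peer retries. */
		return ib_prepare_cm_mra(cm_id);
	}

With the timeout now a kernel-wide constant, the CMA_CM_MRA_SETTING define in cma.c becomes redundant and is dropped by this series.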
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d49ded7e95f0..f483e0c12444 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -33,6 +33,7 @@ ib_umad-y := user_mad.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 9979a351577f..81cf3c902e81 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -582,8 +582,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, out_unlock: mutex_unlock(&table->lock); if (ret) - pr_warn("%s: unable to add gid %pI6 error=%d\n", - __func__, gid->raw, ret); + pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); return ret; } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 142170473e75..92678e438ff4 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -36,6 +36,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */ #define CM_DIRECT_RETRY_CTX ((void *) 1UL) +#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", @@ -160,6 +161,7 @@ struct cm_counter_attribute { struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; + struct ib_mad_agent *rep_agent; u32 port_num; atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; }; @@ -167,7 +169,7 @@ struct cm_port { struct cm_device { struct kref kref; struct list_head list; - spinlock_t mad_agent_lock; + rwlock_t mad_agent_lock; struct ib_device *ib_device; u8 ack_delay; int going_down; @@ -241,7 +243,6 @@ struct cm_id_private { u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; - u8 service_timeout; u8 target_ack_delay; struct list_head work_list; @@ -274,7 +275,8 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv) complete(&cm_id_priv->comp); } -static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +static struct ib_mad_send_buf * +cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; @@ -285,8 +287,9 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return ERR_PTR(-EINVAL); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); - mad_agent = cm_id_priv->av.port->mad_agent; + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + mad_agent = rep_agent ? 
cm_id_priv->av.port->rep_agent : + cm_id_priv->av.port->mad_agent; if (!mad_agent) { m = ERR_PTR(-EINVAL); goto out; @@ -311,10 +314,15 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) m->ah = ah; out: - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return m; } +static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +{ + return cm_alloc_msg_agent(cm_id_priv, false); +} + static void cm_free_msg(struct ib_mad_send_buf *msg) { if (msg->ah) @@ -323,13 +331,14 @@ static void cm_free_msg(struct ib_mad_send_buf *msg) } static struct ib_mad_send_buf * -cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) +cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state, + bool rep_agent) { struct ib_mad_send_buf *msg; lockdep_assert_held(&cm_id_priv->lock); - msg = cm_alloc_msg(cm_id_priv); + msg = cm_alloc_msg_agent(cm_id_priv, rep_agent); if (IS_ERR(msg)) return msg; @@ -344,6 +353,12 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) return msg; } +static struct ib_mad_send_buf * +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) +{ + return cm_alloc_priv_msg_rep(cm_id_priv, state, false); +} + static void cm_free_priv_msg(struct ib_mad_send_buf *msg) { struct cm_id_private *cm_id_priv = msg->context[0]; @@ -1297,10 +1312,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) if (!cm_id_priv->av.port) return cpu_to_be64(low_tid); - spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); if (cm_id_priv->av.port->mad_agent) hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; - spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } @@ -1872,7 +1887,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, - enum cm_msg_response msg_mraed, u8 service_timeout, + enum cm_msg_response msg_mraed, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); @@ -1881,7 +1896,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, be32_to_cpu(cm_id_priv->id.remote_id)); - IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout); + IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING); if (private_data && private_data_len) IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, @@ -1960,7 +1975,7 @@ static void cm_dup_req_handler(struct cm_work *work, switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REQ, cm_id_priv->private_data, cm_id_priv->private_data_len); break; @@ -2295,7 +2310,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, goto out; } - msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT); + msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true); if (IS_ERR(msg)) { ret = PTR_ERR(msg); goto out; @@ -2454,7 +2469,7 @@ static void cm_dup_rep_handler(struct cm_work *work) cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - 
CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REP, cm_id_priv->private_data, cm_id_priv->private_data_len); else @@ -3094,26 +3109,13 @@ out: return -EINVAL; } -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len) +int ib_prepare_cm_mra(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; - enum cm_msg_response msg_response; - void *data; unsigned long flags; - int ret; - - if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) - return -EINVAL; - - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); + int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3122,58 +3124,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; - msg_response = CM_MSG_RESPONSE_OTHER; break; } fallthrough; default: - trace_icm_send_mra_unknown_err(&cm_id_priv->id); + trace_icm_prepare_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; goto error_unlock; } - if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { - msg = cm_alloc_msg(cm_id_priv); - if (IS_ERR(msg)) { - ret = PTR_ERR(msg); - goto error_unlock; - } - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - msg_response, service_timeout, - private_data, private_data_len); - trace_icm_send_mra(cm_id); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error_free_msg; - } - cm_id->state = cm_state; cm_id->lap_state = lap_state; - cm_id_priv->service_timeout = service_timeout; - cm_set_private_data(cm_id_priv, data, private_data_len); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return 0; + cm_set_private_data(cm_id_priv, NULL, 0); -error_free_msg: - cm_free_msg(msg); error_unlock: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); return ret; } -EXPORT_SYMBOL(ib_send_cm_mra); +EXPORT_SYMBOL(ib_prepare_cm_mra); static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { @@ -3377,7 +3354,6 @@ static int cm_lap_handler(struct cm_work *work) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, - cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); @@ -3786,7 +3762,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv, spin_lock_irq(&cm_id_priv->lock); if (msg != cm_id_priv->msg) { spin_unlock_irq(&cm_id_priv->lock); - cm_free_priv_msg(msg); + cm_free_msg(msg); + cm_deref_id(cm_id_priv); return; } cm_free_priv_msg(msg); @@ -4378,7 +4355,7 @@ static int cm_add_one(struct ib_device *ib_device) return -ENOMEM; kref_init(&cm_dev->kref); - spin_lock_init(&cm_dev->mad_agent_lock); + rwlock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; @@ -4418,9 +4395,22 @@ static int cm_add_one(struct ib_device *ib_device) goto error2; } + port->rep_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + NULL, + 0, + cm_send_handler, + NULL, + port, + 0); + if (IS_ERR(port->rep_agent)) { + ret = 
PTR_ERR(port->rep_agent); + goto error3; + } + ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) - goto error3; + goto error4; count++; } @@ -4435,6 +4425,8 @@ static int cm_add_one(struct ib_device *ib_device) write_unlock_irqrestore(&cm.device_lock, flags); return 0; +error4: + ib_unregister_mad_agent(port->rep_agent); error3: ib_unregister_mad_agent(port->mad_agent); error2: @@ -4448,6 +4440,7 @@ error1: port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->rep_agent); ib_unregister_mad_agent(port->mad_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); @@ -4477,12 +4470,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) rdma_for_each_port (ib_device, i) { struct ib_mad_agent *mad_agent; + struct ib_mad_agent *rep_agent; if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; mad_agent = port->mad_agent; + rep_agent = port->rep_agent; ib_modify_port(ib_device, port->port_num, 0, &port_modify); /* * We flush the queue here after the going_down set, this @@ -4494,10 +4489,12 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) * The above ensures no call paths from the work are running, * the remaining paths all take the mad_agent_lock. */ - spin_lock(&cm_dev->mad_agent_lock); + write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; - spin_unlock(&cm_dev->mad_agent_lock); + port->rep_agent = NULL; + write_unlock(&cm_dev->mad_agent_lock); ib_unregister_mad_agent(mad_agent); + ib_unregister_mad_agent(rep_agent); ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); } diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h index 944d9071245d..4a4987da69d4 100644 --- a/drivers/infiniband/core/cm_trace.h +++ b/drivers/infiniband/core/cm_trace.h @@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep); DEFINE_CM_ERR_EVENT(dreq_unknown); DEFINE_CM_ERR_EVENT(send_unknown_rej); DEFINE_CM_ERR_EVENT(rej_unknown); -DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(prepare_mra_unknown); DEFINE_CM_ERR_EVENT(mra_unknown); DEFINE_CM_ERR_EVENT(qp_init); DEFINE_CM_ERR_EVENT(qp_rtr); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fedcdb56fb6b..9b471548e7ae 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 -#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP @@ -72,6 +71,8 @@ static const char * const cma_events[] = { static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); +static void cma_netevent_work_handler(struct work_struct *_work); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -144,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) } EXPORT_SYMBOL(rdma_iw_cm_id); -/** - * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. 
- @res: rdma resource tracking entry pointer */ -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) -{ - struct rdma_id_private *id_priv = - container_of(res, struct rdma_id_private, res); - - return &id_priv->id; -} -EXPORT_SYMBOL(rdma_res_to_id); - static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); @@ -1047,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; + INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler); rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) @@ -2211,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { - trace_cm_send_mra(id_priv); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(id_priv); + ib_prepare_cm_mra(cm_id); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); @@ -2473,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { - trace_cm_prepare_mra(cm_id->context); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + trace_cm_prepare_mra(cm_id->context); + ib_prepare_cm_mra(cm_id); } mutex_unlock(&conn_id->handler_mutex); @@ -5241,9 +5230,9 @@ static int cma_netevent_callback(struct notifier_block *self, if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; - INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); - queue_work(cma_wq, &current_id->id.net_work); + if (!queue_work(cma_wq, &current_id->id.net_work)) + cma_id_put(current_id); } out: spin_unlock_irqrestore(&id_table_lock, flags); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index dc622f3778be..3456d5f3aa47 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class, DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); -DEFINE_CMA_FSM_EVENT(send_mra); +DEFINE_CMA_FSM_EVENT(prepare_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index e6ec7b7a40af..c3aa6d7fc66b 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -461,7 +461,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) return NULL; qp = container_of(res, struct ib_qp, res); - if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) goto err; return qp; diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a70876a0a231..584537c71545 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -317,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any); */ void ib_free_cq(struct ib_cq *cq) { - int ret; + int ret = 0; if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; if (WARN_ON_ONCE(cq->cqe_used)) return; + if (cq->device->ops.pre_destroy_cq) { + ret = cq->device->ops.pre_destroy_cq(cq); + WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail"); + } + switch (cq->poll_ctx) { case IB_POLL_DIRECT: break; @@ 
-340,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq) rdma_dim_destroy(cq); trace_cq_free(cq); - ret = cq->device->ops.destroy_cq(cq, NULL); + if (cq->device->ops.post_destroy_cq) + cq->device->ops.post_destroy_cq(cq); + else + ret = cq->device->ops.destroy_cq(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); kfree(cq->wc); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b4e3e4beb7f4..3145cb34a1d2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) } EXPORT_SYMBOL(rdma_dev_access_netns); +/** + * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has + * CAP_NET_RAW capability or not. + * + * @dev: Pointer to rdma device whose capability to be checked + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. When rdma subsystem is in legacy shared network, + * namespace mode, the default net namespace is considered. + */ +bool rdma_dev_has_raw_cap(const struct ib_device *dev) +{ + const struct net *net; + + /* Network namespace is the resource whose user namespace + * to be considered. When in shared mode, there is no reliable + * network namespace resource, so consider the default net namespace. + */ + if (ib_devices_shared_netns) + net = &init_net; + else + net = read_pnet(&dev->coredev.rdma_net); + + return ns_capable(net->user_ns, CAP_NET_RAW); +} +EXPORT_SYMBOL(rdma_dev_has_raw_cap); + /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in @@ -557,6 +584,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev, /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate + * @net: network namespace device should be located in, namespace + * must stay valid until ib_register_device() is completed. * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, @@ -564,7 +593,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev, * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *_ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size, struct net *net) { struct ib_device *device; unsigned int i; @@ -581,7 +610,15 @@ struct ib_device *_ib_alloc_device(size_t size) return NULL; } - rdma_init_coredev(&device->coredev, device, &init_net); + /* ib_devices_shared_netns can't change while we have active namespaces + * in the system which means either init_net is passed or the user has + * no idea what they are doing. + * + * To avoid breaking backward compatibility, when in shared mode, + * force to init the device in the init_net. + */ + net = ib_devices_shared_netns ? 
&init_net : net; + rdma_init_coredev(&device->coredev, device, net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); @@ -1352,6 +1389,9 @@ static void ib_device_notify_register(struct ib_device *device) down_read(&devices_rwsem); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; @@ -1468,10 +1508,9 @@ int ib_register_device(struct ib_device *device, const char *name, return ret; } dev_set_uevent_suppress(&device->dev, false); - /* Mark for userspace that device is ready */ - kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_notify_register(device); + ib_device_put(device); return 0; @@ -2669,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); + SET_DEVICE_OP(dev_ops, alloc_dmah); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); @@ -2689,6 +2729,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); + SET_DEVICE_OP(dev_ops, create_cq_umem); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); @@ -2696,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_dmah); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -2761,8 +2803,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, pre_destroy_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); + SET_DEVICE_OP(dev_ops, post_destroy_cq); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); @@ -2791,6 +2835,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_dmah); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index f4486cbd8f45..62410578dec3 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -368,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect); /* * CM_ID <-- DESTROYING * - * Clean up all resources associated with the connection and release - * the initial reference taken by iw_create_cm_id. - * - * Returns true if and only if the last cm_id_priv reference has been dropped. + * Clean up all resources associated with the connection. 
*/ -static bool destroy_cm_id(struct iw_cm_id *cm_id) +static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; struct ib_qp *qp; @@ -442,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id) iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } - - return iwcm_deref_id(cm_id_priv); } /* - * This function is only called by the application thread and cannot - * be called by the event thread. The function will wait for all - * references to be released on the cm_id and then kfree the cm_id - * object. + * Destroy cm_id. If the cm_id still has other references, wait for all + * references to be released on the cm_id and then release the initial + * reference taken by iw_create_cm_id. */ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { - if (!destroy_cm_id(cm_id)) + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + destroy_cm_id(cm_id); + if (refcount_read(&cm_id_priv->refcount) > 1) flush_workqueue(iwcm_wq); + iwcm_deref_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -1035,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work) if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); - if (ret) - WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id)); + if (ret) { + destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 73f3a0b9a54b..8f26bfb69586 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -210,6 +210,29 @@ int ib_response_mad(const struct ib_mad_hdr *hdr) } EXPORT_SYMBOL(ib_response_mad); +#define SOL_FC_MAX_DEFAULT_FRAC 4 +#define SOL_FC_MAX_SA_FRAC 32 + +static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req) +{ + if (!mad_reg_req) + /* Send only agent */ + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + + switch (mad_reg_req->mgmt_class) { + case IB_MGMT_CLASS_CM: + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + case IB_MGMT_CLASS_SUBN_ADM: + return mad_recvq_size / SOL_FC_MAX_SA_FRAC; + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) / + SOL_FC_MAX_DEFAULT_FRAC; + default: + return 0; + } +} + /* * ib_register_mad_agent - Register to send/receive MADs * @@ -391,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); - INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_LIST_HEAD(&mad_agent_priv->backlog_list); INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); refcount_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); + mad_agent_priv->sol_fc_send_count = 0; + mad_agent_priv->sol_fc_wait_count = 0; + mad_agent_priv->sol_fc_max = + recv_handler ? 
get_sol_fc_max_outstanding(mad_reg_req) : 0; ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); if (ret2) { @@ -1055,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) return ret; } +static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) { + mad_agent_priv->sol_fc_wait_count--; + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } else { + expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } +} + +static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_INIT) { + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } else { + expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP, + IB_MAD_STATE_QUEUED); + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } + + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + mad_agent_priv->sol_fc_send_count++; + } +} + +static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START, + IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED); + if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->is_solicited_fc) { + mad_agent_priv->sol_fc_send_count--; + mad_agent_priv->sol_fc_wait_count++; + } + + list_del_init(&mad_send_wr->agent_list); + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, + &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry( + list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } else { + list_item = &mad_agent_priv->wait_list; + } + + list_add(&mad_send_wr->agent_list, list_item); +} + +static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc; +} + +static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } +} + +static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } + + list_del_init(&mad_send_wr->agent_list); +} + +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + switch (new_state) { + 
case IB_MAD_STATE_INIT: + break; + case IB_MAD_STATE_QUEUED: + handle_queued_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_SEND_START: + handle_send_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_WAIT_RESP: + handle_wait_state(mad_send_wr, mad_agent_priv); + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + return; + break; + case IB_MAD_STATE_EARLY_RESP: + handle_early_resp_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_CANCELED: + handle_canceled_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_DONE: + handle_done_state(mad_send_wr, mad_agent_priv); + break; + } + + mad_send_wr->state = new_state; +} + +static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + u8 mgmt_class; + + if (!mad_send_wr->timeout) + return 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (mad_send_wr->mad_agent_priv->agent.rmpp_version && + (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) + return 0; + + mgmt_class = + ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class; + return mgmt_class == IB_MGMT_CLASS_CM || + mgmt_class == IB_MGMT_CLASS_SUBN_ADM || + mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; +} + +static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max) + return false; + + if (!list_empty(&mad_agent_priv->backlog_list)) + return true; + + return mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count >= + mad_agent_priv->sol_fc_max; +} + /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client @@ -1080,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret) goto error; - if (!send_buf->mad_agent->send_handler || - (send_buf->timeout_ms && - !send_buf->mad_agent->recv_handler)) { + if (!send_buf->mad_agent->send_handler) { ret = -EINVAL; goto error; } @@ -1118,15 +1317,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; - /* Reference for work request to QP + response */ - mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); - mad_send_wr->status = IB_WC_SUCCESS; + change_mad_state(mad_send_wr, IB_MAD_STATE_INIT); /* Reference MAD agent until send completes */ refcount_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); + mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr); + if (mad_is_for_backlog(mad_send_wr)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; + } + + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { @@ -1138,7 +1341,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); deref_mad_agent(mad_agent_priv); goto error; @@ -1746,7 +1949,19 @@ ib_find_send_mad(const struct 
ib_mad_agent_private *mad_agent_priv, */ (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; + } + + list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) { + if ((wr->tid == mad_hdr->tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; } /* @@ -1765,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; } return NULL; } +static void +process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc = {}; + unsigned long flags; + int ret; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->backlog_list) && + (mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count < + mad_agent_priv->sol_fc_max)) { + mad_send_wr = list_entry(mad_agent_priv->backlog_list.next, + struct ib_mad_send_wr_private, + agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + ret = ib_send_mad(mad_send_wr); + if (ret) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = IB_WC_LOC_QP_OP_ERR; + mad_agent_priv->agent.send_handler( + &mad_agent_priv->agent, &mad_send_wc); + } + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; - if (mad_send_wr->refcount == 1) - list_move_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->done_list); + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP || + mad_send_wr->state == IB_MAD_STATE_QUEUED) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + else + change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, @@ -1784,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; + bool is_mad_done; int ret; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); @@ -1832,6 +2086,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } else { ib_mark_mad_done(mad_send_wr); + is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ @@ -1841,10 +2096,13 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_recv_wc); deref_mad_agent(mad_agent_priv); - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); 
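/*
 * Simplified model (not part of the commit) of the flow-control gate the
 * mad.c hunks here introduce: with the old refcount/status pair replaced
 * by an explicit per-WR state machine, a send now moves
 * INIT -> SEND_START (or INIT -> QUEUED when throttled) -> WAIT_RESP ->
 * DONE, with EARLY_RESP and CANCELED as side exits. The admission check
 * deciding between SEND_START and the new backlog_list mirrors
 * mad_is_for_backlog()/ib_post_send_mad() above; every field name below
 * comes from this patch.
 */
static bool mad_goes_to_backlog(struct ib_mad_agent_private *priv,
				bool is_solicited_fc)
{
	if (!is_solicited_fc || !priv->sol_fc_max)
		return false;	/* unsolicited, or flow control disabled */
	if (!list_empty(&priv->backlog_list))
		return true;	/* preserve FIFO order behind queued MADs */
	/* throttle once in-flight plus response-waiting sends hit the cap */
	return priv->sol_fc_send_count + priv->sol_fc_wait_count >=
	       priv->sol_fc_max;
}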
+ if (is_mad_done) { + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, + &mad_send_wc); + } } } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, @@ -2172,30 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *temp_mad_send_wr; - struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; - list_del(&mad_send_wr->agent_list); - delay = mad_send_wr->timeout; - mad_send_wr->timeout += jiffies; - - if (delay) { - list_for_each_prev(list_item, &mad_agent_priv->wait_list) { - temp_mad_send_wr = list_entry(list_item, - struct ib_mad_send_wr_private, - agent_list); - if (time_after(mad_send_wr->timeout, - temp_mad_send_wr->timeout)) - break; - } - } else { - list_item = &mad_agent_priv->wait_list; - } - - list_add(&mad_send_wr->agent_list, list_item); + change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) @@ -2229,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, } else ret = IB_RMPP_RESULT_UNHANDLED; - if (mad_send_wc->status != IB_WC_SUCCESS && - mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = mad_send_wc->status; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - - if (--mad_send_wr->refcount > 0) { - if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && - mad_send_wr->status == IB_WC_SUCCESS) { - wait_for_response(mad_send_wr); - } + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + mad_send_wc->status = IB_WC_WR_FLUSH_ERR; + else if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->timeout) { + wait_for_response(mad_send_wr); goto done; } /* Remove send from MAD agent and notify client of completion */ - list_del(&mad_send_wr->agent_list); + if (mad_send_wr->state != IB_MAD_STATE_DONE) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_send_wr->status != IB_WC_SUCCESS) - mad_send_wc->status = mad_send_wr->status; - if (ret == IB_RMPP_RESULT_INTERNAL) + if (ret == IB_RMPP_RESULT_INTERNAL) { ib_rmpp_send_handler(mad_send_wc); - else + } else { + if (mad_send_wr->is_solicited_fc) + process_backlog_mads(mad_agent_priv); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); + } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); @@ -2396,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, return true; } +static void clear_mad_error_list(struct list_head *list, + enum ib_wc_status wc_status, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr, *n; + struct ib_mad_send_wc mad_send_wc; + + mad_send_wc.status = wc_status; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, n, list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + deref_mad_agent(mad_agent_priv); + } +} + static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, 
*temp_mad_send_wr; - struct ib_mad_send_wc mad_send_wc; struct list_head cancel_list; INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &mad_agent_priv->send_list, agent_list) { - if (mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - } + &mad_agent_priv->send_list, agent_list) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); - /* Empty wait list to prevent receives from finding a request */ - list_splice_init(&mad_agent_priv->wait_list, &cancel_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - /* Report all cancelled requests */ - mad_send_wc.status = IB_WC_WR_FLUSH_ERR; - mad_send_wc.vendor_err = 0; + /* Empty wait & backlog list to prevent receives from finding request */ + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &mad_agent_priv->wait_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); + } list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &cancel_list, agent_list) { - mad_send_wc.send_buf = &mad_send_wr->send_buf; - list_del(&mad_send_wr->agent_list); - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - deref_mad_agent(mad_agent_priv); + &mad_agent_priv->backlog_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + /* Report all cancelled requests */ + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } static struct ib_mad_send_wr_private* @@ -2451,6 +2699,13 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv, &mad_send_wr->send_buf == send_buf) return mad_send_wr; } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; } @@ -2468,16 +2723,16 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } - active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); - if (!timeout_ms) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } + active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) || + (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) || + (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms)); + if (!timeout_ms) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) @@ -2589,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + if (mad_send_wr->is_solicited_fc && + !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + return 0; + } if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); @@ -2606,26 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private 
*mad_send_wr) } else ret = ib_send_mad(mad_send_wr); - if (!ret) { - mad_send_wr->refcount++; - list_add_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->send_list); - } + if (!ret) + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + return ret; } static void timeout_sends(struct work_struct *work) { - struct ib_mad_send_wr_private *mad_send_wr, *n; + struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wc mad_send_wc; - struct list_head local_list; + struct list_head timeout_list; + struct list_head cancel_list; + struct list_head *list_item; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); - mad_send_wc.vendor_err = 0; - INIT_LIST_HEAD(&local_list); + INIT_LIST_HEAD(&timeout_list); + INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { @@ -2643,25 +2902,22 @@ static void timeout_sends(struct work_struct *work) break; } - list_del_init(&mad_send_wr->agent_list); - if (mad_send_wr->status == IB_WC_SUCCESS && - !retry_send(mad_send_wr)) + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + list_item = &cancel_list; + else if (retry_send(mad_send_wr)) + list_item = &timeout_list; + else continue; - list_add_tail(&mad_send_wr->agent_list, &local_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, list_item); } - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) { - if (mad_send_wr->status == IB_WC_SUCCESS) - mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; - else - mad_send_wc.status = mad_send_wr->status; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - deref_mad_agent(mad_agent_priv); - } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + process_backlog_mads(mad_agent_priv); + clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR, + mad_agent_priv); + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } /* diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 1b7445a6f671..f444357d33f4 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -95,13 +95,16 @@ struct ib_mad_agent_private { spinlock_t lock; struct list_head send_list; + unsigned int sol_fc_send_count; struct list_head wait_list; - struct list_head done_list; + unsigned int sol_fc_wait_count; struct delayed_work timed_work; unsigned long timeout; struct list_head local_list; struct work_struct local_work; struct list_head rmpp_list; + unsigned int sol_fc_max; + struct list_head backlog_list; refcount_t refcount; union { @@ -118,6 +121,32 @@ struct ib_mad_snoop_private { struct completion comp; }; +enum ib_mad_state { + /* MAD is in the making and is not yet in any list */ + IB_MAD_STATE_INIT, + /* MAD is in backlog list */ + IB_MAD_STATE_QUEUED, + /* + * MAD was sent to the QP and is waiting for completion + * notification in send list. + */ + IB_MAD_STATE_SEND_START, + /* + * MAD send completed successfully, waiting for a response + * in wait list. + */ + IB_MAD_STATE_WAIT_RESP, + /* + * Response came early, before send completion notification, + * in send list. 
+ */ + IB_MAD_STATE_EARLY_RESP, + /* MAD was canceled while in wait or send list */ + IB_MAD_STATE_CANCELED, + /* MAD processing completed, MAD in no list */ + IB_MAD_STATE_DONE +}; + struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; @@ -132,8 +161,6 @@ struct ib_mad_send_wr_private { int max_retries; int retries_left; int retry; - int refcount; - enum ib_wc_status status; /* RMPP control */ struct list_head rmpp_list; @@ -143,8 +170,48 @@ struct ib_mad_send_wr_private { int seg_num; int newwin; int pad; + + enum ib_mad_state state; + + /* Solicited MAD flow control */ + bool is_solicited_fc; }; +static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state); +} + +static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2); +} + +static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2, + enum ib_mad_state expected_state3) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2 && + mad_send_wr->state != expected_state3); +} + +static inline void +not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state wrong_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state == wrong_state); +} + struct ib_mad_local_private { struct list_head completion_list; struct ib_mad_private *mad_priv; @@ -222,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, unsigned long timeout_ms); +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state); + #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 8af0619a39cd..1c5e0eaf1c94 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) - return (void *) ah; + return ERR_CAST(ah); hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, @@ -608,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent, goto out; /* Unmatched send */ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_REM_ABORT_ERR; - wc.vendor_err = rmpp_status; - wc.send_buf = &mad_send_wr->send_buf; - 
ib_mad_complete_send_wr(mad_send_wr, &wc); return; out: spin_unlock_irqrestore(&agent->lock, flags); @@ -684,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ if (seg_num > mad_send_wr->send_buf.seg_count || @@ -709,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_send_wc wc; ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; - wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &wc); return; } - if (mad_send_wr->refcount == 1) + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) ib_reset_mad_timeout(mad_send_wr, mad_send_wr->send_buf.timeout_ms); spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; - } else if (mad_send_wr->refcount == 1 && + } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP && mad_send_wr->seg_num < mad_send_wr->newwin && mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ @@ -731,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, if (ret) goto out; - mad_send_wr->refcount++; + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } @@ -890,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->newwin = init_newwin(mad_send_wr); /* We need to wait for the final ACK even if there isn't a response */ - mad_send_wr->refcount += (mad_send_wr->timeout == 0); ret = send_next_seg(mad_send_wr); if (!ret) return IB_RMPP_RESULT_CONSUMED; @@ -912,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ if (mad_send_wc->status != IB_WC_SUCCESS || - mad_send_wr->status != IB_WC_SUCCESS) + mad_send_wr->state == IB_MAD_STATE_CANCELED) return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ if (!mad_send_wr->timeout) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a872643e8039..2220a2dfab24 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -255,7 +255,7 @@ EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); bool rdma_nl_get_privileged_qkey(void) { - return privileged_qkey || capable(CAP_NET_RAW); + return privileged_qkey; } EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); @@ -1469,10 +1469,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { }; -static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, - enum rdma_restrack_type res_type, - res_fill_func_t fill_func) +static noinline_for_stack int +res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; @@ 
-2263,10 +2264,10 @@ err: return ret; } -static int stat_get_doit_default_counter(struct sk_buff *skb, - struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, - struct nlattr *tb[]) +static noinline_for_stack int +stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) { struct rdma_hw_stats *stats; struct nlattr *table_attr; @@ -2356,8 +2357,9 @@ err: return ret; } -static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, struct nlattr *tb[]) +static noinline_for_stack int +stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) { static enum rdma_nl_counter_mode mode; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 90c177edf9b0..18918f463361 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj, WARN_ON(true); } } + +/** + * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the + * uverbs attributes file has CAP_NET_RAW + * capability or not. + * + * @attrs: Pointer to uverbs attributes + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. + */ +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; + bool has_cap = false; + int srcu_key; + + srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu); + ucontext = ib_uverbs_get_ucontext_file(ufile); + if (IS_ERR(ucontext)) + goto out; + has_cap = rdma_dev_has_raw_cap(ucontext->device); + +out: + srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key); + return has_cap; +} +EXPORT_SYMBOL(rdma_uattrs_has_raw_cap); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 33706dad6c0f..a59b087611cb 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3313410014cd..a7de6f403fca 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct rdma_counter, res)->device; case RDMA_RESTRACK_SRQ: return container_of(res, struct ib_srq, res)->device; + case RDMA_RESTRACK_DMAH: + return container_of(res, struct ib_dmah, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c index 6853c6d078f9..de5cb8bf0a61 100644 --- a/drivers/infiniband/core/ucaps.c +++ b/drivers/infiniband/core/ucaps.c @@ -170,7 +170,7 @@ int ib_create_ucap(enum rdma_user_cap type) ucap->dev.class = &ucaps_class; ucap->dev.devt = 
MKDEV(MAJOR(ucaps_base_dev), type); ucap->dev.release = ucap_dev_release; - ret = dev_set_name(&ucap->dev, ucap_names[type]); + ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]); if (ret) goto err_device; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e9fa22d31c23..b1c44ec1a3f3 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -41,65 +41,83 @@ #include <linux/hugetlb.h> #include <linux/interval_tree.h> #include <linux/hmm.h> +#include <linux/hmm-dma.h> #include <linux/pagemap.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" -static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, - const struct mmu_interval_notifier_ops *ops) +static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp) { - int ret; + umem_odp->is_implicit_odp = 1; + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); +} + +static int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) +{ + struct ib_device *dev = umem_odp->umem.ibdev; + size_t page_size = 1UL << umem_odp->page_shift; + struct hmm_dma_map *map; + unsigned long start; + unsigned long end; + size_t nr_entries; + int ret = 0; umem_odp->umem.is_odp = 1; mutex_init(&umem_odp->umem_mutex); - if (!umem_odp->is_implicit_odp) { - size_t page_size = 1UL << umem_odp->page_shift; - unsigned long start; - unsigned long end; - size_t ndmas, npfns; - - start = ALIGN_DOWN(umem_odp->umem.address, page_size); - if (check_add_overflow(umem_odp->umem.address, - (unsigned long)umem_odp->umem.length, - &end)) - return -EOVERFLOW; - end = ALIGN(end, page_size); - if (unlikely(end < page_size)) - return -EOVERFLOW; - - ndmas = (end - start) >> umem_odp->page_shift; - if (!ndmas) - return -EINVAL; - - npfns = (end - start) >> PAGE_SHIFT; - umem_odp->pfn_list = kvcalloc( - npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); - if (!umem_odp->pfn_list) - return -ENOMEM; - - umem_odp->dma_list = kvcalloc( - ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); - if (!umem_odp->dma_list) { + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + /* + * The mmu notifier can be called within reclaim contexts and takes the + * umem_mutex. This is rare to trigger in testing, teach lockdep about + * it. 
+ */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + fs_reclaim_acquire(GFP_KERNEL); + mutex_lock(&umem_odp->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); + fs_reclaim_release(GFP_KERNEL); + } + + nr_entries = (end - start) >> PAGE_SHIFT; + if (!(nr_entries * PAGE_SIZE / page_size)) + return -EINVAL; + + map = &umem_odp->map; + if (ib_uses_virt_dma(dev)) { + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) ret = -ENOMEM; - goto out_pfn_list; - } + } else + ret = hmm_dma_map_alloc(dev->dma_device, map, + (end - start) >> PAGE_SHIFT, + 1 << umem_odp->page_shift); + if (ret) + return ret; - ret = mmu_interval_notifier_insert(&umem_odp->notifier, - umem_odp->umem.owning_mm, - start, end - start, ops); - if (ret) - goto out_dma_list; - } + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_map; return 0; -out_dma_list: - kvfree(umem_odp->dma_list); -out_pfn_list: - kvfree(umem_odp->pfn_list); +out_free_map: + if (ib_uses_virt_dma(dev)) + kfree(map->pfn_list); + else + hmm_dma_map_free(dev->dma_device, map); return ret; } @@ -118,7 +136,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, { struct ib_umem *umem; struct ib_umem_odp *umem_odp; - int ret; if (access & IB_ACCESS_HUGETLB) return ERR_PTR(-EINVAL); @@ -130,16 +147,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, umem->ibdev = device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; - umem_odp->is_implicit_odp = 1; umem_odp->page_shift = PAGE_SHIFT; umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - ret = ib_init_umem_odp(umem_odp, NULL); - if (ret) { - put_pid(umem_odp->tgid); - kfree(umem_odp); - return ERR_PTR(ret); - } + ib_init_umem_implicit_odp(umem_odp); return umem_odp; } EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); @@ -260,74 +271,41 @@ err_put_pid: } EXPORT_SYMBOL(ib_umem_odp_get); -void ib_umem_odp_release(struct ib_umem_odp *umem_odp) +static void ib_umem_odp_free(struct ib_umem_odp *umem_odp) { + struct ib_device *dev = umem_odp->umem.ibdev; + /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - if (!umem_odp->is_implicit_odp) { - mutex_lock(&umem_odp->umem_mutex); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - mutex_unlock(&umem_odp->umem_mutex); - mmu_interval_notifier_remove(&umem_odp->notifier); - kvfree(umem_odp->dma_list); - kvfree(umem_odp->pfn_list); - } - put_pid(umem_odp->tgid); - kfree(umem_odp); + mutex_lock(&umem_odp->umem_mutex); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); + mmu_interval_notifier_remove(&umem_odp->notifier); + if (ib_uses_virt_dma(dev)) + kfree(umem_odp->map.pfn_list); + else + hmm_dma_map_free(dev->dma_device, &umem_odp->map); } -EXPORT_SYMBOL(ib_umem_odp_release); -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @dma_index: index in the umem to add the dma to. - * @page: the page struct to map and add. - * @access_mask: access permissions needed for this page. - * - * The function returns -EFAULT if the DMA mapping operation fails. 
- * - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - unsigned int dma_index, - struct page *page, - u64 access_mask) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - - if (*dma_addr) { - /* - * If the page is already dma mapped it means it went through - * a non-invalidating trasition, like read-only to writable. - * Resync the flags. - */ - *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; - return 0; - } + if (!umem_odp->is_implicit_odp) + ib_umem_odp_free(umem_odp); - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, *dma_addr)) { - *dma_addr = 0; - return -EFAULT; - } - umem_odp->npages++; - *dma_addr |= access_mask; - return 0; + put_pid(umem_odp->tgid); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /** * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * * Maps the range passed in the argument to DMA addresses. - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. * Upon success the ODP MR will be locked to let caller complete its device * page table update. * @@ -355,9 +333,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, struct hmm_range range = {}; unsigned long timeout; - if (access_mask == 0) - return -EINVAL; - if (user_virt < ib_umem_start(umem_odp) || user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; @@ -383,11 +358,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, if (fault) { range.default_flags = HMM_PFN_REQ_FAULT; - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & HMM_PFN_WRITE) range.default_flags |= HMM_PFN_REQ_WRITE; } - range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]); timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: @@ -415,22 +390,17 @@ retry: for (pfn_index = 0; pfn_index < num_pfns; pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - if (fault) { - /* - * Since we asked for hmm_range_fault() to populate - * pages it shouldn't return an error entry on success. - */ - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); - } else { - if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { - WARN_ON(umem_odp->dma_list[dma_index]); - continue; - } - access_mask = ODP_READ_ALLOWED_BIT; - if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) - access_mask |= ODP_WRITE_ALLOWED_BIT; - } + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. 
+ */ + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; + + if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED) + continue; hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); /* If a hugepage was detected and ODP wasn't set for, the umem @@ -443,15 +413,6 @@ retry: __func__, hmm_order, page_shift); break; } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), - access_mask); - if (ret < 0) { - ibdev_dbg(umem_odp->umem.ibdev, - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -471,45 +432,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma_addr; - dma_addr_t dma; - int idx; - u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; + u64 addr; lockdep_assert_held(&umem_odp->umem_mutex); virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { - idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - dma = umem_odp->dma_list[idx]; - - /* The access flags guaranteed a valid DMA address in case was NULL */ - if (dma) { - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; - struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); - - dma_addr = dma & ODP_DMA_ADDR_MASK; - ib_dma_unmap_page(dev, dma_addr, - BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (dma & ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - umem_odp->dma_list[idx] = 0; - umem_odp->npages--; + u64 offset = addr - ib_umem_start(umem_odp); + size_t idx = offset >> umem_odp->page_shift; + unsigned long pfn = umem_odp->map.pfn_list[idx]; + + if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) + goto clear; + + if (pfn & HMM_PFN_WRITE) { + struct page *page = hmm_pfn_to_page(pfn); + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. 
+ */ + set_page_dirty(head_page); } + umem_odp->npages--; +clear: + umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS; } } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3c3bb670c805..ce16404cdfb8 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -193,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) fd, attrs); if (IS_ERR(uobj)) - return (void *)uobj; + return ERR_CAST(uobj); uverbs_uobject_get(uobj); uobj_put_read(uobj); @@ -741,7 +741,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) } mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, + cmd.access_flags, NULL, &attrs->driver_udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); @@ -1312,9 +1312,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs, switch (cmd->qp_type) { case IB_QPT_RAW_PACKET: - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; - break; + fallthrough; case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1451,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, } if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { - if (!capable(CAP_NET_RAW)) { + if (!rdma_uattrs_has_raw_cap(attrs)) { ret = -EPERM; goto err_put; } @@ -1877,7 +1877,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mig_state = cmd->base.path_mig_state; if (cmd->base.attr_mask & IB_QP_QKEY) { if (cmd->base.qkey & IB_QP_SET_QKEY && - !rdma_nl_get_privileged_qkey()) { + !(rdma_nl_get_privileged_qkey() || + rdma_uattrs_has_raw_cap(attrs))) { ret = -EPERM; goto release_qp; } @@ -3225,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) if (cmd.comp_mask) return -EINVAL; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 432054f0a8a4..37cd37556510 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( struct ib_ucq_object *obj = container_of( uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), typeof(*obj), uevent.uobject); + struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_device *ib_dev = attrs->context->device; - int ret; - u64 user_handle; + struct ib_umem_dmabuf *umem_dmabuf; struct ib_cq_init_attr attr = {}; - struct ib_cq *cq; - struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_uobject *ev_file_uobj; + struct ib_umem *umem = NULL; + u64 buffer_length; + u64 buffer_offset; + struct ib_cq *cq; + u64 user_handle; + u64 buffer_va; + int buffer_fd; + int ret; - if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq) + if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->uevent.event_list); + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) { + + ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + 
if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + ret = PTR_ERR(umem); + goto err_event_file; + } + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) { + + ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length, + buffer_fd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem_dmabuf)) { + ret = PTR_ERR(umem_dmabuf); + goto err_event_file; + } + umem = &umem_dmabuf->umem; + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) || + !ib_dev->ops.create_cq) { + ret = -EINVAL; + goto err_event_file; + } + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); if (!cq) { ret = -ENOMEM; + ib_umem_release(umem); goto err_event_file; } @@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, attrs); + ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) : + ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; @@ -180,6 +244,17 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_ASYNC_EVENT, UVERBS_ACCESS_READ, UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c new file mode 100644 index 000000000000..453ce656c6f2 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmah.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include <rdma/uverbs_std_types.h> +#include "restrack.h" + +static int uverbs_free_dmah(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dmah *dmah = uobject->object; + int ret; + + if (atomic_read(&dmah->usecnt)) + return -EBUSY; + + ret = dmah->device->ops.dealloc_dmah(dmah, attrs); + if (ret) + return ret; + + rdma_restrack_del(&dmah->res); + kfree(dmah); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = attrs->context->device; + struct ib_dmah *dmah; + int ret; + + dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah); + if (!dmah) + return -ENOMEM; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) { + ret = uverbs_copy_from(&dmah->cpu_id, attrs, + UVERBS_ATTR_ALLOC_DMAH_CPU_ID); + if (ret) + goto err; + + if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) { + ret = -EPERM; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) { + dmah->mem_type = uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE); + dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) { + ret = uverbs_copy_from(&dmah->ph, attrs, + UVERBS_ATTR_ALLOC_DMAH_PH); + if (ret) + goto err; + + /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */ + if (dmah->ph & 0xFC) { + ret = -EINVAL; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS); + } + + dmah->device = ib_dev; + dmah->uobject = uobj; + atomic_set(&dmah->usecnt, 0); + + rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH); + rdma_restrack_set_name(&dmah->res, NULL); + + ret = ib_dev->ops.alloc_dmah(dmah, attrs); + if (ret) { + rdma_restrack_put(&dmah->res); + goto err; + } + + uobj->object = dmah; + rdma_restrack_add(&dmah->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE); + return 0; +err: + kfree(dmah); + return ret; +} + +static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = { + [TPH_MEM_TYPE_VM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [TPH_MEM_TYPE_PM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMAH_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE, + uverbs_dmah_mem_type, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH, + UVERBS_ATTR_TYPE(u8), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DMAH_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE)); + +const struct uapi_definition uverbs_def_obj_dmah[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah), + UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)), + {} +}; diff --git 
a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 7ebc7bd3caae..570b9656801d 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, - access_flags, + access_flags, NULL, attrs); if (IS_ERR(mr)) return PTR_ERR(mr); @@ -266,6 +266,135 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE); + u32 valid_access_flags = IB_ACCESS_SUPPORTED; + u64 length, iova, fd_offset = 0, addr = 0; + struct ib_device *ib_dev = pd->device; + struct ib_dmah *dmah = NULL; + bool has_fd_offset = false; + bool has_addr = false; + bool has_fd = false; + u32 access_flags; + struct ib_mr *mr; + int fd; + int ret; + + ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA); + if (ret) + return ret; + + ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) { + ret = uverbs_copy_from(&addr, attrs, + UVERBS_ATTR_REG_MR_ADDR); + if (ret) + return ret; + has_addr = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) { + ret = uverbs_copy_from(&fd_offset, attrs, + UVERBS_ATTR_REG_MR_FD_OFFSET); + if (ret) + return ret; + has_fd_offset = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) { + ret = uverbs_get_raw_fd(&fd, attrs, + UVERBS_ATTR_REG_MR_FD); + if (ret) + return ret; + has_fd = true; + } + + if (has_fd) { + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + /* FD requires offset and can't come with addr */ + if (!has_fd_offset || has_addr) + return -EINVAL; + + if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + valid_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING; + } else { + if (!has_addr || has_fd_offset) + return -EINVAL; + + if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) { + dmah = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_REG_MR_DMA_HANDLE); + if (IS_ERR(dmah)) + return PTR_ERR(dmah); + } + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + valid_access_flags); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + if (has_fd) + mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, + iova, fd, access_flags, + dmah, attrs); + else + mr = pd->device->ops.reg_user_mr(pd, addr, length, iova, + access_flags, dmah, NULL); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + if (dmah) { + mr->dmah = dmah; + atomic_inc(&dmah->usecnt); + } + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY, + 
&mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_ADVISE_MR, UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, @@ -362,6 +491,44 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + enum ib_access_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MR_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, @@ -376,7 +543,8 @@ DECLARE_UVERBS_NAMED_OBJECT( &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), - &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR)); + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index 7b4773fa4bc0..be0730e8509e 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -133,7 +133,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( device = xrcd->device; break; case IB_UVERBS_QPT_RAW_PACKET: - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; fallthrough; case IB_UVERBS_QPT_RC: diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index a02916a3a79c..e00ea63175bd 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c5e78bbefbd0..3a5f81402d2f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, GFP_KERNEL : GFP_ATOMIC); if (IS_ERR(slave)) { rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); - return (void *)slave; + return ERR_CAST(slave); } ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); rdma_lag_put_ah_roce_slave(slave); @@ 
-2223,7 +2223,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, - access_flags, NULL); + access_flags, NULL, NULL); if (IS_ERR(mr)) return mr; @@ -2262,6 +2262,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_dmah *dmah = mr->dmah; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; @@ -2272,6 +2273,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + if (dmah) + atomic_dec(&dmah->usecnt); kfree(sig_attrs); }
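
The mad_rmpp.c hunks above are part of a conversion from refcount/status bookkeeping to an explicit per-send state. A compact model of the states those hunks test and set; the enum itself is defined elsewhere in the series, and the comments are mine, so treat this as a sketch:

enum ib_mad_state {
	IB_MAD_STATE_SEND_START,	/* segments being posted or retransmitted */
	IB_MAD_STATE_WAIT_RESP,		/* all segments sent, timeout armed */
	IB_MAD_STATE_CANCELED,		/* ib_cancel_mad() won the race */
	IB_MAD_STATE_DONE,		/* final ACK seen, ready to complete */
};

/* The retransmit decision in process_rmpp_ack(), condensed: a send
 * parked in WAIT_RESP whose window was opened by the ACK resumes
 * sending and goes back to SEND_START; a send already marked DONE is
 * the only one completed here with IB_WC_SUCCESS. */
if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP &&
    mad_send_wr->seg_num < mad_send_wr->newwin &&
    mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
	if (!send_next_seg(mad_send_wr))
		change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
}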
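
rdma_uattrs_has_raw_cap() above resolves CAP_NET_RAW against the user namespace that owns the rdma device (via the ucontext and rdma_dev_has_raw_cap()) rather than against the caller's init-namespace capability. The uverbs_cmd.c, uverbs_std_types_qp.c and ex_create_flow hunks all switch to it, and rdma_nl_get_privileged_qkey() can drop its own capable() fallback because the QKEY path now ORs the two checks explicitly. The calling pattern, with a hypothetical handler name:

static int sample_raw_qp_handler(struct uverbs_attr_bundle *attrs)
{
	/* was: if (!capable(CAP_NET_RAW)) return -EPERM; */
	if (!rdma_uattrs_has_raw_cap(attrs))
		return -EPERM;

	/* ... RAW_PACKET QP creation continues here ... */
	return 0;
}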
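
The one-character ucaps.c change is a format-string hardening fix of the kind -Wformat-security flags: ucap_names[type] was passed as the format itself. The names are fixed strings today, so this is preventive; the safe idiom is simply:

	ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]);
	/* never dev_set_name(&ucap->dev, ucap_names[type]): a '%' that
	 * ever lands in a name would be parsed as a conversion spec */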
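
The CONFIG_LOCKDEP block added to ib_init_umem_odp() is a reusable priming idiom: umem_mutex can be taken from the mmu notifier in reclaim context, which is rare to hit in testing, so the init path records the reclaim -> umem_mutex ordering once, deterministically. Generalized, with my_lock as a stand-in for any lock that may be taken under reclaim:

	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);	/* enter a pretend reclaim context */
		mutex_lock(&my_lock);		/* lockdep records reclaim -> my_lock */
		mutex_unlock(&my_lock);
		fs_reclaim_release(GFP_KERNEL);
	}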
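
After the hmm-dma conversion, umem_odp carries a single map.pfn_list (one entry per PAGE_SIZE page) instead of the parallel pfn_list/dma_list pair, and the private ODP_READ/WRITE_ALLOWED and ODP_DMA_ADDR_MASK encoding gives way to the generic HMM_PFN_* flag bits. The unmap hunk condensed into a per-entry helper, as a restatement for clarity rather than a function the patch adds:

static void odp_teardown_entry(struct ib_umem_odp *umem_odp, size_t idx)
{
	unsigned long pfn = umem_odp->map.pfn_list[idx];
	struct ib_device *dev = umem_odp->umem.ibdev;

	/* returns false when the entry had no DMA mapping to release */
	if (hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) {
		if (pfn & HMM_PFN_WRITE)
			set_page_dirty(compound_head(hmm_pfn_to_page(pfn)));
		umem_odp->npages--;
	}
	/* keep the pfn, clear VALID/WRITE/DMA_MAPPED for the next fault */
	umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}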
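
Two (void *) casts of error pointers, in _ib_uverbs_lookup_comp_file() and rdma_create_ah(), become ERR_CAST(), the helper that exists precisely for propagating an ERR_PTR under a different pointer type:

	if (IS_ERR(uobj))
		return ERR_CAST(uobj);	/* was: return (void *)uobj; */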
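
The extended UVERBS_METHOD_CQ_CREATE accepts a caller-provided CQ buffer in two forms, and the handler enforces that exactly one description is given. The accepted attribute combinations, as the checks above implement them:

  attributes supplied                          buffer source                 driver op required
  BUFFER_VA + BUFFER_LENGTH                    ib_umem_get()                 create_cq_umem
  BUFFER_FD + BUFFER_OFFSET + BUFFER_LENGTH    ib_umem_dmabuf_get_pinned()   create_cq_umem
  no buffer attributes                         kernel-allocated CQ buffer    create_cq
  any other combination                        rejected with -EINVAL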
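
The new DMAH object hands userspace a DMA handle carrying TLP Processing Hints state: an optional steering CPU (validated by the core against the caller's affinity mask), a TPH memory type, and a PH value masked to its two architected bits. A driver opts in by providing the alloc/dealloc pair; the sketch below is hypothetical (the exdrv_* names and the "PH required" policy are mine), showing only the contract the core expects:

static int exdrv_alloc_dmah(struct ib_dmah *dmah,
			    struct uverbs_attr_bundle *attrs)
{
	/* dmah->valid_fields says which hints userspace supplied */
	if (!(dmah->valid_fields & BIT(IB_DMAH_PH_EXISTS)))
		return -EOPNOTSUPP;	/* this sample device needs a PH */

	/* program cpu_id / mem_type / ph into device steering state here */
	return 0;
}

static int exdrv_dealloc_dmah(struct ib_dmah *dmah,
			      struct uverbs_attr_bundle *attrs)
{
	/* the core guarantees usecnt == 0 before calling */
	return 0;
}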
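
UVERBS_METHOD_REG_MR merges classic VA-based and dmabuf-based registration into one ioctl-path method, threading the optional DMA handle through to reg_user_mr()/reg_user_mr_dmabuf(). The attribute rules the handler enforces:

  ADDR given, no FD/FD_OFFSET   ->  reg_user_mr(); addr and iova must share the in-page offset
  FD + FD_OFFSET, no ADDR       ->  reg_user_mr_dmabuf(); fd_offset and iova must share the
                                    in-page offset, and access flags are limited to LOCAL_WRITE,
                                    REMOTE_READ, REMOTE_WRITE, REMOTE_ATOMIC, RELAXED_ORDERING
  anything else                 ->  -EINVAL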