Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--  drivers/infiniband/core/Makefile                 |   1
-rw-r--r--  drivers/infiniband/core/cache.c                  |   4
-rw-r--r--  drivers/infiniband/core/cm.c                     | 125
-rw-r--r--  drivers/infiniband/core/cm_trace.h               |   2
-rw-r--r--  drivers/infiniband/core/cma.c                    |  29
-rw-r--r--  drivers/infiniband/core/cma_trace.h              |   2
-rw-r--r--  drivers/infiniband/core/counters.c               |   2
-rw-r--r--  drivers/infiniband/core/cq.c                     |  12
-rw-r--r--  drivers/infiniband/core/device.c                 |  53
-rw-r--r--  drivers/infiniband/core/iwcm.c                   |  29
-rw-r--r--  drivers/infiniband/core/mad.c                    | 468
-rw-r--r--  drivers/infiniband/core/mad_priv.h               |  76
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c               |  43
-rw-r--r--  drivers/infiniband/core/nldev.c                  |  24
-rw-r--r--  drivers/infiniband/core/rdma_core.c              |  29
-rw-r--r--  drivers/infiniband/core/rdma_core.h              |   1
-rw-r--r--  drivers/infiniband/core/restrack.c               |   2
-rw-r--r--  drivers/infiniband/core/ucaps.c                  |   2
-rw-r--r--  drivers/infiniband/core/umem_odp.c               | 280
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c             |  15
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_cq.c    |  87
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_dmah.c  | 145
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_mr.c    | 172
-rw-r--r--  drivers/infiniband/core/uverbs_std_types_qp.c    |   2
-rw-r--r--  drivers/infiniband/core/uverbs_uapi.c            |   1
-rw-r--r--  drivers/infiniband/core/verbs.c                  |   7
26 files changed, 1184 insertions, 429 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d49ded7e95f0..f483e0c12444 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -33,6 +33,7 @@ ib_umad-y := user_mad.o
ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
uverbs_std_types_cq.o \
+ uverbs_std_types_dmah.o \
uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
uverbs_std_types_mr.o uverbs_std_types_counters.o \
uverbs_uapi.o uverbs_std_types_device.o \
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9979a351577f..81cf3c902e81 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -582,8 +582,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
out_unlock:
mutex_unlock(&table->lock);
if (ret)
- pr_warn("%s: unable to add gid %pI6 error=%d\n",
- __func__, gid->raw, ret);
+ pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
return ret;
}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 142170473e75..92678e438ff4 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -36,6 +36,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
+#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
@@ -160,6 +161,7 @@ struct cm_counter_attribute {
struct cm_port {
struct cm_device *cm_dev;
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
u32 port_num;
atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT];
};
@@ -167,7 +169,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@@ -241,7 +243,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
- u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@@ -274,7 +275,8 @@ static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
complete(&cm_id_priv->comp);
}
-static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+static struct ib_mad_send_buf *
+cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent)
{
struct ib_mad_agent *mad_agent;
struct ib_mad_send_buf *m;
@@ -285,8 +287,9 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
- mad_agent = cm_id_priv->av.port->mad_agent;
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ mad_agent = rep_agent ? cm_id_priv->av.port->rep_agent :
+ cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
goto out;
@@ -311,10 +314,15 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
m->ah = ah;
out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
+static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+{
+ return cm_alloc_msg_agent(cm_id_priv, false);
+}
+
static void cm_free_msg(struct ib_mad_send_buf *msg)
{
if (msg->ah)
@@ -323,13 +331,14 @@ static void cm_free_msg(struct ib_mad_send_buf *msg)
}
static struct ib_mad_send_buf *
-cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
+cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state,
+ bool rep_agent)
{
struct ib_mad_send_buf *msg;
lockdep_assert_held(&cm_id_priv->lock);
- msg = cm_alloc_msg(cm_id_priv);
+ msg = cm_alloc_msg_agent(cm_id_priv, rep_agent);
if (IS_ERR(msg))
return msg;
@@ -344,6 +353,12 @@ cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
return msg;
}
+static struct ib_mad_send_buf *
+cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state)
+{
+ return cm_alloc_priv_msg_rep(cm_id_priv, state, false);
+}
+
static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
{
struct cm_id_private *cm_id_priv = msg->context[0];
@@ -1297,10 +1312,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@@ -1872,7 +1887,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
- enum cm_msg_response msg_mraed, u8 service_timeout,
+ enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@@ -1881,7 +1896,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
- IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@@ -1960,7 +1975,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@@ -2295,7 +2310,7 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
goto out;
}
- msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REP_SENT);
+ msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto out;
@@ -2454,7 +2469,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@@ -3094,26 +3109,13 @@ out:
return -EINVAL;
}
-int ib_send_cm_mra(struct ib_cm_id *cm_id,
- u8 service_timeout,
- const void *private_data,
- u8 private_data_len)
+int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
- struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
- enum cm_msg_response msg_response;
- void *data;
unsigned long flags;
- int ret;
-
- if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
- return -EINVAL;
-
- data = cm_copy_private_data(private_data, private_data_len);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3122,58 +3124,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
- msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
- msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
- trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
- if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
- msg = cm_alloc_msg(cm_id_priv);
- if (IS_ERR(msg)) {
- ret = PTR_ERR(msg);
- goto error_unlock;
- }
-
- cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
- msg_response, service_timeout,
- private_data, private_data_len);
- trace_icm_send_mra(cm_id);
- ret = ib_post_send_mad(msg, NULL);
- if (ret)
- goto error_free_msg;
- }
-
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
- cm_id_priv->service_timeout = service_timeout;
- cm_set_private_data(cm_id_priv, data, private_data_len);
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return 0;
+ cm_set_private_data(cm_id_priv, NULL, 0);
-error_free_msg:
- cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- kfree(data);
return ret;
}
-EXPORT_SYMBOL(ib_send_cm_mra);
+EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@@ -3377,7 +3354,6 @@ static int cm_lap_handler(struct cm_work *work)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
- cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
@@ -3786,7 +3762,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv,
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
- cm_free_priv_msg(msg);
+ cm_free_msg(msg);
+ cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@@ -4378,7 +4355,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@@ -4418,9 +4395,22 @@ static int cm_add_one(struct ib_device *ib_device)
goto error2;
}
+ port->rep_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ NULL,
+ 0,
+ cm_send_handler,
+ NULL,
+ port,
+ 0);
+ if (IS_ERR(port->rep_agent)) {
+ ret = PTR_ERR(port->rep_agent);
+ goto error3;
+ }
+
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
- goto error3;
+ goto error4;
count++;
}
@@ -4435,6 +4425,8 @@ static int cm_add_one(struct ib_device *ib_device)
write_unlock_irqrestore(&cm.device_lock, flags);
return 0;
+error4:
+ ib_unregister_mad_agent(port->rep_agent);
error3:
ib_unregister_mad_agent(port->mad_agent);
error2:
@@ -4448,6 +4440,7 @@ error1:
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->rep_agent);
ib_unregister_mad_agent(port->mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
@@ -4477,12 +4470,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
rdma_for_each_port (ib_device, i) {
struct ib_mad_agent *mad_agent;
+ struct ib_mad_agent *rep_agent;
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
mad_agent = port->mad_agent;
+ rep_agent = port->rep_agent;
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
/*
* We flush the queue here after the going_down set, this
@@ -4494,10 +4489,12 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ port->rep_agent = NULL;
+ write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
+ ib_unregister_mad_agent(rep_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);
}
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
index 944d9071245d..4a4987da69d4 100644
--- a/drivers/infiniband/core/cm_trace.h
+++ b/drivers/infiniband/core/cm_trace.h
@@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
-DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index fedcdb56fb6b..9b471548e7ae 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
-#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
@@ -72,6 +71,8 @@ static const char * const cma_events[] = {
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
enum ib_gid_type gid_type);
+static void cma_netevent_work_handler(struct work_struct *_work);
+
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
size_t index = event;
@@ -144,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
-/**
- * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
- * @res: rdma resource tracking entry pointer
- */
-struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
-{
- struct rdma_id_private *id_priv =
- container_of(res, struct rdma_id_private, res);
-
- return &id_priv->id;
-}
-EXPORT_SYMBOL(rdma_res_to_id);
-
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@@ -1047,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
id_priv->id.route.addr.dev_addr.net = get_net(net);
id_priv->seq_num &= 0x00ffffff;
+ INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler);
rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
if (parent)
@@ -2211,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
- trace_cm_send_mra(id_priv);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(id_priv);
+ ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@@ -2473,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
- trace_cm_send_mra(cm_id->context);
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ trace_cm_prepare_mra(cm_id->context);
+ ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@@ -5241,9 +5230,9 @@ static int cma_netevent_callback(struct notifier_block *self,
if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
neigh->ha, ETH_ALEN))
continue;
- INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
cma_id_get(current_id);
- queue_work(cma_wq, &current_id->id.net_work);
+ if (!queue_work(cma_wq, &current_id->id.net_work))
+ cma_id_put(current_id);
}
out:
spin_unlock_irqrestore(&id_table_lock, flags);
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index dc622f3778be..3456d5f3aa47 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
-DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index e6ec7b7a40af..c3aa6d7fc66b 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -461,7 +461,7 @@ static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
return NULL;
qp = container_of(res, struct ib_qp, res);
- if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev))
goto err;
return qp;
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a70876a0a231..584537c71545 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -317,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any);
*/
void ib_free_cq(struct ib_cq *cq)
{
- int ret;
+ int ret = 0;
if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
return;
if (WARN_ON_ONCE(cq->cqe_used))
return;
+ if (cq->device->ops.pre_destroy_cq) {
+ ret = cq->device->ops.pre_destroy_cq(cq);
+ WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail");
+ }
+
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
break;
@@ -340,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq)
rdma_dim_destroy(cq);
trace_cq_free(cq);
- ret = cq->device->ops.destroy_cq(cq, NULL);
+ if (cq->device->ops.post_destroy_cq)
+ cq->device->ops.post_destroy_cq(cq);
+ else
+ ret = cq->device->ops.destroy_cq(cq, NULL);
WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
rdma_restrack_del(&cq->res);
kfree(cq->wc);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4e3e4beb7f4..3145cb34a1d2 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
}
EXPORT_SYMBOL(rdma_dev_access_netns);
+/**
+ * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has
+ * CAP_NET_RAW capability or not.
+ *
+ * @dev: Pointer to rdma device whose capability to be checked
+ *
+ * Returns true if a rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false. When rdma subsystem is in legacy shared network,
+ * namespace mode, the default net namespace is considered.
+ */
+bool rdma_dev_has_raw_cap(const struct ib_device *dev)
+{
+ const struct net *net;
+
+ /* Network namespace is the resource whose user namespace
+ * to be considered. When in shared mode, there is no reliable
+ * network namespace resource, so consider the default net namespace.
+ */
+ if (ib_devices_shared_netns)
+ net = &init_net;
+ else
+ net = read_pnet(&dev->coredev.rdma_net);
+
+ return ns_capable(net->user_ns, CAP_NET_RAW);
+}
+EXPORT_SYMBOL(rdma_dev_has_raw_cap);
+
/*
* xarray has this behavior where it won't iterate over NULL values stored in
* allocated arrays. So we need our own iterator to see all values stored in
@@ -557,6 +584,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
/**
* _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
+ * @net: network namespace device should be located in, namespace
+ * must stay valid until ib_register_device() is completed.
*
* Low-level drivers should use ib_alloc_device() to allocate &struct
* ib_device. @size is the size of the structure to be allocated,
@@ -564,7 +593,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
* ib_dealloc_device() must be used to free structures allocated with
* ib_alloc_device().
*/
-struct ib_device *_ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size, struct net *net)
{
struct ib_device *device;
unsigned int i;
@@ -581,7 +610,15 @@ struct ib_device *_ib_alloc_device(size_t size)
return NULL;
}
- rdma_init_coredev(&device->coredev, device, &init_net);
+ /* ib_devices_shared_netns can't change while we have active namespaces
+ * in the system which means either init_net is passed or the user has
+ * no idea what they are doing.
+ *
+ * To avoid breaking backward compatibility, when in shared mode,
+ * force to init the device in the init_net.
+ */
+ net = ib_devices_shared_netns ? &init_net : net;
+ rdma_init_coredev(&device->coredev, device, net);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->qp_open_list_lock);
@@ -1352,6 +1389,9 @@ static void ib_device_notify_register(struct ib_device *device)
down_read(&devices_rwsem);
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
if (ret)
goto out;
@@ -1468,10 +1508,9 @@ int ib_register_device(struct ib_device *device, const char *name,
return ret;
}
dev_set_uevent_suppress(&device->dev, false);
- /* Mark for userspace that device is ready */
- kobject_uevent(&device->dev.kobj, KOBJ_ADD);
ib_device_notify_register(device);
+
ib_device_put(device);
return 0;
@@ -2669,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, add_sub_dev);
SET_DEVICE_OP(dev_ops, advise_mr);
SET_DEVICE_OP(dev_ops, alloc_dm);
+ SET_DEVICE_OP(dev_ops, alloc_dmah);
SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
SET_DEVICE_OP(dev_ops, alloc_mr);
@@ -2689,6 +2729,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
+ SET_DEVICE_OP(dev_ops, create_cq_umem);
SET_DEVICE_OP(dev_ops, create_flow);
SET_DEVICE_OP(dev_ops, create_qp);
SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
@@ -2696,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, create_user_ah);
SET_DEVICE_OP(dev_ops, create_wq);
SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_dmah);
SET_DEVICE_OP(dev_ops, dealloc_driver);
SET_DEVICE_OP(dev_ops, dealloc_mw);
SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -2761,8 +2803,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, modify_srq);
SET_DEVICE_OP(dev_ops, modify_wq);
SET_DEVICE_OP(dev_ops, peek_cq);
+ SET_DEVICE_OP(dev_ops, pre_destroy_cq);
SET_DEVICE_OP(dev_ops, poll_cq);
SET_DEVICE_OP(dev_ops, port_groups);
+ SET_DEVICE_OP(dev_ops, post_destroy_cq);
SET_DEVICE_OP(dev_ops, post_recv);
SET_DEVICE_OP(dev_ops, post_send);
SET_DEVICE_OP(dev_ops, post_srq_recv);
@@ -2791,6 +2835,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_OBJ_SIZE(dev_ops, ib_ah);
SET_OBJ_SIZE(dev_ops, ib_counters);
SET_OBJ_SIZE(dev_ops, ib_cq);
+ SET_OBJ_SIZE(dev_ops, ib_dmah);
SET_OBJ_SIZE(dev_ops, ib_mw);
SET_OBJ_SIZE(dev_ops, ib_pd);
SET_OBJ_SIZE(dev_ops, ib_qp);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index f4486cbd8f45..62410578dec3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -368,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
- * Clean up all resources associated with the connection and release
- * the initial reference taken by iw_create_cm_id.
- *
- * Returns true if and only if the last cm_id_priv reference has been dropped.
+ * Clean up all resources associated with the connection.
*/
-static bool destroy_cm_id(struct iw_cm_id *cm_id)
+static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
struct ib_qp *qp;
@@ -442,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
-
- return iwcm_deref_id(cm_id_priv);
}
/*
- * This function is only called by the application thread and cannot
- * be called by the event thread. The function will wait for all
- * references to be released on the cm_id and then kfree the cm_id
- * object.
+ * Destroy cm_id. If the cm_id still has other references, wait for all
+ * references to be released on the cm_id and then release the initial
+ * reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
- if (!destroy_cm_id(cm_id))
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ destroy_cm_id(cm_id);
+ if (refcount_read(&cm_id_priv->refcount) > 1)
flush_workqueue(iwcm_wq);
+ iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -1035,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work)
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
- if (ret)
- WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
+ if (ret) {
+ destroy_cm_id(&cm_id_priv->id);
+ WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
+ }
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 73f3a0b9a54b..8f26bfb69586 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -210,6 +210,29 @@ int ib_response_mad(const struct ib_mad_hdr *hdr)
}
EXPORT_SYMBOL(ib_response_mad);
+#define SOL_FC_MAX_DEFAULT_FRAC 4
+#define SOL_FC_MAX_SA_FRAC 32
+
+static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req)
+{
+ if (!mad_reg_req)
+ /* Send only agent */
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+
+ switch (mad_reg_req->mgmt_class) {
+ case IB_MGMT_CLASS_CM:
+ return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ return mad_recvq_size / SOL_FC_MAX_SA_FRAC;
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) /
+ SOL_FC_MAX_DEFAULT_FRAC;
+ default:
+ return 0;
+ }
+}
+
/*
* ib_register_mad_agent - Register to send/receive MADs
*
@@ -391,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
spin_lock_init(&mad_agent_priv->lock);
INIT_LIST_HEAD(&mad_agent_priv->send_list);
INIT_LIST_HEAD(&mad_agent_priv->wait_list);
- INIT_LIST_HEAD(&mad_agent_priv->done_list);
INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_LIST_HEAD(&mad_agent_priv->backlog_list);
INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
INIT_LIST_HEAD(&mad_agent_priv->local_list);
INIT_WORK(&mad_agent_priv->local_work, local_completions);
refcount_set(&mad_agent_priv->refcount, 1);
init_completion(&mad_agent_priv->comp);
+ mad_agent_priv->sol_fc_send_count = 0;
+ mad_agent_priv->sol_fc_wait_count = 0;
+ mad_agent_priv->sol_fc_max =
+ recv_handler ? get_sol_fc_max_outstanding(mad_reg_req) : 0;
ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
if (ret2) {
@@ -1055,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
return ret;
}
+static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) {
+ mad_agent_priv->sol_fc_wait_count--;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ } else {
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->backlog_list);
+ }
+}
+
+static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->state == IB_MAD_STATE_INIT) {
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ } else {
+ expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP,
+ IB_MAD_STATE_QUEUED);
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ }
+
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ mad_agent_priv->sol_fc_send_count++;
+ }
+}
+
+static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START,
+ IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED);
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->is_solicited_fc) {
+ mad_agent_priv->sol_fc_send_count--;
+ mad_agent_priv->sol_fc_wait_count++;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item,
+ &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(
+ list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else {
+ list_item = &mad_agent_priv->wait_list;
+ }
+
+ list_add(&mad_send_wr->agent_list, list_item);
+}
+
+static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc;
+}
+
+static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+}
+
+static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (mad_send_wr->is_solicited_fc) {
+ if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+ mad_agent_priv->sol_fc_send_count--;
+ else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+ mad_agent_priv->sol_fc_wait_count--;
+ }
+
+ list_del_init(&mad_send_wr->agent_list);
+}
+
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ switch (new_state) {
+ case IB_MAD_STATE_INIT:
+ break;
+ case IB_MAD_STATE_QUEUED:
+ handle_queued_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_SEND_START:
+ handle_send_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_WAIT_RESP:
+ handle_wait_state(mad_send_wr, mad_agent_priv);
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ return;
+ break;
+ case IB_MAD_STATE_EARLY_RESP:
+ handle_early_resp_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_CANCELED:
+ handle_canceled_state(mad_send_wr, mad_agent_priv);
+ break;
+ case IB_MAD_STATE_DONE:
+ handle_done_state(mad_send_wr, mad_agent_priv);
+ break;
+ }
+
+ mad_send_wr->state = new_state;
+}
+
+static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ u8 mgmt_class;
+
+ if (!mad_send_wr->timeout)
+ return 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (mad_send_wr->mad_agent_priv->agent.rmpp_version &&
+ (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return 0;
+
+ mgmt_class =
+ ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class;
+ return mgmt_class == IB_MGMT_CLASS_CM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_ADM ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+}
+
+static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ mad_send_wr->mad_agent_priv;
+
+ if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max)
+ return false;
+
+ if (!list_empty(&mad_agent_priv->backlog_list))
+ return true;
+
+ return mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count >=
+ mad_agent_priv->sol_fc_max;
+}
+
/*
* ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
* with the registered client
@@ -1080,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret)
goto error;
- if (!send_buf->mad_agent->send_handler ||
- (send_buf->timeout_ms &&
- !send_buf->mad_agent->recv_handler)) {
+ if (!send_buf->mad_agent->send_handler) {
ret = -EINVAL;
goto error;
}
@@ -1118,15 +1317,19 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
send_buf->retries = 0;
- /* Reference for work request to QP + response */
- mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
- mad_send_wr->status = IB_WC_SUCCESS;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
/* Reference MAD agent until send completes */
refcount_inc(&mad_agent_priv->refcount);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_add_tail(&mad_send_wr->agent_list,
- &mad_agent_priv->send_list);
+ mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr);
+ if (mad_is_for_backlog(mad_send_wr)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+ }
+
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
@@ -1138,7 +1341,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
if (ret < 0) {
/* Fail send request */
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_del(&mad_send_wr->agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
deref_mad_agent(mad_agent_priv);
goto error;
@@ -1746,7 +1949,19 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
*/
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
+ }
+
+ list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) {
+ if ((wr->tid == mad_hdr->tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
/*
@@ -1765,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
(is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
/* Verify request has not been canceled */
- return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
}
return NULL;
}
+static void
+process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc = {};
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->backlog_list) &&
+ (mad_agent_priv->sol_fc_send_count +
+ mad_agent_priv->sol_fc_wait_count <
+ mad_agent_priv->sol_fc_max)) {
+ mad_send_wr = list_entry(mad_agent_priv->backlog_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ret = ib_send_mad(mad_send_wr);
+ if (ret) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ deref_mad_agent(mad_agent_priv);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = IB_WC_LOC_QP_OP_ERR;
+ mad_agent_priv->agent.send_handler(
+ &mad_agent_priv->agent, &mad_send_wc);
+ }
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
{
mad_send_wr->timeout = 0;
- if (mad_send_wr->refcount == 1)
- list_move_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->done_list);
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP ||
+ mad_send_wr->state == IB_MAD_STATE_QUEUED)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ else
+ change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP);
}
static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
@@ -1784,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_send_wc mad_send_wc;
unsigned long flags;
+ bool is_mad_done;
int ret;
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
@@ -1832,6 +2086,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
} else {
ib_mark_mad_done(mad_send_wr);
+ is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
/* Defined behavior is to complete response before request */
@@ -1841,10 +2096,13 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
mad_recv_wc);
deref_mad_agent(mad_agent_priv);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ if (is_mad_done) {
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr,
+ &mad_send_wc);
+ }
}
} else {
mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
@@ -2172,30 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wr_private *temp_mad_send_wr;
- struct list_head *list_item;
unsigned long delay;
mad_agent_priv = mad_send_wr->mad_agent_priv;
- list_del(&mad_send_wr->agent_list);
-
delay = mad_send_wr->timeout;
- mad_send_wr->timeout += jiffies;
-
- if (delay) {
- list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
- temp_mad_send_wr = list_entry(list_item,
- struct ib_mad_send_wr_private,
- agent_list);
- if (time_after(mad_send_wr->timeout,
- temp_mad_send_wr->timeout))
- break;
- }
- } else {
- list_item = &mad_agent_priv->wait_list;
- }
-
- list_add(&mad_send_wr->agent_list, list_item);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP);
/* Reschedule a work item if we have a shorter timeout */
if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
@@ -2229,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
} else
ret = IB_RMPP_RESULT_UNHANDLED;
- if (mad_send_wc->status != IB_WC_SUCCESS &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = mad_send_wc->status;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
-
- if (--mad_send_wr->refcount > 0) {
- if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
- mad_send_wr->status == IB_WC_SUCCESS) {
- wait_for_response(mad_send_wr);
- }
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ mad_send_wc->status = IB_WC_WR_FLUSH_ERR;
+ else if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+ mad_send_wr->timeout) {
+ wait_for_response(mad_send_wr);
goto done;
}
/* Remove send from MAD agent and notify client of completion */
- list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->state != IB_MAD_STATE_DONE)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
adjust_timeout(mad_agent_priv);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- if (mad_send_wr->status != IB_WC_SUCCESS)
- mad_send_wc->status = mad_send_wr->status;
- if (ret == IB_RMPP_RESULT_INTERNAL)
+ if (ret == IB_RMPP_RESULT_INTERNAL) {
ib_rmpp_send_handler(mad_send_wc);
- else
+ } else {
+ if (mad_send_wr->is_solicited_fc)
+ process_backlog_mads(mad_agent_priv);
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
mad_send_wc);
+ }
/* Release reference on agent taken when sending */
deref_mad_agent(mad_agent_priv);
@@ -2396,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
return true;
}
+static void clear_mad_error_list(struct list_head *list,
+ enum ib_wc_status wc_status,
+ struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *n;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_send_wc.status = wc_status;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, n, list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
{
unsigned long flags;
struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
- struct ib_mad_send_wc mad_send_wc;
struct list_head cancel_list;
INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &mad_agent_priv->send_list, agent_list) {
- if (mad_send_wr->status == IB_WC_SUCCESS) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
- }
+ &mad_agent_priv->send_list, agent_list)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
- /* Empty wait list to prevent receives from finding a request */
- list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
- /* Report all cancelled requests */
- mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
- mad_send_wc.vendor_err = 0;
+ /* Empty wait & backlog list to prevent receives from finding request */
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->wait_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
+ }
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
- &cancel_list, agent_list) {
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- list_del(&mad_send_wr->agent_list);
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- deref_mad_agent(mad_agent_priv);
+ &mad_agent_priv->backlog_list, agent_list) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, &cancel_list);
}
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ /* Report all cancelled requests */
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
static struct ib_mad_send_wr_private*
@@ -2451,6 +2699,13 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
&mad_send_wr->send_buf == send_buf)
return mad_send_wr;
}
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
return NULL;
}
@@ -2468,16 +2723,16 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
struct ib_mad_agent_private, agent);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
- if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return -EINVAL;
}
- active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
- if (!timeout_ms) {
- mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
- mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
- }
+ active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) ||
+ (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) ||
+ (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms));
+ if (!timeout_ms)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
mad_send_wr->send_buf.timeout_ms = timeout_ms;
if (active)
@@ -2589,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->send_buf.retries++;
mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ if (mad_send_wr->is_solicited_fc &&
+ !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) {
+ change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+ return 0;
+ }
if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
ret = ib_retry_rmpp(mad_send_wr);
@@ -2606,26 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
} else
ret = ib_send_mad(mad_send_wr);
- if (!ret) {
- mad_send_wr->refcount++;
- list_add_tail(&mad_send_wr->agent_list,
- &mad_send_wr->mad_agent_priv->send_list);
- }
+ if (!ret)
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+
return ret;
}
static void timeout_sends(struct work_struct *work)
{
- struct ib_mad_send_wr_private *mad_send_wr, *n;
+ struct ib_mad_send_wr_private *mad_send_wr;
struct ib_mad_agent_private *mad_agent_priv;
- struct ib_mad_send_wc mad_send_wc;
- struct list_head local_list;
+ struct list_head timeout_list;
+ struct list_head cancel_list;
+ struct list_head *list_item;
unsigned long flags, delay;
mad_agent_priv = container_of(work, struct ib_mad_agent_private,
timed_work.work);
- mad_send_wc.vendor_err = 0;
- INIT_LIST_HEAD(&local_list);
+ INIT_LIST_HEAD(&timeout_list);
+ INIT_LIST_HEAD(&cancel_list);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->wait_list)) {
@@ -2643,25 +2902,22 @@ static void timeout_sends(struct work_struct *work)
break;
}
- list_del_init(&mad_send_wr->agent_list);
- if (mad_send_wr->status == IB_WC_SUCCESS &&
- !retry_send(mad_send_wr))
+ if (mad_send_wr->state == IB_MAD_STATE_CANCELED)
+ list_item = &cancel_list;
+ else if (retry_send(mad_send_wr))
+ list_item = &timeout_list;
+ else
continue;
- list_add_tail(&mad_send_wr->agent_list, &local_list);
+ change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+ list_add_tail(&mad_send_wr->agent_list, list_item);
}
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) {
- if (mad_send_wr->status == IB_WC_SUCCESS)
- mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
- else
- mad_send_wc.status = mad_send_wr->status;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- deref_mad_agent(mad_agent_priv);
- }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ process_backlog_mads(mad_agent_priv);
+ clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR,
+ mad_agent_priv);
+ clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
}
/*
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 1b7445a6f671..f444357d33f4 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -95,13 +95,16 @@ struct ib_mad_agent_private {
spinlock_t lock;
struct list_head send_list;
+ unsigned int sol_fc_send_count;
struct list_head wait_list;
- struct list_head done_list;
+ unsigned int sol_fc_wait_count;
struct delayed_work timed_work;
unsigned long timeout;
struct list_head local_list;
struct work_struct local_work;
struct list_head rmpp_list;
+ unsigned int sol_fc_max;
+ struct list_head backlog_list;
refcount_t refcount;
union {
@@ -118,6 +121,32 @@ struct ib_mad_snoop_private {
struct completion comp;
};
+enum ib_mad_state {
+ /* MAD is in the making and is not yet in any list */
+ IB_MAD_STATE_INIT,
+ /* MAD is in backlog list */
+ IB_MAD_STATE_QUEUED,
+ /*
+ * MAD was sent to the QP and is waiting for completion
+ * notification in send list.
+ */
+ IB_MAD_STATE_SEND_START,
+ /*
+ * MAD send completed successfully, waiting for a response
+ * in wait list.
+ */
+ IB_MAD_STATE_WAIT_RESP,
+ /*
+ * Response came early, before send completion notification,
+ * in send list.
+ */
+ IB_MAD_STATE_EARLY_RESP,
+ /* MAD was canceled while in wait or send list */
+ IB_MAD_STATE_CANCELED,
+ /* MAD processing completed, MAD in no list */
+ IB_MAD_STATE_DONE
+};
+
struct ib_mad_send_wr_private {
struct ib_mad_list_head mad_list;
struct list_head agent_list;
@@ -132,8 +161,6 @@ struct ib_mad_send_wr_private {
int max_retries;
int retries_left;
int retry;
- int refcount;
- enum ib_wc_status status;
/* RMPP control */
struct list_head rmpp_list;
@@ -143,8 +170,48 @@ struct ib_mad_send_wr_private {
int seg_num;
int newwin;
int pad;
+
+ enum ib_mad_state state;
+
+ /* Solicited MAD flow control */
+ bool is_solicited_fc;
};
+static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state);
+}
+
+static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2);
+}
+
+static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state expected_state1,
+ enum ib_mad_state expected_state2,
+ enum ib_mad_state expected_state3)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state != expected_state1 &&
+ mad_send_wr->state != expected_state2 &&
+ mad_send_wr->state != expected_state3);
+}
+
+static inline void
+not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state wrong_state)
+{
+ if (IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON(mad_send_wr->state == wrong_state);
+}
+
struct ib_mad_local_private {
struct list_head completion_list;
struct ib_mad_private *mad_priv;
@@ -222,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
unsigned long timeout_ms);
+void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
+ enum ib_mad_state new_state);
+
#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 8af0619a39cd..1c5e0eaf1c94 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
- return (void *) ah;
+ return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
@@ -608,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent,
goto out; /* Unmatched send */
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_REM_ABORT_ERR;
- wc.vendor_err = rmpp_status;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
out:
spin_unlock_irqrestore(&agent->lock, flags);
@@ -684,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
}
if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
- (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ (!mad_send_wr->timeout) ||
+ (mad_send_wr->state == IB_MAD_STATE_CANCELED))
goto out; /* Send is already done */
if (seg_num > mad_send_wr->send_buf.seg_count ||
@@ -709,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
struct ib_mad_send_wc wc;
ib_mark_mad_done(mad_send_wr);
+ if (mad_send_wr->state == IB_MAD_STATE_DONE) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
spin_unlock_irqrestore(&agent->lock, flags);
-
- wc.status = IB_WC_SUCCESS;
- wc.vendor_err = 0;
- wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &wc);
return;
}
- if (mad_send_wr->refcount == 1)
+ if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
ib_reset_mad_timeout(mad_send_wr,
mad_send_wr->send_buf.timeout_ms);
spin_unlock_irqrestore(&agent->lock, flags);
ack_ds_ack(agent, mad_recv_wc);
return;
- } else if (mad_send_wr->refcount == 1 &&
+ } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP &&
mad_send_wr->seg_num < mad_send_wr->newwin &&
mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
/* Send failure will just result in a timeout/retry */
@@ -731,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
if (ret)
goto out;
- mad_send_wr->refcount++;
+ change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
list_move_tail(&mad_send_wr->agent_list,
&mad_send_wr->mad_agent_priv->send_list);
}
@@ -890,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->newwin = init_newwin(mad_send_wr);
/* We need to wait for the final ACK even if there isn't a response */
- mad_send_wr->refcount += (mad_send_wr->timeout == 0);
ret = send_next_seg(mad_send_wr);
if (!ret)
return IB_RMPP_RESULT_CONSUMED;
@@ -912,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
if (mad_send_wc->status != IB_WC_SUCCESS ||
- mad_send_wr->status != IB_WC_SUCCESS)
+ mad_send_wr->state == IB_MAD_STATE_CANCELED)
return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
if (!mad_send_wr->timeout)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index a872643e8039..2220a2dfab24 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex);
bool rdma_nl_get_privileged_qkey(void)
{
- return privileged_qkey || capable(CAP_NET_RAW);
+ return privileged_qkey;
}
EXPORT_SYMBOL(rdma_nl_get_privileged_qkey);
@@ -1469,10 +1469,11 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
};
-static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- enum rdma_restrack_type res_type,
- res_fill_func_t fill_func)
+static noinline_for_stack int
+res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
{
const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
@@ -2263,10 +2264,10 @@ err:
return ret;
}
-static int stat_get_doit_default_counter(struct sk_buff *skb,
- struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[])
{
struct rdma_hw_stats *stats;
struct nlattr *table_attr;
@@ -2356,8 +2357,9 @@ err:
return ret;
}
-static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack, struct nlattr *tb[])
+static noinline_for_stack int
+stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, struct nlattr *tb[])
{
static enum rdma_nl_counter_mode mode;
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 90c177edf9b0..18918f463361 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj,
WARN_ON(true);
}
}
+
+/**
+ * rdma_uattrs_has_raw_cap() - Returns whether an rdma device linked to the
+ * uverbs attributes file has CAP_NET_RAW
+ * capability or not.
+ *
+ * @attrs: Pointer to uverbs attributes
+ *
+ * Returns true if an rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false.
+ */
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_ucontext *ucontext;
+ bool has_cap = false;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu);
+ ucontext = ib_uverbs_get_ucontext_file(ufile);
+ if (IS_ERR(ucontext))
+ goto out;
+ has_cap = rdma_dev_has_raw_cap(ucontext->device);
+
+out:
+ srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key);
+ return has_cap;
+}
+EXPORT_SYMBOL(rdma_uattrs_has_raw_cap);
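A minimal usage sketch, not part of this patch: a uverbs handler that previously open-coded capable(CAP_NET_RAW) can gate raw-packet resources on the device's owning user namespace via the new helper. The handler name below is hypothetical; only rdma_uattrs_has_raw_cap() comes from the patch.

static int example_create_raw_resource(struct uverbs_attr_bundle *attrs)
{
	/*
	 * Check CAP_NET_RAW in the user namespace that owns the ib_device,
	 * rather than the caller's capability in the init namespace.
	 */
	if (!rdma_uattrs_has_raw_cap(attrs))
		return -EPERM;

	/* ... create the raw-packet resource ... */
	return 0;
}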
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 33706dad6c0f..a59b087611cb 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
extern const struct uapi_definition uverbs_def_obj_cq[];
extern const struct uapi_definition uverbs_def_obj_device[];
extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_dmah[];
extern const struct uapi_definition uverbs_def_obj_flow_action[];
extern const struct uapi_definition uverbs_def_obj_intf[];
extern const struct uapi_definition uverbs_def_obj_mr[];
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3313410014cd..a7de6f403fca 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
return container_of(res, struct rdma_counter, res)->device;
case RDMA_RESTRACK_SRQ:
return container_of(res, struct ib_srq, res)->device;
+ case RDMA_RESTRACK_DMAH:
+ return container_of(res, struct ib_dmah, res)->device;
default:
WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
return NULL;
diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c
index 6853c6d078f9..de5cb8bf0a61 100644
--- a/drivers/infiniband/core/ucaps.c
+++ b/drivers/infiniband/core/ucaps.c
@@ -170,7 +170,7 @@ int ib_create_ucap(enum rdma_user_cap type)
ucap->dev.class = &ucaps_class;
ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type);
ucap->dev.release = ucap_dev_release;
- ret = dev_set_name(&ucap->dev, ucap_names[type]);
+ ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]);
if (ret)
goto err_device;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e9fa22d31c23..b1c44ec1a3f3 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,65 +41,83 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
- const struct mmu_interval_notifier_ops *ops)
+static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
- int ret;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+}
+
+static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ size_t page_size = 1UL << umem_odp->page_shift;
+ struct hmm_dma_map *map;
+ unsigned long start;
+ unsigned long end;
+ size_t nr_entries;
+ int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
- if (!umem_odp->is_implicit_odp) {
- size_t page_size = 1UL << umem_odp->page_shift;
- unsigned long start;
- unsigned long end;
- size_t ndmas, npfns;
-
- start = ALIGN_DOWN(umem_odp->umem.address, page_size);
- if (check_add_overflow(umem_odp->umem.address,
- (unsigned long)umem_odp->umem.length,
- &end))
- return -EOVERFLOW;
- end = ALIGN(end, page_size);
- if (unlikely(end < page_size))
- return -EOVERFLOW;
-
- ndmas = (end - start) >> umem_odp->page_shift;
- if (!ndmas)
- return -EINVAL;
-
- npfns = (end - start) >> PAGE_SHIFT;
- umem_odp->pfn_list = kvcalloc(
- npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
- if (!umem_odp->pfn_list)
- return -ENOMEM;
-
- umem_odp->dma_list = kvcalloc(
- ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
- if (!umem_odp->dma_list) {
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length, &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+ /*
+ * The mmu notifier can be called within reclaim contexts and takes the
+ * umem_mutex. This is rare to trigger in testing, so teach lockdep
+ * about it.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ mutex_lock(&umem_odp->umem_mutex);
+ mutex_unlock(&umem_odp->umem_mutex);
+ fs_reclaim_release(GFP_KERNEL);
+ }
+
+ nr_entries = (end - start) >> PAGE_SHIFT;
+ if (!(nr_entries * PAGE_SIZE / page_size))
+ return -EINVAL;
+
+ map = &umem_odp->map;
+ if (ib_uses_virt_dma(dev)) {
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
ret = -ENOMEM;
- goto out_pfn_list;
- }
+ } else
+ ret = hmm_dma_map_alloc(dev->dma_device, map,
+ (end - start) >> PAGE_SHIFT,
+ 1 << umem_odp->page_shift);
+ if (ret)
+ return ret;
- ret = mmu_interval_notifier_insert(&umem_odp->notifier,
- umem_odp->umem.owning_mm,
- start, end - start, ops);
- if (ret)
- goto out_dma_list;
- }
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm, start,
+ end - start, ops);
+ if (ret)
+ goto out_free_map;
return 0;
-out_dma_list:
- kvfree(umem_odp->dma_list);
-out_pfn_list:
- kvfree(umem_odp->pfn_list);
+out_free_map:
+ if (ib_uses_virt_dma(dev))
+ kfree(map->pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, map);
return ret;
}
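
The lockdep block in ib_init_umem_odp() above primes a lock dependency that real workloads rarely exercise. A standalone sketch of the same idiom, with a hypothetical helper name and assuming <linux/sched/mm.h> and <linux/mutex.h>:

static void prime_reclaim_dependency(struct mutex *lock)
{
	/*
	 * Pretend to enter reclaim and take the lock once, so lockdep
	 * records that this lock can be acquired from reclaim context and
	 * will later warn about allocations under it that could recurse
	 * into reclaim and deadlock.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);
		mutex_lock(lock);
		mutex_unlock(lock);
		fs_reclaim_release(GFP_KERNEL);
	}
}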
@@ -118,7 +136,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
- int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@@ -130,16 +147,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
- umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
- ret = ib_init_umem_odp(umem_odp, NULL);
- if (ret) {
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
- return ERR_PTR(ret);
- }
+ ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@@ -260,74 +271,41 @@ err_put_pid:
}
EXPORT_SYMBOL(ib_umem_odp_get);
-void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
- if (!umem_odp->is_implicit_odp) {
- mutex_lock(&umem_odp->umem_mutex);
- ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
- ib_umem_end(umem_odp));
- mutex_unlock(&umem_odp->umem_mutex);
- mmu_interval_notifier_remove(&umem_odp->notifier);
- kvfree(umem_odp->dma_list);
- kvfree(umem_odp->pfn_list);
- }
- put_pid(umem_odp->tgid);
- kfree(umem_odp);
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ if (ib_uses_virt_dma(dev))
+ kfree(umem_odp->map.pfn_list);
+ else
+ hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
-EXPORT_SYMBOL(ib_umem_odp_release);
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
- struct ib_umem_odp *umem_odp,
- unsigned int dma_index,
- struct page *page,
- u64 access_mask)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
- struct ib_device *dev = umem_odp->umem.ibdev;
- dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
- if (*dma_addr) {
- /*
- * If the page is already dma mapped it means it went through
- * a non-invalidating trasition, like read-only to writable.
- * Resync the flags.
- */
- *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
- return 0;
- }
+ if (!umem_odp->is_implicit_odp)
+ ib_umem_odp_free(umem_odp);
- *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
- DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(dev, *dma_addr)) {
- *dma_addr = 0;
- return -EFAULT;
- }
- umem_odp->npages++;
- *dma_addr |= access_mask;
- return 0;
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
}
+EXPORT_SYMBOL(ib_umem_odp_release);
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@@ -355,9 +333,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
- if (access_mask == 0)
- return -EINVAL;
-
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@@ -383,11 +358,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
- range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@@ -415,22 +390,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
- if (fault) {
- /*
- * Since we asked for hmm_range_fault() to populate
- * pages it shouldn't return an error entry on success.
- */
- WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
- WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
- } else {
- if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
- WARN_ON(umem_odp->dma_list[dma_index]);
- continue;
- }
- access_mask = ODP_READ_ALLOWED_BIT;
- if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
- access_mask |= ODP_WRITE_ALLOWED_BIT;
- }
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+ continue;
+
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+ continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -443,15 +413,6 @@ retry:
__func__, hmm_order, page_shift);
break;
}
-
- ret = ib_umem_odp_map_dma_single_page(
- umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
- access_mask);
- if (ret < 0) {
- ibdev_dbg(umem_odp->umem.ibdev,
- "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
- break;
- }
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@@ -471,45 +432,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
- dma_addr_t dma_addr;
- dma_addr_t dma;
- int idx;
- u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
+ u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
- idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
- dma = umem_odp->dma_list[idx];
-
- /* The access flags guaranteed a valid DMA address in case was NULL */
- if (dma) {
- unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
- struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
- dma_addr = dma & ODP_DMA_ADDR_MASK;
- ib_dma_unmap_page(dev, dma_addr,
- BIT(umem_odp->page_shift),
- DMA_BIDIRECTIONAL);
- if (dma & ODP_WRITE_ALLOWED_BIT) {
- struct page *head_page = compound_head(page);
- /*
- * set_page_dirty prefers being called with
- * the page lock. However, MMU notifiers are
- * called sometimes with and sometimes without
- * the lock. We rely on the umem_mutex instead
- * to prevent other mmu notifiers from
- * continuing and allowing the page mapping to
- * be removed.
- */
- set_page_dirty(head_page);
- }
- umem_odp->dma_list[idx] = 0;
- umem_odp->npages--;
+ u64 offset = addr - ib_umem_start(umem_odp);
+ size_t idx = offset >> umem_odp->page_shift;
+ unsigned long pfn = umem_odp->map.pfn_list[idx];
+
+ if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
+ goto clear;
+
+ if (pfn & HMM_PFN_WRITE) {
+ struct page *page = hmm_pfn_to_page(pfn);
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
}
+ umem_odp->npages--;
+clear:
+ umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3c3bb670c805..ce16404cdfb8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -193,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
- return (void *)uobj;
+ return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);
@@ -741,7 +741,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
}
mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags,
+ cmd.access_flags, NULL,
&attrs->driver_udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
@@ -1312,9 +1312,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
switch (cmd->qp_type) {
case IB_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
- break;
+ fallthrough;
case IB_QPT_RC:
case IB_QPT_UC:
case IB_QPT_UD:
@@ -1451,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
}
if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
- if (!capable(CAP_NET_RAW)) {
+ if (!rdma_uattrs_has_raw_cap(attrs)) {
ret = -EPERM;
goto err_put;
}
@@ -1877,7 +1877,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs,
attr->path_mig_state = cmd->base.path_mig_state;
if (cmd->base.attr_mask & IB_QP_QKEY) {
if (cmd->base.qkey & IB_QP_SET_QKEY &&
- !rdma_nl_get_privileged_qkey()) {
+ !(rdma_nl_get_privileged_qkey() ||
+ rdma_uattrs_has_raw_cap(attrs))) {
ret = -EPERM;
goto release_qp;
}
@@ -3225,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
if (cmd.comp_mask)
return -EINVAL;
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 432054f0a8a4..37cd37556510 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
struct ib_ucq_object *obj = container_of(
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
typeof(*obj), uevent.uobject);
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_device *ib_dev = attrs->context->device;
- int ret;
- u64 user_handle;
+ struct ib_umem_dmabuf *umem_dmabuf;
struct ib_cq_init_attr attr = {};
- struct ib_cq *cq;
- struct ib_uverbs_completion_event_file *ev_file = NULL;
struct ib_uobject *ev_file_uobj;
+ struct ib_umem *umem = NULL;
+ u64 buffer_length;
+ u64 buffer_offset;
+ struct ib_cq *cq;
+ u64 user_handle;
+ u64 buffer_va;
+ int buffer_fd;
+ int ret;
- if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
+ if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq)
return -EOPNOTSUPP;
ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->uevent.event_list);
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) {
+
+ ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem)) {
+ ret = PTR_ERR(umem);
+ goto err_event_file;
+ }
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) {
+
+ ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET);
+ if (ret)
+ goto err_event_file;
+
+ ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+ if (ret)
+ goto err_event_file;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) ||
+ !ib_dev->ops.create_cq_umem) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length,
+ buffer_fd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(umem_dmabuf)) {
+ ret = PTR_ERR(umem_dmabuf);
+ goto err_event_file;
+ }
+ umem = &umem_dmabuf->umem;
+ } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+ uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) ||
+ !ib_dev->ops.create_cq) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
if (!cq) {
ret = -ENOMEM;
+ ib_umem_release(umem);
goto err_event_file;
}
@@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
rdma_restrack_set_name(&cq->res, NULL);
- ret = ib_dev->ops.create_cq(cq, &attr, attrs);
+ ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) :
+ ib_dev->ops.create_cq(cq, &attr, attrs);
if (ret)
goto err_free;
@@ -180,6 +244,17 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_OBJECT_ASYNC_EVENT,
UVERBS_ACCESS_READ,
UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
UVERBS_ATTR_UHW());
static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
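
For drivers, the new CQ path hands over a pre-pinned buffer instead of letting the driver allocate one. A hedged driver-side sketch follows; the create_cq_umem signature is inferred from the call site above and the function name is hypothetical.

static int example_create_cq_umem(struct ib_cq *cq,
				  const struct ib_cq_init_attr *attr,
				  struct ib_umem *umem,
				  struct uverbs_attr_bundle *attrs)
{
	/*
	 * The core already pinned the buffer (ib_umem_get() or a pinned
	 * dma-buf umem); the driver would point its HW CQ context at the
	 * umem's SG list here instead of allocating its own queue memory.
	 */
	cq->cqe = attr->cqe;
	return 0;
}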
diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c
new file mode 100644
index 000000000000..453ce656c6f2
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dmah.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+#include "restrack.h"
+
+static int uverbs_free_dmah(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dmah *dmah = uobject->object;
+ int ret;
+
+ if (atomic_read(&dmah->usecnt))
+ return -EBUSY;
+
+ ret = dmah->device->ops.dealloc_dmah(dmah, attrs);
+ if (ret)
+ return ret;
+
+ rdma_restrack_del(&dmah->res);
+ kfree(dmah);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE)
+ ->obj_attr.uobject;
+ struct ib_device *ib_dev = attrs->context->device;
+ struct ib_dmah *dmah;
+ int ret;
+
+ dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah);
+ if (!dmah)
+ return -ENOMEM;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) {
+ ret = uverbs_copy_from(&dmah->cpu_id, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_CPU_ID);
+ if (ret)
+ goto err;
+
+ if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) {
+ dmah->mem_type = uverbs_attr_get_enum_id(attrs,
+ UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE);
+ dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS);
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) {
+ ret = uverbs_copy_from(&dmah->ph, attrs,
+ UVERBS_ATTR_ALLOC_DMAH_PH);
+ if (ret)
+ goto err;
+
+ /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */
+ if (dmah->ph & 0xFC) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS);
+ }
+
+ dmah->device = ib_dev;
+ dmah->uobject = uobj;
+ atomic_set(&dmah->usecnt, 0);
+
+ rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH);
+ rdma_restrack_set_name(&dmah->res, NULL);
+
+ ret = ib_dev->ops.alloc_dmah(dmah, attrs);
+ if (ret) {
+ rdma_restrack_put(&dmah->res);
+ goto err;
+ }
+
+ uobj->object = dmah;
+ rdma_restrack_add(&dmah->res);
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE);
+ return 0;
+err:
+ kfree(dmah);
+ return ret;
+}
+
+static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = {
+ [TPH_MEM_TYPE_VM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+ [TPH_MEM_TYPE_PM] = {
+ .type = UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_NO_DATA(),
+ },
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DMAH_ALLOC,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE,
+ uverbs_dmah_mem_type,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH,
+ UVERBS_ATTR_TYPE(u8),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_DMAH_FREE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC),
+ &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE));
+
+const struct uapi_definition uverbs_def_obj_dmah[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)),
+ {}
+};
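
A hedged sketch of the driver side, assuming the alloc_dmah signature used by the handler above; the function name and the comments on what a device would do with each hint are illustrative only.

static int example_alloc_dmah(struct ib_dmah *dmah,
			      struct uverbs_attr_bundle *attrs)
{
	/* Only fields flagged in valid_fields were supplied by user space. */
	if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) {
		/* Derive a steering tag targeting dmah->cpu_id here. */
	}

	if (dmah->valid_fields & BIT(IB_DMAH_MEM_TYPE_EXISTS)) {
		/* dmah->mem_type selects volatile vs. persistent memory. */
	}

	if (dmah->valid_fields & BIT(IB_DMAH_PH_EXISTS)) {
		/* dmah->ph carries the 2-bit TLP processing hint. */
	}

	return 0;
}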
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 7ebc7bd3caae..570b9656801d 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
- access_flags,
+ access_flags, NULL,
attrs);
if (IS_ERR(mr))
return PTR_ERR(mr);
@@ -266,6 +266,135 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
return ret;
}
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE);
+ u32 valid_access_flags = IB_ACCESS_SUPPORTED;
+ u64 length, iova, fd_offset = 0, addr = 0;
+ struct ib_device *ib_dev = pd->device;
+ struct ib_dmah *dmah = NULL;
+ bool has_fd_offset = false;
+ bool has_addr = false;
+ bool has_fd = false;
+ u32 access_flags;
+ struct ib_mr *mr;
+ int fd;
+ int ret;
+
+ ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) {
+ ret = uverbs_copy_from(&addr, attrs,
+ UVERBS_ATTR_REG_MR_ADDR);
+ if (ret)
+ return ret;
+ has_addr = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) {
+ ret = uverbs_copy_from(&fd_offset, attrs,
+ UVERBS_ATTR_REG_MR_FD_OFFSET);
+ if (ret)
+ return ret;
+ has_fd_offset = true;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) {
+ ret = uverbs_get_raw_fd(&fd, attrs,
+ UVERBS_ATTR_REG_MR_FD);
+ if (ret)
+ return ret;
+ has_fd = true;
+ }
+
+ if (has_fd) {
+ if (!ib_dev->ops.reg_user_mr_dmabuf)
+ return -EOPNOTSUPP;
+
+ /* FD requires offset and can't come with addr */
+ if (!has_fd_offset || has_addr)
+ return -EINVAL;
+
+ if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+
+ valid_access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_RELAXED_ORDERING;
+ } else {
+ if (!has_addr || has_fd_offset)
+ return -EINVAL;
+
+ if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+ }
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) {
+ dmah = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_REG_MR_DMA_HANDLE);
+ if (IS_ERR(dmah))
+ return PTR_ERR(dmah);
+ }
+
+ ret = uverbs_get_flags32(&access_flags, attrs,
+ UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ valid_access_flags);
+ if (ret)
+ return ret;
+
+ ret = ib_check_mr_access(ib_dev, access_flags);
+ if (ret)
+ return ret;
+
+ if (has_fd)
+ mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length,
+ iova, fd, access_flags,
+ dmah, attrs);
+ else
+ mr = pd->device->ops.reg_user_mr(pd, addr, length, iova,
+ access_flags, dmah, NULL);
+
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ if (dmah) {
+ mr->dmah = dmah;
+ atomic_inc(&dmah->usecnt);
+ }
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY,
+ &mr->lkey, sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_ADVISE_MR,
UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -362,6 +491,44 @@ DECLARE_UVERBS_NAMED_METHOD(
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY));
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_REG_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE,
+ UVERBS_OBJECT_DMAH,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+ enum ib_access_flags,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL),
+ UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
DECLARE_UVERBS_NAMED_METHOD_DESTROY(
UVERBS_METHOD_MR_DESTROY,
UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
@@ -376,7 +543,8 @@ DECLARE_UVERBS_NAMED_OBJECT(
&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
&UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
- &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
+ &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_REG_MR));
const struct uapi_definition uverbs_def_obj_mr[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
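
Existing reg_user_mr implementations gain an optional DMA-handle parameter as part of this series. A hedged driver-side sketch; the signature is inferred from the call sites in this diff and the function name is hypothetical.

static struct ib_mr *example_reg_user_mr(struct ib_pd *pd, u64 start,
					 u64 length, u64 virt_addr,
					 int access_flags,
					 struct ib_dmah *dmah,
					 struct ib_udata *udata)
{
	/*
	 * dmah is NULL on the classic registration paths; when set, the
	 * driver would apply the handle's TPH hints to the new MR's DMA
	 * accesses before returning a real ib_mr.
	 */
	return ERR_PTR(-EOPNOTSUPP);
}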
diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c
index 7b4773fa4bc0..be0730e8509e 100644
--- a/drivers/infiniband/core/uverbs_std_types_qp.c
+++ b/drivers/infiniband/core/uverbs_std_types_qp.c
@@ -133,7 +133,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
device = xrcd->device;
break;
case IB_UVERBS_QPT_RAW_PACKET:
- if (!capable(CAP_NET_RAW))
+ if (!rdma_uattrs_has_raw_cap(attrs))
return -EPERM;
fallthrough;
case IB_UVERBS_QPT_RC:
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index a02916a3a79c..e00ea63175bd 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+ UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),
UAPI_DEF_CHAIN(uverbs_def_obj_mr),
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c5e78bbefbd0..3a5f81402d2f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
- return (void *)slave;
+ return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);
@@ -2223,7 +2223,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
}
mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
- access_flags, NULL);
+ access_flags, NULL, NULL);
if (IS_ERR(mr))
return mr;
@@ -2262,6 +2262,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;
struct ib_dm *dm = mr->dm;
+ struct ib_dmah *dmah = mr->dmah;
struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
int ret;
@@ -2272,6 +2273,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
atomic_dec(&pd->usecnt);
if (dm)
atomic_dec(&dm->usecnt);
+ if (dmah)
+ atomic_dec(&dmah->usecnt);
kfree(sig_attrs);
}