diff options
Diffstat (limited to 'drivers/infiniband')
47 files changed, 1158 insertions, 648 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index b84791f03a27..5ceda710f516 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -31,17 +31,6 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from <http://www.openfabrics.org/git/>. -config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - bool "Experimental and unstable ABI for userspace access to flow steering verbs" - depends on INFINIBAND_USER_ACCESS - depends on STAGING - ---help--- - The final ABI for userspace access to flow steering verbs - has not been defined. To use the current ABI, *WHICH WILL - CHANGE IN THE FUTURE*, say Y here. - - If unsure, say N. - config INFINIBAND_USER_MEM bool depends on INFINIBAND_USER_ACCESS != n diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 784b97cb05b0..f2ef7ef0f36f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -383,14 +383,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; - static int next_id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); - id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT); - if (id >= 0) - next_id = max(id + 1, 0); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index dab4b41f1715..8e49db690f33 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -328,28 +328,6 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) -{ - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; - - err = ib_query_port(device, port_num, &props); - if (err) - return err; - - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return err; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; - } - - return -EADDRNOTAVAIL; -} - static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -371,13 +349,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a return ret; } -static int cma_acquire_dev(struct rdma_id_private *id_priv) +static int cma_acquire_dev(struct rdma_id_private *id_priv, + struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; - u8 port; + u8 port, found_port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; @@ -389,17 +368,39 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv) iboe_addr_get_sgid(dev_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); + if (listen_id_priv && + rdma_port_get_link_layer(listen_id_priv->id.device, + listen_id_priv->id.port_num) == dev_ll) { + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, + &found_port, NULL); + else + ret = ib_find_cached_gid(cma_dev->device, &gid, + &found_port, NULL); + + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; + goto out; + } + } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (listen_id_priv && + listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = find_gid_port(cma_dev->device, &iboe_gid, port); + ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); else - ret = find_gid_port(cma_dev->device, &gid, port); + ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - if (!ret) { - id_priv->id.port_num = port; + if (!ret && (port == found_port)) { + id_priv->id.port_num = found_port; goto out; } } @@ -1292,7 +1293,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) goto err2; @@ -1451,7 +1452,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; @@ -1481,7 +1481,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - ret = cma_acquire_dev(conn_id); + ret = cma_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1529,8 +1529,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1848,6 +1846,26 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = ndev->priv_flags & IFF_802_1Q_VLAN ? + vlan_dev_real_dev(ndev) : ndev; + + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (ndev->priv_flags & IFF_802_1Q_VLAN) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -1888,11 +1906,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = netdev_get_prio_tc_map( - ndev->priv_flags & IFF_802_1Q_VLAN ? - vlan_dev_real_dev(ndev) : ndev, - rt_tos2priority(id_priv->tos)); - + route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); @@ -2050,7 +2064,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, goto out; if (!status && !id_priv->cma_dev) - status = cma_acquire_dev(id_priv); + status = cma_acquire_dev(id_priv, NULL); if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, @@ -2294,7 +2308,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) int low, high, remaining; unsigned int rover; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(&init_net, &low, &high); remaining = (high - low) + 1; rover = net_random() % remaining + low; retry: @@ -2547,7 +2561,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - ret = cma_acquire_dev(id_priv); + ret = cma_acquire_dev(id_priv, NULL); if (ret) goto err1; } diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index c47c2034ca71..0717940ec3b5 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -181,9 +181,16 @@ static void add_ref(struct iw_cm_id *cm_id) static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; + int cb_destroy; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - if (iwcm_deref_id(cm_id_priv) && - test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) { + + /* + * Test bit before deref in case the cm_id gets freed on another + * thread. + */ + cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags); + if (iwcm_deref_id(cm_id_priv) && cb_destroy) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); } diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index da06abde9e0d..a1e9cba84944 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) list_for_each_entry(client, &client_list, list) { if (client->index == index) { if (op < 0 || op >= client->nops || - !client->cb_table[RDMA_NL_GET_OP(op)].dump) + !client->cb_table[op].dump) return -EINVAL; { diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index cde1e7b5b85d..faad2caf22b1 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -612,6 +612,7 @@ static ssize_t show_node_type(struct device *device, switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index b0f189be543b..ab8b1c30b36b 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -57,7 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; -static ctl_table ucma_ctl_table[] = { +static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, @@ -271,7 +271,7 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d8f9c6c272d7..a283274a5a09 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,22 @@ #include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * @@ -178,6 +194,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ @@ -217,9 +249,13 @@ IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -IB_UVERBS_DECLARE_CMD(create_flow); -IB_UVERBS_DECLARE_CMD(destroy_flow); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2f0f01b70e3b..f1cc83855af6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -54,17 +54,7 @@ static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ - -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) /* * The ib_uobject locking scheme is as follows: @@ -939,13 +929,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) @@ -2128,6 +2114,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: @@ -2601,10 +2590,12 @@ out_put: return ret ? ret : in_len; } -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { + if (kern_spec->reserved) + return -EINVAL; + ib_spec->type = kern_spec->type; switch (ib_spec->type) { @@ -2642,28 +2633,34 @@ static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, return 0; } -ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; - struct ib_kern_flow_attr *kern_flow_attr; + struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; void *kern_spec; void *ib_spec; int i; - int kern_attr_size; - if (out_len < sizeof(resp)) + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; @@ -2672,32 +2669,31 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; - if (cmd.flow_attr.num_of_specs < 0 || - cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - kern_attr_size = cmd.flow_attr.size - sizeof(cmd) - - sizeof(struct ib_uverbs_cmd_hdr_ex); + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; - if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len || - kern_attr_size < 0 || kern_attr_size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) return -EINVAL; if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); - if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - kern_attr_size)) { - err = -EFAULT; + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) goto err_free_attr; - } } else { kern_flow_attr = &cmd.flow_attr; - kern_attr_size = sizeof(cmd.flow_attr); } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); @@ -2714,7 +2710,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, goto err_uobj; } - flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ -2729,19 +2725,23 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; - for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) { + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size; - kern_spec += ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } - if (kern_attr_size) { - pr_warn("create flow failed, %d bytes left from uverb cmd\n", - kern_attr_size); + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; goto err_free; } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); @@ -2760,11 +2760,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) { - err = -EFAULT; + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) goto err_copy; - } put_qp_read(qp); mutex_lock(&file->mutex); @@ -2777,7 +2776,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return in_len; + return 0; err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: @@ -2794,16 +2793,24 @@ err_free_attr: return err; } -ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) { +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); @@ -2825,9 +2832,8 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, put_uobj(uobj); - return ret ? ret : in_len; + return ret; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2df31f68ea09..08219fb3338b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -115,10 +115,13 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - [IB_USER_VERBS_CMD_CREATE_FLOW] = ib_uverbs_create_flow, - [IB_USER_VERBS_CMD_DESTROY_FLOW] = ib_uverbs_destroy_flow -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ +}; + +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; static void ib_uverbs_add_one(struct ib_device *device); @@ -589,6 +592,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_cmd_hdr hdr; + __u32 flags; if (count < sizeof hdr) return -EINVAL; @@ -596,45 +600,110 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) - return -EINVAL; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (!flags) { + __u32 command; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) { - struct ib_uverbs_cmd_hdr_ex hdr_ex; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex))) - return -EFAULT; + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; - if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count) + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr_ex), - (hdr_ex.in_words + - hdr_ex.provider_in_words) * 4, - (hdr_ex.out_words + - hdr_ex.provider_out_words) * 4); - } else { -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + if (hdr.in_words * 4 != count) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; + + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, + hdr.in_words * 8, hdr.out_words * 8); + + INIT_UDATA_BUF_OR_NULL(&uhw, + buf + ucore.inlen, + (unsigned long) ex_hdr.response + ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + + return -ENOSYS; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a321df28bab2..d4f6ddf72ffa 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -114,6 +114,8 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; default: BUG(); return 0; @@ -130,6 +132,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; @@ -958,6 +961,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -980,6 +988,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -1010,6 +1023,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 12fef76c791c..45126879ad28 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -524,50 +524,6 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -#define VLAN_NONE 0xfff -#define FILTER_SEL_VLAN_NONE 0xffff -#define FILTER_SEL_WIDTH_P_FC (3+1) /* port uses 3 bits, FCoE one bit */ -#define FILTER_SEL_WIDTH_VIN_P_FC \ - (6 + 7 + FILTER_SEL_WIDTH_P_FC) /* 6 bits are unused, VF uses 7 bits*/ -#define FILTER_SEL_WIDTH_TAG_P_FC \ - (3 + FILTER_SEL_WIDTH_VIN_P_FC) /* PF uses 3 bits */ -#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC) - -static unsigned int select_ntuple(struct c4iw_dev *dev, struct dst_entry *dst, - struct l2t_entry *l2t) -{ - unsigned int ntuple = 0; - u32 viid; - - switch (dev->rdev.lldi.filt_mode) { - - /* default filter mode */ - case HW_TPL_FR_MT_PR_IV_P_FC: - if (l2t->vlan == VLAN_NONE) - ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC; - else { - ntuple |= l2t->vlan << FILTER_SEL_WIDTH_P_FC; - ntuple |= 1 << FILTER_SEL_WIDTH_TAG_P_FC; - } - ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << - FILTER_SEL_WIDTH_VLD_TAG_P_FC; - break; - case HW_TPL_FR_MT_PR_OV_P_FC: { - viid = cxgb4_port_viid(l2t->neigh->dev); - - ntuple |= FW_VIID_VIN_GET(viid) << FILTER_SEL_WIDTH_P_FC; - ntuple |= FW_VIID_PFN_GET(viid) << FILTER_SEL_WIDTH_VIN_P_FC; - ntuple |= FW_VIID_VIVLD_GET(viid) << FILTER_SEL_WIDTH_TAG_P_FC; - ntuple |= l2t->lport << S_PORT | IPPROTO_TCP << - FILTER_SEL_WIDTH_VLD_TAG_P_FC; - break; - } - default: - break; - } - return ntuple; -} - static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req; @@ -641,8 +597,9 @@ static int send_connect(struct c4iw_ep *ep) req->local_ip = la->sin_addr.s_addr; req->peer_ip = ra->sin_addr.s_addr; req->opt0 = cpu_to_be64(opt0); - req->params = cpu_to_be32(select_ntuple(ep->com.dev, - ep->dst, ep->l2t)); + req->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); req->opt2 = cpu_to_be32(opt2); } else { req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen); @@ -662,9 +619,9 @@ static int send_connect(struct c4iw_ep *ep) req6->peer_ip_lo = *((__be64 *) (ra6->sin6_addr.s6_addr + 8)); req6->opt0 = cpu_to_be64(opt0); - req6->params = cpu_to_be32( - select_ntuple(ep->com.dev, ep->dst, - ep->l2t)); + req6->params = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); req6->opt2 = cpu_to_be32(opt2); } } else { @@ -681,8 +638,9 @@ static int send_connect(struct c4iw_ep *ep) t5_req->peer_ip = ra->sin_addr.s_addr; t5_req->opt0 = cpu_to_be64(opt0); t5_req->params = cpu_to_be64(V_FILTER_TUPLE( - select_ntuple(ep->com.dev, - ep->dst, ep->l2t))); + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t))); t5_req->opt2 = cpu_to_be32(opt2); } else { t5_req6 = (struct cpl_t5_act_open_req6 *) @@ -703,7 +661,9 @@ static int send_connect(struct c4iw_ep *ep) (ra6->sin6_addr.s6_addr + 8)); t5_req6->opt0 = cpu_to_be64(opt0); t5_req6->params = (__force __be64)cpu_to_be32( - select_ntuple(ep->com.dev, ep->dst, ep->l2t)); + cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], + ep->l2t)); t5_req6->opt2 = cpu_to_be32(opt2); } } @@ -1630,7 +1590,8 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) memset(req, 0, sizeof(*req)); req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR)); req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16))); - req->le.filter = cpu_to_be32(select_ntuple(ep->com.dev, ep->dst, + req->le.filter = cpu_to_be32(cxgb4_select_ntuple( + ep->com.dev->rdev.lldi.ports[0], ep->l2t)); sin = (struct sockaddr_in *)&ep->com.local_addr; req->le.lport = sin->sin_port; @@ -2938,7 +2899,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) /* * Allocate a server TID. */ - if (dev->rdev.lldi.enable_fw_ofld_conn) + if (dev->rdev.lldi.enable_fw_ofld_conn && + ep->com.local_addr.ss_family == AF_INET) ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, cm_id->local_addr.ss_family, ep); else @@ -3323,9 +3285,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) /* * Calculate the server tid from filter hit index from cpl_rx_pkt. */ - stid = (__force int) cpu_to_be32((__force u32) rss->hash_val) - - dev->rdev.lldi.tids->sftid_base - + dev->rdev.lldi.tids->nstids; + stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); if (!lep) { @@ -3397,7 +3357,9 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) window = (__force u16) htons((__force u16)tcph->window); /* Calcuate filter portion for LE region. */ - filter = (__force unsigned int) cpu_to_be32(select_ntuple(dev, dst, e)); + filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple( + dev->rdev.lldi.ports[0], + e)); /* * Synthesize the cpl_pass_accept_req. We have everything except the diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 33d2cc6ab562..4a033853312e 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -602,10 +602,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, rdev->lldi.vr->cq.size); - PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " + PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu " "qpmask 0x%x cqshift %lu cqmask 0x%x\n", (unsigned)pci_resource_len(rdev->lldi.pdev, 2), - (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2), + (u64)pci_resource_start(rdev->lldi.pdev, 2), rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpshift, rdev->qpmask, diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 4cb8eb24497c..84e45006451c 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -173,7 +173,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len, return ret; } -int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) +static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data) { u32 remain = len; u32 dmalen; diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index f5cb13b21445..cc04b7ba3488 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, int j; int ret; - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); - + ret = get_user_pages_fast(addr, npages, 0, pages); if (ret != npages) { int i; @@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd, while (dim) { const int mxp = 8; - down_write(¤t->mm->mmap_sem); ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - if (ret <= 0) goto done_unlock; else { diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index d5e60f44ba5a..66dbf8062374 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -324,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; @@ -365,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { + if (entries < 1) { err = -EINVAL; goto out; } @@ -376,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f0612645de99..1958c5ca792a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -177,18 +177,18 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; - props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; + props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; + props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; - props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws; + props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; - props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; + props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; @@ -526,7 +526,6 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, if (IS_ERR(mailbox)) return 0; - memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); @@ -547,8 +546,6 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, 256); - if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); @@ -879,8 +876,6 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; - size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + - (sizeof(struct _rule_hw) * flow_attr->num_of_specs); static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, @@ -905,7 +900,6 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | @@ -1691,11 +1685,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - ibdev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); } mlx4_ib_alloc_eqs(dev, ibdev); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 344ab03948a3..b72627429745 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -556,7 +556,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, goto err_db; } mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); - (*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = to_mucontext(context)->uuari.uars[0].index; @@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = dev->mdev.priv.uuari.uars[0].index; return 0; @@ -653,8 +653,11 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, int eqn; int err; + if (entries < 0) + return ERR_PTR(-EINVAL); + entries = roundup_pow_of_two(entries + 1); - if (entries < 1 || entries > dev->mdev.caps.max_cqes) + if (entries > dev->mdev.caps.max_cqes) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); @@ -747,17 +750,9 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq) return 0; } -static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq, - u32 rsn) +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) { - u32 lrsn; - - if (srq) - lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff; - else - lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff; - - return rsn == lrsn; + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); } void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) @@ -787,8 +782,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; - if (is_equal_rsn(cqe64, srq, rsn)) { - if (srq) + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); ++nfreed; } else if (nfreed) { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b1a6cb3a2809..306534109627 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -745,7 +745,8 @@ static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); goto err_in; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 836be9157242..4c134d93d4fc 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -262,6 +262,9 @@ struct mlx5_ib_mr { int npages; struct completion done; enum ib_wc_status status; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + unsigned long start; }; struct mlx5_ib_fast_reg_page_list { @@ -323,6 +326,7 @@ struct mlx5_cache_ent { struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; + int pending; }; struct mlx5_mr_cache { @@ -358,6 +362,8 @@ struct mlx5_ib_dev { spinlock_t mr_lock; struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3453580b1eb2..039c3e40fcb4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -35,11 +35,12 @@ #include <linux/random.h> #include <linux/debugfs.h> #include <linux/export.h> +#include <linux/delay.h> #include <rdma/ib_umem.h> #include "mlx5_ib.h" enum { - DEF_CACHE_SIZE = 10, + MAX_PENDING_REG_MR = 8, }; enum { @@ -63,6 +64,51 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long flags; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags); + key = dev->mdev.priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); +} + static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; @@ -78,36 +124,39 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) return -ENOMEM; for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; - goto out; + break; } mr->order = ent->order; mr->umred = 1; + mr->dev = dev; in->seg.status = 1 << 6; in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; in->seg.log2_page_size = 12; + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + mr->start = jiffies; err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, - sizeof(*in)); + sizeof(*in), reg_mr_callback, + mr, &mr->out); if (err) { mlx5_ib_warn(dev, "create mkey failed %d\n", err); kfree(mr); - goto out; + break; } - cache->last_add = jiffies; - - spin_lock(&ent->lock); - list_add_tail(&mr->list, &ent->head); - ent->cur++; - ent->size++; - spin_unlock(&ent->lock); } -out: kfree(in); return err; } @@ -121,16 +170,16 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) int i; for (i = 0; i < num; i++) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); @@ -162,9 +211,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf, return -EINVAL; if (var > ent->size) { - err = add_keys(dev, c, var - ent->size); - if (err) - return err; + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); } else if (var < ent->size) { remove_keys(dev, c, ent->size - var); } @@ -280,23 +333,37 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; int i = order2idx(dev, ent->order); + int err; if (cache->stopped) return; ent = &dev->cache.ent[i]; - if (ent->cur < 2 * ent->limit) { - add_keys(dev, i, 1); - if (ent->cur < 2 * ent->limit) - queue_work(cache->wq, &ent->work); + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } } else if (ent->cur > 2 * ent->limit) { if (!someone_adding(cache) && - time_after(jiffies, cache->last_add + 60 * HZ)) { + time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) queue_work(cache->wq, &ent->work); } else { - queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ); + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); } } } @@ -336,18 +403,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (ent->cur < ent->limit) queue_work(cache->wq, &ent->work); break; } - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); queue_work(cache->wq, &ent->work); @@ -374,12 +441,12 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return; } ent = &cache->ent[c]; - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); ent->cur++; if (ent->cur > 2 * ent->limit) shrink = 1; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (shrink) queue_work(cache->wq, &ent->work); @@ -394,16 +461,16 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) cancel_delayed_work(&ent->dwork); while (1) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); @@ -464,12 +531,18 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) debugfs_remove_recursive(dev->cache.root); } +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int limit; - int size; int err; int i; @@ -479,6 +552,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) return -ENOMEM; } + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { INIT_LIST_HEAD(&cache->ent[i].head); spin_lock_init(&cache->ent[i].lock); @@ -489,13 +563,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->order = i + 2; ent->dev = dev; - if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) { - size = dev->mdev.profile->mr_cache[i].size; + if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) limit = dev->mdev.profile->mr_cache[i].limit; - } else { - size = DEF_CACHE_SIZE; + else limit = 0; - } + INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ent->limit = limit; @@ -522,6 +594,7 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); return 0; } @@ -551,7 +624,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); if (err) goto err_in; @@ -660,14 +734,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, int err; int i; - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { mr = alloc_cached_mr(dev, order); if (mr) break; err = add_keys(dev, order2idx(dev, order), 1); - if (err) { - mlx5_ib_warn(dev, "add_keys failed\n"); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); break; } } @@ -759,8 +833,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); in->seg.log2_page_size = page_shift; in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; @@ -944,7 +1020,8 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, * TBD not needed - issue 197292 */ in->seg.log2_page_size = PAGE_SHIFT; - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); kfree(in); if (err) goto err_free; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5659ea880741..7c6b4ba49bec 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - PAGE_SHIFT) << 24); + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -648,7 +648,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); @@ -1317,9 +1318,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RNR_TIMEOUT | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_SRQN | MLX5_QP_OPTPAR_CQN_RCV, @@ -1550,7 +1553,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); - if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0) + if (mlx5_st < 0) goto out; optpar = ib_mask_to_mlx5_opt(attr_mask); @@ -1744,6 +1747,7 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, MLX5_MKEY_MASK_PD | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | @@ -1800,7 +1804,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); seg->len = cpu_to_be64(wr->wr.fast_reg.length); seg->log2_page_size = wr->wr.fast_reg.page_shift; - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.fast_reg.rkey)); } static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, @@ -1913,6 +1918,10 @@ static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + set_frwr_pages(*seg, wr, mdev, pd, writ); *seg += sizeof(struct mlx5_wqe_data_seg); *size += (sizeof(struct mlx5_wqe_data_seg) / 16); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 0aa478bc291a..210b3eaf188a 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; return 0; @@ -390,9 +390,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq) mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); ib_umem_release(msrq->umem); } else { - kfree(msrq->wrid); - mlx5_buf_free(&dev->mdev, &msrq->buf); - mlx5_db_free(&dev->mdev, &msrq->db); + destroy_srq_kernel(dev, msrq); } kfree(srq); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 5b53ca5a2284..8308e3634767 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2834,7 +2834,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = nesqp->ibqp.qp_context; init_attr->send_cq = nesqp->ibqp.send_cq; init_attr->recv_cq = nesqp->ibqp.recv_cq; - init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; + init_attr->srq = nesqp->ibqp.srq; init_attr->cap = attr->cap; return 0; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index adc11d14f878..294dd27b601e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -122,6 +122,32 @@ struct mqe_ctx { bool cmd_done; }; +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + struct ocrdma_dev { struct ib_device ibdev; struct ocrdma_dev_attr attr; @@ -169,7 +195,7 @@ struct ocrdma_dev { struct list_head entry; struct rcu_head rcu; int id; - u64 stag_arr[OCRDMA_MAX_STAG]; + struct ocrdma_mr *stag_arr[OCRDMA_MAX_STAG]; u16 pvid; }; @@ -294,31 +320,6 @@ struct ocrdma_qp { u16 db_cache; }; -struct ocrdma_hw_mr { - u32 lkey; - u8 fr_mr; - u8 remote_atomic; - u8 remote_rd; - u8 remote_wr; - u8 local_rd; - u8 local_wr; - u8 mw_bind; - u8 rsvd; - u64 len; - struct ocrdma_pbl *pbl_table; - u32 num_pbls; - u32 num_pbes; - u32 pbl_size; - u32 pbe_size; - u64 fbo; - u64 va; -}; - -struct ocrdma_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - struct ocrdma_hw_mr hwmr; -}; struct ocrdma_ucontext { struct ib_ucontext ibucontext; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 50219ab2279d..56bf32fcb62c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1783,7 +1783,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, u32 max_sges = attrs->cap.max_send_sge; /* QP1 may exceed 127 */ - max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1, + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, dev->attr.max_wqe); status = ocrdma_build_q_conf(&max_wqe_allocated, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 0ce7674621ea..91443bcb9e0e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -452,9 +452,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu) { struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); - ocrdma_free_resources(dev); - ocrdma_cleanup_hw(dev); - idr_remove(&ocrdma_dev_id, dev->id); kfree(dev->mbx_cmd); ib_dealloc_device(&dev->ibdev); @@ -470,6 +467,10 @@ static void ocrdma_remove(struct ocrdma_dev *dev) spin_lock(&ocrdma_devlist_lock); list_del_rcu(&dev->entry); spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + call_rcu(&dev->rcu, ocrdma_remove_free); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 69f1d1221a6b..7686dceadd29 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1981,9 +1981,7 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); - if ((wr->wr.fast_reg.page_list_len > - qp->dev->attr.max_pages_per_frmr) || - (wr->wr.fast_reg.length > 0xffffffffULL)) + if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr) return -EINVAL; hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); @@ -2839,7 +2837,7 @@ struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) goto mbx_err; mr->ibmr.rkey = mr->hwmr.lkey; mr->ibmr.lkey = mr->hwmr.lkey; - dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = (unsigned long) mr; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = mr; return &mr->ibmr; mbx_err: ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index f247fc6e6182..c61e2a92b3c1 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -456,13 +456,13 @@ static int remove_file(struct dentry *parent, char *name) spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { - dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); simple_unlink(parent->d_inode, tmp); } else { spin_unlock(&tmp->d_lock); } + dput(tmp); ret = 0; bail: @@ -491,6 +491,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } + mutex_lock(&dir->d_inode->i_mutex); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -505,8 +506,10 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - d_delete(dir); + mutex_unlock(&dir->d_inode->i_mutex); ret = simple_rmdir(root->d_inode, dir); + d_delete(dir); + dput(dir); bail: mutex_unlock(&root->d_inode->i_mutex); diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 016e7429adf6..5bfc02f450e6 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -6190,21 +6190,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp) { struct qib_devdata *dd; unsigned long val; - int ret; - + char *n; if (strlen(str) >= MAX_ATTEN_LEN) { pr_info("txselect_values string too long\n"); return -ENOSPC; } - ret = kstrtoul(str, 0, &val); - if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ)) { pr_info("txselect_values must start with a number < %d\n", TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); - return ret ? ret : -EINVAL; + return -EINVAL; } - strcpy(txselect_list, str); + list_for_each_entry(dd, &qib_dev_list, list) if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) set_no_qsfp_atten(dd, 1); diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h index 28874f8606f8..941d4d50d8e7 100644 --- a/drivers/infiniband/hw/qib/qib_mad.h +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -54,7 +54,7 @@ struct ib_node_info { __be32 revision; u8 local_port_num; u8 vendor_id[3]; -} __attribute__ ((packed)); +} __packed; struct ib_mad_notice_attr { u8 generic_type; @@ -73,7 +73,7 @@ struct ib_mad_notice_attr { __be16 reserved; __be16 lid; /* where violation happened */ u8 port_num; /* where violation happened */ - } __attribute__ ((packed)) ntc_129_131; + } __packed ntc_129_131; struct { __be16 reserved; @@ -83,14 +83,14 @@ struct ib_mad_notice_attr { __be32 new_cap_mask; /* new capability mask */ u8 reserved3; u8 change_flags; /* low 3 bits only */ - } __attribute__ ((packed)) ntc_144; + } __packed ntc_144; struct { __be16 reserved; __be16 lid; /* lid where sys guid changed */ __be16 reserved2; __be64 new_sys_guid; - } __attribute__ ((packed)) ntc_145; + } __packed ntc_145; struct { __be16 reserved; @@ -104,7 +104,7 @@ struct ib_mad_notice_attr { u8 reserved3; u8 dr_trunc_hop; u8 dr_rtn_path[30]; - } __attribute__ ((packed)) ntc_256; + } __packed ntc_256; struct { __be16 reserved; @@ -115,7 +115,7 @@ struct ib_mad_notice_attr { __be32 qp2; /* high 8 bits reserved */ union ib_gid gid1; union ib_gid gid2; - } __attribute__ ((packed)) ntc_257_258; + } __packed ntc_257_258; } details; }; @@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong { __be64 port_rcv_packets; __be64 port_xmit_wait; __be64 port_adr_events; -} __attribute__ ((packed)); +} __packed; #define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 #define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 3f14009fb662..c8d9c4ab142b 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -51,8 +51,8 @@ * file calls, even though this violates some * expectations of harmlessness. */ -static int qib_tune_pcie_caps(struct qib_devdata *); -static int qib_tune_pcie_coalesce(struct qib_devdata *); +static void qib_tune_pcie_caps(struct qib_devdata *); +static void qib_tune_pcie_coalesce(struct qib_devdata *); /* * Do all the common PCIe setup and initialization. @@ -476,30 +476,6 @@ void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline) "pci_enable_device failed after reset: %d\n", r); } -/* code to adjust PCIe capabilities. */ - -static int fld2val(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - wd &= mask; - lsbmask = mask ^ (mask & (mask - 1)); - wd /= lsbmask; - return wd; -} - -static int val2fld(int wd, int mask) -{ - int lsbmask; - - if (!mask) - return 0; - lsbmask = mask ^ (mask & (mask - 1)); - wd *= lsbmask; - return wd; -} static int qib_pcie_coalesce; module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO); @@ -511,7 +487,7 @@ MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets"); * of these chipsets, with some BIOS settings, and enabling it on those * systems may result in the system crashing, and/or data corruption. */ -static int qib_tune_pcie_coalesce(struct qib_devdata *dd) +static void qib_tune_pcie_coalesce(struct qib_devdata *dd) { int r; struct pci_dev *parent; @@ -519,18 +495,18 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) u32 mask, bits, val; if (!qib_pcie_coalesce) - return 0; + return; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; if (parent->bus->parent) { qib_devinfo(dd->pcidev, "Parent not root\n"); - return 1; + return; } if (!pci_is_pcie(parent)) - return 1; + return; if (parent->vendor != 0x8086) - return 1; + return; /* * - bit 12: Max_rdcmp_Imt_EN: need to set to 1 @@ -563,13 +539,12 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) mask = (3U << 24) | (7U << 10); } else { /* not one of the chipsets that we know about */ - return 1; + return; } pci_read_config_dword(parent, 0x48, &val); val &= ~mask; val |= bits; r = pci_write_config_dword(parent, 0x48, val); - return 0; } /* @@ -580,55 +555,44 @@ static int qib_pcie_caps; module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); -static int qib_tune_pcie_caps(struct qib_devdata *dd) +static void qib_tune_pcie_caps(struct qib_devdata *dd) { - int ret = 1; /* Assume the worst */ struct pci_dev *parent; - u16 pcaps, pctl, ecaps, ectl; - int rc_sup, ep_sup; - int rc_cur, ep_cur; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; - if (parent->bus->parent) { + if (!pci_is_root_bus(parent->bus)) { qib_devinfo(dd->pcidev, "Parent not root\n"); - goto bail; + return; } if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - goto bail; - pcie_capability_read_word(parent, PCI_EXP_DEVCAP, &pcaps); - pcie_capability_read_word(parent, PCI_EXP_DEVCTL, &pctl); + return; + + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCAP, &ecaps); - pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; - ret = 0; /* Find max payload supported by root, endpoint */ - rc_sup = fld2val(pcaps, PCI_EXP_DEVCAP_PAYLOAD); - ep_sup = fld2val(ecaps, PCI_EXP_DEVCAP_PAYLOAD); - if (rc_sup > ep_sup) - rc_sup = ep_sup; - - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_PAYLOAD); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_PAYLOAD); + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; /* If Supported greater than limit in module param, limit it */ - if (rc_sup > (qib_pcie_caps & 7)) - rc_sup = qib_pcie_caps & 7; + if (rc_mpss > (qib_pcie_caps & 7)) + rc_mpss = qib_pcie_caps & 7; /* If less than (allowed, supported), bump root payload */ - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(rc_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); } /* If less than (allowed, supported), bump endpoint payload */ - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_PAYLOAD) | - val2fld(ep_cur, PCI_EXP_DEVCTL_PAYLOAD); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); } /* @@ -636,26 +600,22 @@ static int qib_tune_pcie_caps(struct qib_devdata *dd) * No field for max supported, but PCIe spec limits it to 4096, * which is code '5' (log2(4096) - 7) */ - rc_sup = 5; - if (rc_sup > ((qib_pcie_caps >> 4) & 7)) - rc_sup = (qib_pcie_caps >> 4) & 7; - rc_cur = fld2val(pctl, PCI_EXP_DEVCTL_READRQ); - ep_cur = fld2val(ectl, PCI_EXP_DEVCTL_READRQ); - - if (rc_sup > rc_cur) { - rc_cur = rc_sup; - pctl = (pctl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(rc_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(parent, PCI_EXP_DEVCTL, pctl); + max_mrrs = 5; + if (max_mrrs > ((qib_pcie_caps >> 4) & 7)) + max_mrrs = (qib_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); } - if (rc_sup > ep_cur) { - ep_cur = rc_sup; - ectl = (ectl & ~PCI_EXP_DEVCTL_READRQ) | - val2fld(ep_cur, PCI_EXP_DEVCTL_READRQ); - pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); } -bail: - return ret; } /* End of PCIe capability tuning */ diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index d0a0ea0c14d6..165aee2ca8a0 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -594,8 +594,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, else j = npages; - ret = get_user_pages(current, current->mm, addr, - j, 0, 1, pages, NULL); + ret = get_user_pages_fast(addr, j, 0, pages); if (ret != j) { i = 0; j = ret; @@ -1294,11 +1293,8 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, int mxp = 8; int ndesc = 0; - down_write(¤t->mm->mmap_sem); ret = qib_user_sdma_queue_pkts(dd, ppd, pq, iov, dim, &list, &mxp, &ndesc); - up_write(¤t->mm->mmap_sem); - if (ret < 0) goto done_unlock; else { diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 012e2c7575ad..a01c7d2cf541 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -150,14 +150,14 @@ struct ib_reth { __be64 vaddr; __be32 rkey; __be32 length; -} __attribute__ ((packed)); +} __packed; struct ib_atomic_eth { __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; -} __attribute__ ((packed)); +} __packed; struct qib_other_headers { __be32 bth[3]; @@ -178,7 +178,7 @@ struct qib_other_headers { __be32 aeth; struct ib_atomic_eth atomic_eth; } u; -} __attribute__ ((packed)); +} __packed; /* * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes @@ -195,12 +195,12 @@ struct qib_ib_header { } l; struct qib_other_headers oth; } u; -} __attribute__ ((packed)); +} __packed; struct qib_pio_header { __le32 pbc[2]; struct qib_ib_header hdr; -} __attribute__ ((packed)); +} __packed; /* * There is one struct qib_mcast for each multicast GID. diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index eb71aaa26a9a..c639f90cfda4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -101,6 +101,7 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -151,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -299,7 +301,7 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 7a3175400b2a..1377f85911c2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -140,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -164,7 +165,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; @@ -382,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -1556,7 +1559,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 196b1d13cbcb..6a7003ddb0be 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -685,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -704,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev) napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) @@ -746,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); @@ -974,7 +975,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, u16 new_index; int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -983,7 +984,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { /* for non-child devices must check/update the pkey value here */ @@ -1081,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 82cec1af902c..d64ed05fb082 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -119,7 +119,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -129,7 +129,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -162,7 +162,7 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -172,7 +172,7 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; @@ -1350,7 +1350,7 @@ void ipoib_setup(struct net_device *dev) ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; @@ -1372,7 +1372,7 @@ void ipoib_setup(struct net_device *dev) spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a4c662..d4e005720d01 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -386,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -407,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -434,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -484,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -510,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -751,6 +767,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index f81abe16cf09..cdc7df4fdb8a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -31,6 +31,7 @@ */ #include <linux/netdevice.h> +#include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/module.h> #include <net/rtnetlink.h> #include "ipoib.h" @@ -103,7 +104,7 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, return -EINVAL; pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!pdev) + if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; ppriv = netdev_priv(pdev); @@ -142,10 +143,10 @@ static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head priv = netdev_priv(dev); ppriv = netdev_priv(priv->parent); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); unregister_netdevice_queue(dev, head); list_del(&priv->list); - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); } static size_t ipoib_get_size(const struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8292554bccb5..9fad7b5ac8b9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -140,7 +140,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); /* * First ensure this isn't a duplicate. We check the parent device and @@ -163,7 +163,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); out: - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); if (result) free_netdev(priv->dev); @@ -185,7 +185,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { @@ -195,7 +196,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); if (dev) { diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig index ce3fd32167dc..02f9759ebb1a 100644 --- a/drivers/infiniband/ulp/isert/Kconfig +++ b/drivers/infiniband/ulp/isert/Kconfig @@ -1,5 +1,5 @@ config INFINIBAND_ISERT - tristate "iSCSI Extentions for RDMA (iSER) target support" + tristate "iSCSI Extensions for RDMA (iSER) target support" depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET ---help--- - Support for iSCSI Extentions for RDMA (iSER) Target on Infiniband fabrics. + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 6df23502059a..9804fca6bf06 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -22,6 +22,7 @@ #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/llist.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <target/target_core_base.h> @@ -206,7 +207,9 @@ isert_free_rx_descriptors(struct isert_conn *isert_conn) isert_conn->conn_rx_descs = NULL; } +static void isert_cq_tx_work(struct work_struct *); static void isert_cq_tx_callback(struct ib_cq *, void *); +static void isert_cq_rx_work(struct work_struct *); static void isert_cq_rx_callback(struct ib_cq *, void *); static int @@ -258,26 +261,36 @@ isert_create_device_ib_res(struct isert_device *device) cq_desc[i].device = device; cq_desc[i].cq_index = i; + INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work); device->dev_rx_cq[i] = ib_create_cq(device->ib_device, isert_cq_rx_callback, isert_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_RX_CQ_LEN, i); - if (IS_ERR(device->dev_rx_cq[i])) + if (IS_ERR(device->dev_rx_cq[i])) { + ret = PTR_ERR(device->dev_rx_cq[i]); + device->dev_rx_cq[i] = NULL; goto out_cq; + } + INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work); device->dev_tx_cq[i] = ib_create_cq(device->ib_device, isert_cq_tx_callback, isert_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_TX_CQ_LEN, i); - if (IS_ERR(device->dev_tx_cq[i])) + if (IS_ERR(device->dev_tx_cq[i])) { + ret = PTR_ERR(device->dev_tx_cq[i]); + device->dev_tx_cq[i] = NULL; goto out_cq; + } - if (ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP)) + ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP); + if (ret) goto out_cq; - if (ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP)) + ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP); + if (ret) goto out_cq; } @@ -489,6 +502,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) kref_init(&isert_conn->conn_kref); kref_get(&isert_conn->conn_kref); mutex_init(&isert_conn->conn_mutex); + mutex_init(&isert_conn->conn_comp_mutex); spin_lock_init(&isert_conn->conn_lock); cma_id->context = isert_conn; @@ -843,14 +857,32 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn, } static void -isert_init_send_wr(struct isert_cmd *isert_cmd, struct ib_send_wr *send_wr) +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, bool coalesce) { + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc; send_wr->opcode = IB_WR_SEND; - send_wr->send_flags = IB_SEND_SIGNALED; - send_wr->sg_list = &isert_cmd->tx_desc.tx_sg[0]; + send_wr->sg_list = &tx_desc->tx_sg[0]; send_wr->num_sge = isert_cmd->tx_desc.num_sge; + /* + * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED + * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls. + */ + mutex_lock(&isert_conn->conn_comp_mutex); + if (coalesce && + ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) { + llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_comp_mutex); + return; + } + isert_conn->conn_comp_batch = 0; + tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist); + mutex_unlock(&isert_conn->conn_comp_mutex); + + send_wr->send_flags = IB_SEND_SIGNALED; } static int @@ -1582,8 +1614,8 @@ isert_response_completion(struct iser_tx_desc *tx_desc, } static void -isert_send_completion(struct iser_tx_desc *tx_desc, - struct isert_conn *isert_conn) +__isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) { struct ib_device *ib_dev = isert_conn->conn_cm_id->device; struct isert_cmd *isert_cmd = tx_desc->isert_cmd; @@ -1624,6 +1656,24 @@ isert_send_completion(struct iser_tx_desc *tx_desc, } static void +isert_send_completion(struct iser_tx_desc *tx_desc, + struct isert_conn *isert_conn) +{ + struct llist_node *llnode = tx_desc->comp_llnode_batch; + struct iser_tx_desc *t; + /* + * Drain coalesced completion llist starting from comp_llnode_batch + * setup in isert_init_send_wr(), and then complete trailing tx_desc. + */ + while (llnode) { + t = llist_entry(llnode, struct iser_tx_desc, comp_llnode); + llnode = llist_next(llnode); + __isert_send_completion(t, isert_conn); + } + __isert_send_completion(tx_desc, isert_conn); +} + +static void isert_cq_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) { struct ib_device *ib_dev = isert_conn->conn_cm_id->device; @@ -1686,7 +1736,6 @@ isert_cq_tx_callback(struct ib_cq *cq, void *context) { struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; - INIT_WORK(&cq_desc->cq_tx_work, isert_cq_tx_work); queue_work(isert_comp_wq, &cq_desc->cq_tx_work); } @@ -1730,7 +1779,6 @@ isert_cq_rx_callback(struct ib_cq *cq, void *context) { struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context; - INIT_WORK(&cq_desc->cq_rx_work, isert_cq_rx_work); queue_work(isert_rx_wq, &cq_desc->cq_rx_work); } @@ -1793,7 +1841,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1813,7 +1861,7 @@ isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, &isert_cmd->tx_desc.iscsi_header, nopout_response); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1831,7 +1879,7 @@ isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1849,7 +1897,7 @@ isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1881,7 +1929,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) tx_dsg->lkey = isert_conn->conn_mr->lkey; isert_cmd->tx_desc.num_sge = 2; - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1921,7 +1969,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) tx_dsg->lkey = isert_conn->conn_mr->lkey; isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_cmd, send_wr); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -1991,8 +2039,6 @@ isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { data_left = se_cmd->data_length; - iscsit_increment_maxcmdsn(cmd, conn->sess); - cmd->stat_sn = conn->stat_sn++; } else { sg_off = cmd->write_data_done / PAGE_SIZE; data_left = se_cmd->data_length - cmd->write_data_done; @@ -2204,8 +2250,6 @@ isert_reg_rdma_frwr(struct iscsi_conn *conn, struct iscsi_cmd *cmd, if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) { data_left = se_cmd->data_length; - iscsit_increment_maxcmdsn(cmd, conn->sess); - cmd->stat_sn = conn->stat_sn++; } else { sg_off = cmd->write_data_done / PAGE_SIZE; data_left = se_cmd->data_length - cmd->write_data_done; @@ -2259,18 +2303,26 @@ isert_reg_rdma_frwr(struct iscsi_conn *conn, struct iscsi_cmd *cmd, data_len = min(data_left, rdma_write_max); wr->cur_rdma_length = data_len; - spin_lock_irqsave(&isert_conn->conn_lock, flags); - fr_desc = list_first_entry(&isert_conn->conn_frwr_pool, - struct fast_reg_descriptor, list); - list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->conn_lock, flags); - wr->fr_desc = fr_desc; + /* if there is a single dma entry, dma mr is sufficient */ + if (count == 1) { + ib_sge->addr = ib_sg_dma_address(ib_dev, &sg_start[0]); + ib_sge->length = ib_sg_dma_len(ib_dev, &sg_start[0]); + ib_sge->lkey = isert_conn->conn_mr->lkey; + wr->fr_desc = NULL; + } else { + spin_lock_irqsave(&isert_conn->conn_lock, flags); + fr_desc = list_first_entry(&isert_conn->conn_frwr_pool, + struct fast_reg_descriptor, list); + list_del(&fr_desc->list); + spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + wr->fr_desc = fr_desc; - ret = isert_fast_reg_mr(fr_desc, isert_cmd, isert_conn, - ib_sge, offset, data_len); - if (ret) { - list_add_tail(&fr_desc->list, &isert_conn->conn_frwr_pool); - goto unmap_sg; + ret = isert_fast_reg_mr(fr_desc, isert_cmd, isert_conn, + ib_sge, offset, data_len); + if (ret) { + list_add_tail(&fr_desc->list, &isert_conn->conn_frwr_pool); + goto unmap_sg; + } } return 0; @@ -2306,10 +2358,11 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) * Build isert_conn->tx_desc for iSCSI response PDU and attach */ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); - iscsit_build_rsp_pdu(cmd, conn, false, (struct iscsi_scsi_rsp *) + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); - isert_init_send_wr(isert_cmd, &isert_cmd->tx_desc.send_wr); + isert_init_send_wr(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, true); atomic_inc(&isert_conn->post_send_buf_count); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 631f2090f0b8..691f90ff2d83 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -43,6 +43,8 @@ struct iser_tx_desc { struct ib_sge tx_sg[2]; int num_sge; struct isert_cmd *isert_cmd; + struct llist_node *comp_llnode_batch; + struct llist_node comp_llnode; struct ib_send_wr send_wr; } __packed; @@ -121,6 +123,10 @@ struct isert_conn { int conn_frwr_pool_size; /* lock to protect frwr_pool */ spinlock_t conn_lock; +#define ISERT_COMP_BATCH_COUNT 8 + int conn_comp_batch; + struct llist_head conn_comp_llist; + struct mutex conn_comp_mutex; }; #define ISERT_MAX_CQ 64 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f93baf8254c4..a88631918e85 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -46,6 +46,7 @@ #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_dbg.h> +#include <scsi/scsi_tcq.h> #include <scsi/srp.h> #include <scsi/scsi_transport_srp.h> @@ -86,6 +87,32 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +static struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); @@ -102,6 +129,48 @@ static struct ib_client srp_client = { static struct ib_sa_client srp_sa_client; +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; @@ -231,16 +300,16 @@ static int srp_create_target_ib(struct srp_target_port *target) return -ENOMEM; recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, - target->comp_vector); + srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, - target->comp_vector); + srp_send_completion, NULL, target, + target->queue_size, target->comp_vector); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; @@ -249,8 +318,8 @@ static int srp_create_target_ib(struct srp_target_port *target) ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = SRP_SQ_SIZE; - init_attr->cap.max_recv_wr = SRP_RQ_SIZE; + init_attr->cap.max_send_wr = target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; @@ -296,6 +365,10 @@ err: return ret; } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ static void srp_free_target_ib(struct srp_target_port *target) { int i; @@ -307,10 +380,18 @@ static void srp_free_target_ib(struct srp_target_port *target) target->qp = NULL; target->send_cq = target->recv_cq = NULL; - for (i = 0; i < SRP_RQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->tx_ring[i]); + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } } static void srp_path_rec_completion(int status, @@ -390,7 +471,7 @@ static int srp_send_req(struct srp_target_port *target) req->param.responder_resources = 4; req->param.remote_cm_response_timeout = 20; req->param.local_cm_response_timeout = 20; - req->param.retry_count = 7; + req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; @@ -496,7 +577,11 @@ static void srp_free_req_data(struct srp_target_port *target) struct srp_request *req; int i; - for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { + if (!target->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; kfree(req->fmr_list); kfree(req->map_page); if (req->indirect_dma_addr) { @@ -506,6 +591,50 @@ static void srp_free_req_data(struct srp_target_port *target) } kfree(req->indirect_desc); } + + kfree(target->req_ring); + target->req_ring = NULL; +} + +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; + req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + req->map_page = kmalloc(SRP_FMR_SIZE * sizeof(void *), + GFP_KERNEL); + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->fmr_list || !req->map_page || !req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; } /** @@ -528,12 +657,20 @@ static void srp_remove_target(struct srp_target_port *target) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + scsi_host_put(target->scsi_host); } @@ -545,10 +682,6 @@ static void srp_remove_work(struct work_struct *work) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_remove_target(target); - - spin_lock(&target->srp_host->target_lock); - list_del(&target->list); - spin_unlock(&target->srp_host->target_lock); } static void srp_rport_delete(struct srp_rport *rport) @@ -686,23 +819,42 @@ static void srp_free_req(struct srp_target_port *target, spin_unlock_irqrestore(&target->lock, flags); } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, int result) { struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); if (scmnd) { srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_RESET << 16; + scmnd->result = result; scmnd->scsi_done(scmnd); } } -static int srp_reconnect_target(struct srp_target_port *target) +static void srp_terminate_io(struct srp_rport *rport) { - struct Scsi_Host *shost = target->scsi_host; - int i, ret; + struct srp_target_port *target = rport->lld_data; + int i; - scsi_target_block(&shost->shost_gendev); + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16); + } +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; srp_disconnect_target(target); /* @@ -721,41 +873,21 @@ static int srp_reconnect_target(struct srp_target_port *target) else srp_create_target_ib(target); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd) - srp_reset_req(target, req); + srp_finish_req(target, req, DID_RESET << 16); } INIT_LIST_HEAD(&target->free_tx); - for (i = 0; i < SRP_SQ_SIZE; ++i) + for (i = 0; i < target->queue_size; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); if (ret == 0) ret = srp_connect_target(target); - scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING : - SDEV_TRANSPORT_OFFLINE); - target->transport_offline = !!ret; - - if (ret) - goto err; - - shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n"); - - return ret; - -err: - shost_printk(KERN_ERR, target->scsi_host, - PFX "reconnect failed (%d), removing target port.\n", ret); - - /* - * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we - * are in the context of the SCSI error handler now, which - * will deadlock if we call scsi_remove_host(). - */ - srp_queue_remove_work(target); + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); return ret; } @@ -1302,15 +1434,30 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) PFX "Recv failed with error code %d\n", res); } -static void srp_handle_qp_err(enum ib_wc_status wc_status, - enum ib_wc_opcode wc_opcode, +/** + * srp_tl_err_work() - handle a transport layer error + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(enum ib_wc_status wc_status, bool send_err, struct srp_target_port *target) { if (target->connected && !target->qp_in_error) { shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", - wc_opcode & IB_WC_RECV ? "receive" : "send", + send_err ? "send" : "receive", wc_status); + queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; } @@ -1325,7 +1472,7 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) if (likely(wc.status == IB_WC_SUCCESS)) { srp_handle_recv(target, &wc); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.status, false, target); } } } @@ -1341,7 +1488,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) iu = (struct srp_iu *) (uintptr_t) wc.wr_id; list_add(&iu->list, &target->free_tx); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.status, true, target); } } } @@ -1349,17 +1496,29 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; - int len; + int len, result; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; + + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); - if (unlikely(target->transport_offline)) { - scmnd->result = DID_NO_CONNECT << 16; + result = srp_chkready(target->rport); + if (unlikely(result)) { + scmnd->result = result; scmnd->scsi_done(scmnd); - return 0; + goto unlock_rport; } spin_lock_irqsave(&target->lock, flags); @@ -1404,6 +1563,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_unmap; } +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + return 0; err_unmap: @@ -1418,14 +1581,30 @@ err_iu: err_unlock: spin_unlock_irqrestore(&target->lock, flags); + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + return SCSI_MLQUEUE_HOST_BUSY; } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; - for (i = 0; i < SRP_RQ_SIZE; ++i) { + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { target->rx_ring[i] = srp_alloc_iu(target->srp_host, target->max_ti_iu_len, GFP_KERNEL, DMA_FROM_DEVICE); @@ -1433,7 +1612,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); @@ -1446,16 +1625,18 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) return 0; err: - for (i = 0; i < SRP_RQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { srp_free_iu(target->srp_host, target->rx_ring[i]); - target->rx_ring[i] = NULL; - } - - for (i = 0; i < SRP_SQ_SIZE; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); - target->tx_ring[i] = NULL; } + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; + return -ENOMEM; } @@ -1506,6 +1687,9 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, target->scsi_host->can_queue = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); @@ -1513,7 +1697,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - if (!target->rx_ring[0]) { + if (!target->rx_ring) { ret = srp_alloc_iu_bufs(target); if (ret) goto error; @@ -1533,7 +1717,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; - for (i = 0; i < SRP_RQ_SIZE; i++) { + for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = target->rx_ring[i]; ret = srp_post_recv(target, iu); if (ret) @@ -1672,6 +1856,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: @@ -1698,9 +1883,61 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + static int srp_send_tsk_mgmt(struct srp_target_port *target, u64 req_tag, unsigned int lun, u8 func) { + struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; @@ -1710,12 +1947,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&target->tsk_mgmt_done); + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); spin_lock_irq(&target->lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); spin_unlock_irq(&target->lock); - if (!iu) + if (!iu) { + mutex_unlock(&rport->mutex); + return -1; + } ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); @@ -1732,8 +1977,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, DMA_TO_DEVICE); if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + return -1; } + mutex_unlock(&rport->mutex); if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) @@ -1751,11 +1999,11 @@ static int srp_abort(struct scsi_cmnd *scmnd) shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); if (!req || !srp_claim_req(target, req, scmnd)) - return FAILED; + return SUCCESS; if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, SRP_TSK_ABORT_TASK) == 0) ret = SUCCESS; - else if (target->transport_offline) + else if (target->rport->state == SRP_RPORT_LOST) ret = FAST_IO_FAIL; else ret = FAILED; @@ -1779,10 +2027,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) if (target->tsk_mgmt_status) return FAILED; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd && req->scmnd->device == scmnd->device) - srp_reset_req(target, req); + srp_finish_req(target, req, DID_RESET << 16); } return SUCCESS; @@ -1791,14 +2039,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int ret = FAILED; shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); - if (!srp_reconnect_target(target)) - ret = SUCCESS; - - return ret; + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } static int srp_slave_configure(struct scsi_device *sdev) @@ -1851,6 +2095,14 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); } +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1907,6 +2159,14 @@ static ssize_t show_comp_vector(struct device *dev, return sprintf(buf, "%d\n", target->comp_vector); } +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + static ssize_t show_cmd_sg_entries(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1927,6 +2187,7 @@ static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); @@ -1934,6 +2195,7 @@ static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); @@ -1942,6 +2204,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_ioc_guid, &dev_attr_service_id, &dev_attr_pkey, + &dev_attr_sgid, &dev_attr_dgid, &dev_attr_orig_dgid, &dev_attr_req_lim, @@ -1949,6 +2212,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_local_ib_port, &dev_attr_local_ib_device, &dev_attr_comp_vector, + &dev_attr_tl_retry_count, &dev_attr_cmd_sg_entries, &dev_attr_allow_ext_sg, NULL @@ -1961,14 +2225,16 @@ static struct scsi_host_template srp_template = { .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, .skip_settle_delay = true, .sg_tablesize = SRP_DEF_SG_TABLESIZE, - .can_queue = SRP_CMD_SQ_SIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_CMD_SQ_SIZE, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -1994,6 +2260,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) } rport->lld_data = target; + target->rport = rport; spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); @@ -2073,6 +2340,8 @@ enum { SRP_OPT_ALLOW_EXT_SG = 1 << 10, SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -2094,6 +2363,8 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, { SRP_OPT_ERR, NULL } }; @@ -2188,13 +2459,25 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->max_sectors = token; break; + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token)) { + if (match_int(args, &token) || token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); + target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_IO_CLASS: @@ -2257,6 +2540,15 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->comp_vector = token; break; + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); @@ -2273,6 +2565,12 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("target creation request is missing parameter '%s'\n", srp_opt_tokens[i].pattern); + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + out: kfree(options); return ret; @@ -2287,8 +2585,7 @@ static ssize_t srp_create_target(struct device *dev, struct Scsi_Host *target_host; struct srp_target_port *target; struct ib_device *ibdev = host->srp_dev->dev; - dma_addr_t dma_addr; - int i, ret; + int ret; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -2311,11 +2608,15 @@ static ssize_t srp_create_target(struct device *dev, target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; ret = srp_parse_options(buf, target); if (ret) goto err; + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + if (!srp_conn_unique(target->srp_host, target)) { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", @@ -2339,31 +2640,13 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), - GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), - GFP_KERNEL); - req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) - goto err_free_mem; - - dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, - target->indirect_size, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(ibdev, dma_addr)) - goto err_free_mem; - - req->indirect_dma_addr = dma_addr; - req->index = i; - list_add_tail(&req->list, &target->free_reqs); - } + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; ib_query_gid(ibdev, host->port, 0, &target->path.sgid); @@ -2612,7 +2895,14 @@ static void srp_remove_one(struct ib_device *device) } static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, }; static int __init srp_init_module(void) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index e641088c14dc..575681063f38 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -57,14 +57,11 @@ enum { SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, - SRP_RQ_SHIFT = 6, - SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - - SRP_SQ_SIZE = SRP_RQ_SIZE, + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, SRP_RSP_SQ_SIZE = 1, - SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, SRP_TSK_MGMT_SQ_SIZE = 1, - SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, @@ -140,7 +137,6 @@ struct srp_target_port { unsigned int cmd_sg_cnt; unsigned int indirect_size; bool allow_ext_sg; - bool transport_offline; /* Everything above this point is used in the hot path of * command processing. Try to keep them packed into cachelines. @@ -153,10 +149,14 @@ struct srp_target_port { u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int queue_size; + int req_ring_size; int comp_vector; + int tl_retry_count; struct ib_sa_path_rec path; __be16 orig_dgid[8]; @@ -172,10 +172,11 @@ struct srp_target_port { int zero_req_lim; - struct srp_iu *tx_ring[SRP_SQ_SIZE]; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; + struct work_struct tl_err_work; struct work_struct remove_work; struct list_head list; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 6c923c7039a1..520a7e5a490b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1352,11 +1352,8 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) /* XXX(hch): this is a horrible layering violation.. */ spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); - - complete(&ioctx->cmd.transport_lun_stop_comp); break; case SRPT_STATE_CMD_RSP_SENT: /* @@ -1364,9 +1361,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * not been received in time. */ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd); break; case SRPT_STATE_MGMT_RSP_SENT: @@ -1476,7 +1470,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, { struct se_cmd *cmd; enum srpt_command_state state; - unsigned long flags; cmd = &ioctx->cmd; state = srpt_get_cmd_state(ioctx); @@ -1496,9 +1489,6 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, __func__, __LINE__, state); break; case SRPT_RDMA_WRITE_LAST: - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state |= CMD_T_LUN_STOP; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); break; default: printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, |