diff options
65 files changed, 3058 insertions, 1204 deletions
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 53343ffbff7a..cb00d59da456 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1043,8 +1043,8 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s\n", - ret, device->name); + pr_warn("ib_query_port failed (%d) for %s\n", + ret, device->name); goto err; } @@ -1067,8 +1067,8 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); + pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); goto err; } } @@ -1078,8 +1078,8 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_gid(device, port, i, gid_cache->table + i, NULL); if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); goto err; } } @@ -1161,8 +1161,7 @@ int ib_cache_setup_one(struct ib_device *device) GFP_KERNEL); if (!device->cache.pkey_cache || !device->cache.lmc_cache) { - printk(KERN_WARNING "Couldn't allocate cache " - "for %s\n", device->name); + pr_warn("Couldn't allocate cache for %s\n", device->name); return -ENOMEM; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9729639df407..93ab0ae97208 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1206,6 +1206,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; @@ -1213,6 +1217,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; @@ -1713,7 +1721,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -2186,8 +2194,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " - "listening on device %s\n", ret, cma_dev->device->name); + pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", + ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -3239,7 +3247,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = 0; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -4003,8 +4011,8 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->name, &id_priv->id); + pr_info("RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -4287,7 +4295,7 @@ static int __init cma_init(void) goto err; if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) - printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + pr_warn("RDMA CMA: failed to add netlink callback\n"); cma_configfs_init(); return 0; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 00da80e02154..270c7ff6cba7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -115,8 +115,8 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - printk(KERN_WARNING "Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + pr_warn("Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); return -EINVAL; } } @@ -255,8 +255,8 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { - printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", - device->name, client->name); + pr_warn("Couldn't allocate client context for %s/%s\n", + device->name, client->name); return -ENOMEM; } @@ -343,28 +343,29 @@ int ib_register_device(struct ib_device *device, ret = read_port_immutable(device); if (ret) { - printk(KERN_WARNING "Couldn't create per port immutable data %s\n", - device->name); + pr_warn("Couldn't create per port immutable data %s\n", + device->name); goto out; } ret = ib_cache_setup_one(device); if (ret) { - printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); goto out; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - printk(KERN_WARNING "Couldn't query the device attributes\n"); + pr_warn("Couldn't query the device attributes\n"); + ib_cache_cleanup_one(device); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - printk(KERN_WARNING "Couldn't register device %s with driver model\n", - device->name); + pr_warn("Couldn't register device %s with driver model\n", + device->name); ib_cache_cleanup_one(device); goto out; } @@ -565,8 +566,8 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, goto out; } - printk(KERN_WARNING "No client context found for %s/%s\n", - device->name, client->name); + pr_warn("No client context found for %s/%s\n", + device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); @@ -959,13 +960,13 @@ static int __init ib_core_init(void) ret = class_register(&ib_class); if (ret) { - printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp; } ret = ibnl_init(); if (ret) { - printk(KERN_WARNING "Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface\n"); goto err_sysfs; } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 6ac3683c144b..cdbb1f1a6d97 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -150,8 +150,8 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", - fmr, fmr->ref_count); + pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); } #endif } @@ -167,7 +167,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) ret = ib_unmap_fmr(&fmr_list); if (ret) - printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); @@ -222,8 +222,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - printk(KERN_INFO PFX "Device %s does not support FMRs\n", - device->name); + pr_info(PFX "Device %s does not support FMRs\n", device->name); return ERR_PTR(-ENOSYS); } @@ -233,13 +232,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); + if (!pool) return ERR_PTR(-ENOMEM); - } pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; pool->flush_arg = params->flush_arg; @@ -251,7 +247,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, GFP_KERNEL); if (!pool->cache_bucket) { - printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); + pr_warn(PFX "Failed to allocate cache in pool\n"); ret = -ENOMEM; goto out_free_pool; } @@ -275,7 +271,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, "ib_fmr(%s)", device->name); if (IS_ERR(pool->thread)) { - printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); + pr_warn(PFX "couldn't start cleanup thread\n"); ret = PTR_ERR(pool->thread); goto out_free_pool; } @@ -294,11 +290,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, for (i = 0; i < params->pool_size; ++i) { fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) { - printk(KERN_WARNING PFX "failed to allocate fmr " - "struct for FMR %d\n", i); + if (!fmr) goto out_fail; - } fmr->pool = pool; fmr->remap_count = 0; @@ -307,8 +300,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { - printk(KERN_WARNING PFX "fmr_create failed " - "for FMR %d\n", i); + pr_warn(PFX "fmr_create failed for FMR %d\n", + i); kfree(fmr); goto out_fail; } @@ -363,8 +356,8 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) } if (i < pool->pool_size) - printk(KERN_WARNING PFX "pool still has %d regions registered\n", - pool->pool_size - i); + pr_warn(PFX "pool still has %d regions registered\n", + pool->pool_size - i); kfree(pool->cache_bucket); kfree(pool); @@ -463,7 +456,7 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, list_add(&fmr->list, &pool->free_list); spin_unlock_irqrestore(&pool->pool_lock, flags); - printk(KERN_WARNING PFX "fmr_map returns %d\n", result); + pr_warn(PFX "fmr_map returns %d\n", result); return ERR_PTR(result); } @@ -517,8 +510,8 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) #ifdef DEBUG if (fmr->ref_count < 0) - printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); + pr_warn(PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); #endif spin_unlock_irqrestore(&pool->pool_lock, flags); diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c index 1b65986c0be3..19b1ee3279b4 100644 --- a/drivers/infiniband/core/packer.c +++ b/drivers/infiniband/core/packer.c @@ -44,7 +44,7 @@ static u64 value_read(int offset, int size, void *structure) case 4: return be32_to_cpup((__be32 *) (structure + offset)); case 8: return be64_to_cpup((__be64 *) (structure + offset)); default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); return 0; } } @@ -104,9 +104,8 @@ void ib_pack(const struct ib_field *desc, } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } if (desc[i].struct_size_bytes) @@ -132,7 +131,7 @@ static void value_write(int offset, int size, u64 val, void *structure) case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); } } @@ -188,9 +187,8 @@ void ib_unpack(const struct ib_field *desc, } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } memcpy(structure + desc[i].struct_offset_bytes, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f334090bb612..8e3bf6c8d3c3 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -864,13 +864,12 @@ static void update_sm_ah(struct work_struct *work) struct ib_ah_attr ah_attr; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { - printk(KERN_WARNING "Couldn't query port\n"); + pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); if (!new_ah) { - printk(KERN_WARNING "Couldn't allocate new SM AH\n"); return; } @@ -880,7 +879,7 @@ static void update_sm_ah(struct work_struct *work) new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) - printk(KERN_ERR "Couldn't find index for default PKey\n"); + pr_err("Couldn't find index for default PKey\n"); memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = port_attr.sm_lid; @@ -889,7 +888,7 @@ static void update_sm_ah(struct work_struct *work) new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { - printk(KERN_WARNING "Couldn't create new SM AH\n"); + pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } @@ -1221,7 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, rec.net = NULL; rec.ifindex = 0; rec.gid_type = IB_GID_TYPE_IB; - memset(rec.dmac, 0, ETH_ALEN); + eth_zero_addr(rec.dmac); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -1800,13 +1799,13 @@ static int __init ib_sa_init(void) ret = ib_register_client(&sa_client); if (ret) { - printk(KERN_ERR "Couldn't register ib_sa client\n"); + pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { - printk(KERN_ERR "Couldn't initialize multicast handling\n"); + pr_err("Couldn't initialize multicast handling\n"); goto err2; } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 6b4e8a008bc0..4a9aa0433b07 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1234,7 +1234,7 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + pr_err("ucm: couldn't register dynamic device number\n"); return ret; } } @@ -1329,19 +1329,19 @@ static int __init ib_ucm_init(void) ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register device number\n"); + pr_err("ucm: couldn't register device number\n"); goto error1; } ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); + pr_err("ucm: couldn't create abi_version attribute\n"); goto error2; } ret = ib_register_client(&ucm_client); if (ret) { - printk(KERN_ERR "ucm: couldn't register client\n"); + pr_err("ucm: couldn't register client\n"); goto error3; } return 0; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 8b5a934e1133..dd3bcceadfde 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -314,7 +314,7 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) } } if (!event_found) - printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n"); + pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -1716,13 +1716,13 @@ static int __init ucma_init(void) ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { - printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); + pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); if (!ucma_ctl_table_hdr) { - printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n"); + pr_err("rdma_ucm: couldn't register sysctl paths\n"); ret = -ENOMEM; goto err2; } diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 2116132568e7..29a45d2f8898 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -479,8 +479,8 @@ int ib_ud_header_unpack(void *buf, buf += IB_LRH_BYTES; if (header->lrh.link_version != 0) { - printk(KERN_WARNING "Invalid LRH.link_version %d\n", - header->lrh.link_version); + pr_warn("Invalid LRH.link_version %d\n", + header->lrh.link_version); return -EINVAL; } @@ -496,20 +496,20 @@ int ib_ud_header_unpack(void *buf, buf += IB_GRH_BYTES; if (header->grh.ip_version != 6) { - printk(KERN_WARNING "Invalid GRH.ip_version %d\n", - header->grh.ip_version); + pr_warn("Invalid GRH.ip_version %d\n", + header->grh.ip_version); return -EINVAL; } if (header->grh.next_header != 0x1b) { - printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); return -EINVAL; } break; default: - printk(KERN_WARNING "Invalid LRH.link_next_header %d\n", - header->lrh.link_next_header); + pr_warn("Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); return -EINVAL; } @@ -525,14 +525,13 @@ int ib_ud_header_unpack(void *buf, header->immediate_present = 1; break; default: - printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n", - header->bth.opcode); + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); return -EINVAL; } if (header->bth.transport_header_version != 0) { - printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n", - header->bth.transport_header_version); + pr_warn("Invalid BTH.transport_header_version %d\n", + header->bth.transport_header_version); return -EINVAL; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6ffc9c4e93af..3638c787cb7c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1174,6 +1174,7 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; + struct ib_udata udata; int ret; if (out_len < sizeof(resp)) @@ -1195,7 +1196,12 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, goto err_free; } - mw = pd->device->alloc_mw(pd, cmd.mw_type); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; @@ -1970,7 +1976,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, resp_size); INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + resp_size, - in_len - sizeof(cmd), out_len - resp_size); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -3085,6 +3092,14 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; @@ -3413,7 +3428,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3439,7 +3455,8 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 39680aed99dd..28ba2cc81535 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -683,12 +683,28 @@ out: return ev_file; } +static int verify_command_mask(struct ib_device *ib_dev, __u32 command) +{ + u64 mask; + + if (command <= IB_USER_VERBS_CMD_OPEN_QP) + mask = ib_dev->uverbs_cmd_mask; + else + mask = ib_dev->uverbs_ex_cmd_mask; + + if (mask & ((u64)1 << command)) + return 0; + + return -1; +} + static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; + __u32 command; __u32 flags; int srcu_key; ssize_t ret; @@ -707,37 +723,34 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - flags = (hdr.command & - IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } - if (!flags) { - __u32 command; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + if (verify_command_mask(ib_dev, command)) { + ret = -EOPNOTSUPP; + goto out; + } - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) { - ret = -EINVAL; - goto out; - } + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + ret = -EINVAL; + goto out; + } - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + if (!flags) { if (command >= ARRAY_SIZE(uverbs_cmd_table) || !uverbs_cmd_table[command]) { ret = -EINVAL; goto out; } - if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) { - ret = -EINVAL; - goto out; - } - - if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) { - ret = -ENOSYS; - goto out; - } - if (hdr.in_words * 4 != count) { ret = -EINVAL; goto out; @@ -749,21 +762,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, hdr.out_words * 4); } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { - __u32 command; - struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; size_t written_count = count; - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) { - ret = -EINVAL; - goto out; - } - - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || !uverbs_ex_cmd_table[command]) { ret = -ENOSYS; @@ -775,11 +778,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) { - ret = -ENOSYS; - goto out; - } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) { ret = -EINVAL; goto out; @@ -1058,7 +1056,7 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + pr_err("user_verbs: couldn't register dynamic device number\n"); return ret; } } @@ -1279,14 +1277,14 @@ static int __init ib_uverbs_init(void) ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register device number\n"); + pr_err("user_verbs: couldn't register device number\n"); goto out; } uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); - printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n"); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } @@ -1294,13 +1292,13 @@ static int __init ib_uverbs_init(void) ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); + pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&uverbs_client); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register client\n"); + pr_err("user_verbs: couldn't register client\n"); goto out_class; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5af6d024e053..5cd1e3987f2b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1567,6 +1567,8 @@ EXPORT_SYMBOL(ib_check_mr_status); * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS_REG, non of these + * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. * @@ -1657,3 +1659,167 @@ next_page: return i; } EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. + */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_send_wr swr = {}, *bad_swr; + int ret; + + if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + ret = ib_post_send(qp, &swr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. + */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}, *bad_rwr; + int ret; + + if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + ret = ib_post_recv(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->drain_sq) + qp->device->drain_sq(qp); + else + __ib_drain_sq(qp); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->drain_rq) + qp->device->drain_rq(qp); + else + __ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 2734820d291b..42a7b8952d13 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -657,7 +657,8 @@ err: return ERR_PTR(err); } -static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_pd *php; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index cf21df4a8bf5..b4eeb783573c 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -815,8 +815,15 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) } } out: - if (wq) + if (wq) { + if (unlikely(qhp->attr.state != C4IW_QP_STATE_RTS)) { + if (t4_sq_empty(wq)) + complete(&qhp->sq_drained); + if (t4_rq_empty(wq)) + complete(&qhp->rq_drained); + } spin_unlock(&qhp->lock); + } return ret; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index fb2de75a0392..97c0e5aa5c64 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -476,6 +476,8 @@ struct c4iw_qp { wait_queue_head_t wait; struct timer_list timer; int sq_sig_all; + struct completion rq_drained; + struct completion sq_drained; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) @@ -961,7 +963,8 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents); int c4iw_dealloc_mw(struct ib_mw *mw); -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); @@ -1016,6 +1019,8 @@ extern int c4iw_wr_log; extern int db_fc_threshold; extern int db_coalescing_threshold; extern int use_dsgl; +void c4iw_drain_rq(struct ib_qp *qp); +void c4iw_drain_sq(struct ib_qp *qp); #endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 7849890c4781..766d39cff06c 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -34,6 +34,7 @@ #include <linux/moduleparam.h> #include <rdma/ib_umem.h> #include <linux/atomic.h> +#include <rdma/ib_user_verbs.h> #include "iw_cxgb4.h" @@ -552,7 +553,8 @@ err: return ERR_PTR(err); } -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_pd *php; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index ec04272fbdc2..104662d38d1e 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -564,6 +564,8 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.get_protocol_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; + dev->ibdev.drain_sq = c4iw_drain_sq; + dev->ibdev.drain_rq = c4iw_drain_rq; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index e99345eb875a..7b1b1e840ef1 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1697,6 +1697,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.max_ird = 0; qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; spin_lock_init(&qhp->lock); + init_completion(&qhp->sq_drained); + init_completion(&qhp->rq_drained); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); @@ -1888,3 +1890,17 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; return 0; } + +void c4iw_drain_sq(struct ib_qp *ibqp) +{ + struct c4iw_qp *qp = to_c4iw_qp(ibqp); + + wait_for_completion(&qp->sq_drained); +} + +void c4iw_drain_rq(struct ib_qp *ibqp) +{ + struct c4iw_qp *qp = to_c4iw_qp(ibqp); + + wait_for_completion(&qp->rq_drained); +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 21cb41a60fe8..c74ef2620b85 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -310,7 +310,7 @@ static void aliasguid_query_handler(int status, if (status) { pr_debug("(port: %d) failed: status = %d\n", cb_ctx->port, status); - rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC; + rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; goto out; } @@ -416,7 +416,7 @@ next_entry: be64_to_cpu((__force __be64)rec->guid_indexes), be64_to_cpu((__force __be64)applied_guid_indexes), be64_to_cpu((__force __be64)declined_guid_indexes)); - rec->time_to_run = ktime_get_real_ns() + + rec->time_to_run = ktime_get_boot_ns() + resched_delay_sec * NSEC_PER_SEC; } else { rec->status = MLX4_GUID_INFO_STATUS_SET; @@ -708,7 +708,7 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, } } if (resched_delay_sec) { - u64 curr_time = ktime_get_real_ns(); + u64 curr_time = ktime_get_boot_ns(); *resched_delay_sec = (low_record_time < curr_time) ? 0 : div_u64((low_record_time - curr_time), NSEC_PER_SEC); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c7ab6cabbb8..914bc98e753f 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1643,6 +1643,56 @@ static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_ return err; } +static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, + struct ib_flow_attr *flow_attr, + enum mlx4_net_trans_promisc_mode *type) +{ + int err = 0; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || + (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || + (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { + return -EOPNOTSUPP; + } + + if (flow_attr->num_of_specs == 0) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + union ib_flow_spec *ib_spec; + + ib_spec = (union ib_flow_spec *)(flow_attr + 1); + if (ib_spec->type != IB_FLOW_SPEC_ETH) + return -EINVAL; + + /* if all is zero than MC and UC */ + if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, + ib_spec->eth.mask.dst_mac[1], + ib_spec->eth.mask.dst_mac[2], + ib_spec->eth.mask.dst_mac[3], + ib_spec->eth.mask.dst_mac[4], + ib_spec->eth.mask.dst_mac[5]}; + + /* Above xor was only on MC bit, non empty mask is valid + * only if this bit is set and rest are zero. + */ + if (!is_zero_ether_addr(&mac[0])) + return -EINVAL; + + if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) + type[0] = MLX4_FS_MC_SNIFFER; + else + type[0] = MLX4_FS_UC_SNIFFER; + } + } + + return err; +} + static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) @@ -1653,6 +1703,10 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + (flow_attr->type != IB_FLOW_ATTR_NORMAL)) + return ERR_PTR(-EOPNOTSUPP); + memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); @@ -1663,7 +1717,19 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: - type[0] = MLX4_FS_REGULAR; + /* If dont trap flag (continue match) is set, under specific + * condition traffic be replicated to given qp, + * without stealing it + */ + if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { + err = mlx4_ib_add_dont_trap_rule(dev, + flow_attr, + type); + if (err) + goto err_free; + } else { + type[0] = MLX4_FS_REGULAR; + } break; case IB_FLOW_ATTR_ALL_DEFAULT: @@ -1675,8 +1741,8 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, break; case IB_FLOW_ATTR_SNIFFER: - type[0] = MLX4_FS_UC_SNIFFER; - type[1] = MLX4_FS_MC_SNIFFER; + type[0] = MLX4_FS_MIRROR_RX_PORT; + type[1] = MLX4_FS_MIRROR_SX_PORT; break; default: diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 52ce7b000044..1eca01cebe51 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -711,7 +711,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 242b94ec105b..ce0b5aa8eb9b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -32,6 +32,7 @@ */ #include <linux/slab.h> +#include <rdma/ib_user_verbs.h> #include "mlx4_ib.h" @@ -334,7 +335,8 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mw *mw; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 27a70159e2ea..4e851889355a 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index fd1de31e0611..a00ba4418de9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -207,7 +207,10 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; - wc->wc_flags = 0; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; break; case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; @@ -431,7 +434,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mr *mmr; + struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; @@ -536,17 +539,17 @@ repoll: case MLX5_CQE_SIG_ERR: sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; - read_lock(&dev->mdev->priv.mr_table.lock); - mmr = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - if (unlikely(!mmr)) { - read_unlock(&dev->mdev->priv.mr_table.lock); + read_lock(&dev->mdev->priv.mkey_table.lock); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmkey)) { + read_unlock(&dev->mdev->priv.mkey_table.lock); mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); return -EINVAL; } - mr = to_mibmr(mmr); + mr = to_mibmr(mmkey); get_sig_err_item(sig_err_cqe, &mr->sig->err_item); mr->sig->sig_err_exists = true; mr->sig->sigerr_count++; @@ -558,25 +561,51 @@ repoll: mr->sig->err_item.expected, mr->sig->err_item.actual); - read_unlock(&dev->mdev->priv.mr_table.lock); + read_unlock(&dev->mdev->priv.mkey_table.lock); goto repoll; } return 0; } +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx5_ib_cq *cq = to_mcq(ibcq); struct mlx5_ib_qp *cur_qp = NULL; unsigned long flags; + int soft_polled = 0; int npolled; int err = 0; spin_lock_irqsave(&cq->lock, flags); - for (npolled = 0; npolled < num_entries; npolled++) { - err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled); if (err) break; } @@ -587,7 +616,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) - return npolled; + return soft_polled + npolled; else return err; } @@ -595,16 +624,27 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; - mlx5_cq_arm(&to_mcq(ibcq)->mcq, + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, uar_page, MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), to_mcq(ibcq)->mcq.cons_index); - return 0; + return ret; } static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, @@ -757,6 +797,14 @@ static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) mlx5_db_free(dev->mdev, &cq->db); } +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -807,6 +855,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, &index, &inlen); if (err) goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } cq->cqe_size = cqe_size; @@ -832,6 +882,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; + INIT_LIST_HEAD(&cq->wc_list); + if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { err = -EFAULT; @@ -1219,3 +1271,27 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) cq = to_mcq(ibcq); return cq->cqe_size; } + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c new file mode 100644 index 000000000000..53e03c8ede79 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct ib_wc wc; + int send_flags; + bool completed:1; +}; + +struct mlx5_ib_gsi_qp { + struct ib_qp ibqp; + struct ib_qp *rx_qp; + u8 port_num; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + /* Serialize qp state modifications */ + struct mutex mutex; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index) +{ + return ++index % gsi->cap.max_send_wr; +} + +#define for_each_outstanding_wr(gsi, index) \ + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \ + index = next_outstanding(gsi, index)) + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for_each_outstanding_wr(gsi, index) { + wr = &gsi->outstanding_wrs[index]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); +err_free: + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int qp_index; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + kfree(gsi); + + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi]; + gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi); + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp *tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi = (gsi->outstanding_pi - 1) % + gsi->cap.max_send_wr; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index b84d13a487cc..41d8a0036465 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -31,8 +31,10 @@ */ #include <linux/mlx5/cmd.h> +#include <linux/mlx5/vport.h> #include <rdma/ib_mad.h> #include <rdma/ib_smi.h> +#include <rdma/ib_pma.h> #include "mlx5_ib.h" enum { @@ -57,20 +59,12 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); } -int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - const struct ib_wc *in_wc, const struct ib_grh *in_grh, - const struct ib_mad_hdr *in, size_t in_mad_size, - struct ib_mad_hdr *out, size_t *out_mad_size, - u16 *out_mad_pkey_index) +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid; int err; - const struct ib_mad *in_mad = (const struct ib_mad *)in; - struct ib_mad *out_mad = (struct ib_mad *)out; - - if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad))) - return IB_MAD_RESULT_FAILURE; slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); @@ -117,6 +111,156 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err; + void *out_cnt; + + /* Decalring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_vport_counter(dev->mdev, 0, + port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + + kvfree(out_cnt); + if (err) + return IB_MAD_RESULT_FAILURE; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + } else { + return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } +} + int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) { struct ib_smp *in_mad = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 03c418ccbc98..5afbb697e691 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -487,6 +487,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; @@ -504,6 +511,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, (MLX5_CAP_ETH(dev->mdev, csum_cap))) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -529,7 +541,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; - props->max_fast_reg_page_list_len = (unsigned int)-1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); @@ -1369,11 +1382,20 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) return 0; } +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr) { + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; @@ -1383,10 +1405,12 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (flow_is_multicast_only(flow_attr)) + if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = flow_attr->priority; + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; @@ -1434,6 +1458,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, unsigned int spec_index; u32 *match_c; u32 *match_v; + u32 action; int err = 0; if (!is_valid_attr(flow_attr)) @@ -1459,9 +1484,11 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, /* Outer header support only */ match_criteria_enable = (!outer_header_zero(match_c)) << 0; + action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable, match_c, match_v, - MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + action, MLX5_FS_DEFAULT_FLOW_TAG, dst); @@ -1481,6 +1508,29 @@ free: return err ? ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rule(handler->rule); + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} enum { LEFTOVERS_MC, LEFTOVERS_UC, @@ -1558,7 +1608,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || - flow_attr->flags) + (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); @@ -1577,8 +1627,13 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + handler = create_flow_rule(dev, ft_prio, flow_attr, + dst); + } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { handler = create_leftovers_rule(dev, ft_prio, flow_attr, @@ -1716,6 +1771,17 @@ static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_reg_pages, }; +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -1752,6 +1818,8 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; + + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: @@ -1838,7 +1906,7 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); mlx5_ib_destroy_qp(dev->umrc.qp); - ib_destroy_cq(dev->umrc.cq); + ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } @@ -1853,7 +1921,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev) struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - struct ib_cq_init_attr cq_attr = {}; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); @@ -1870,15 +1937,12 @@ static int create_umr_res(struct mlx5_ib_dev *dev) goto error_0; } - cq_attr.cqe = 128; - cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, - &cq_attr); + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); init_attr->send_cq = cq; init_attr->recv_cq = cq; @@ -1945,7 +2009,7 @@ error_4: mlx5_ib_destroy_qp(qp); error_3: - ib_destroy_cq(cq); + ib_free_cq(cq); error_2: ib_dealloc_pd(pd); @@ -1961,10 +2025,13 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); + mutex_init(&devr->mutex); + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); @@ -2052,6 +2119,12 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } + return 0; error5: @@ -2070,12 +2143,20 @@ error0: static void destroy_dev_resources(struct mlx5_ib_resources *devr) { + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) @@ -2198,6 +2279,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2258,6 +2340,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; @@ -2269,6 +2352,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mlx5_ib_internal_fill_odp_caps(dev); + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d2b9737baa36..76b2b42e0535 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -43,6 +43,7 @@ #include <linux/mlx5/srq.h> #include <linux/types.h> #include <linux/mlx5/transobj.h> +#include <rdma/ib_user_verbs.h> #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -126,7 +127,7 @@ struct mlx5_ib_pd { }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) -#define MLX5_IB_FLOW_LAST_PRIO (MLX5_IB_FLOW_MCAST_PRIO - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) #error "Invalid number of bypass priorities" #endif @@ -162,9 +163,31 @@ struct mlx5_ib_flow_db { #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) + +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END + #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + struct wr_list { u16 opcode; u16 next; @@ -325,11 +348,14 @@ struct mlx5_ib_cq_buf { }; enum mlx5_ib_qp_flags { - MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, - MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, - MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, - MLX5_IB_QP_MANAGED_SEND = 1 << 3, - MLX5_IB_QP_MANAGED_RECV = 1 << 4, + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, }; struct mlx5_umr_wr { @@ -373,6 +399,14 @@ struct mlx5_ib_cq { struct ib_umem *resize_umem; int cqe_size; u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; }; struct mlx5_ib_srq { @@ -413,7 +447,8 @@ struct mlx5_ib_mr { int ndescs; int max_descs; int desc_size; - struct mlx5_core_mr mmr; + int access_mode; + struct mlx5_core_mkey mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; @@ -425,19 +460,20 @@ struct mlx5_ib_mr { struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; + int access_flags; /* Needed for rereg MR */ +}; + +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mkey mmkey; }; struct mlx5_ib_umr_context { + struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; -static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) -{ - context->status = -1; - init_completion(&context->done); -} - struct umr_common { struct ib_pd *pd; struct ib_cq *cq; @@ -487,6 +523,14 @@ struct mlx5_mr_cache { unsigned long last_add; }; +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct ib_xrcd *x0; @@ -494,6 +538,9 @@ struct mlx5_ib_resources { struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; }; struct mlx5_roce { @@ -558,9 +605,9 @@ static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } -static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) { - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) @@ -588,6 +635,11 @@ static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx5_ib_mr, ibmr); } +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; @@ -648,8 +700,14 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, @@ -700,7 +758,6 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); @@ -739,6 +796,23 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -758,7 +832,7 @@ static inline u8 convert_access(int acc) static inline int is_qp1(enum ib_qp_type qp_type) { - return qp_type == IB_QPT_GSI; + return qp_type == MLX5_IB_QPT_HW_GSI; } #define MLX5_MAX_UMR_SHIFT 16 diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 6000f7aeede9..4d5bff151cdf 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -40,6 +40,7 @@ #include <rdma/ib_umem_odp.h> #include <rdma/ib_verbs.h> #include "mlx5_ib.h" +#include "user.h" enum { MAX_PENDING_REG_MR = 8, @@ -57,7 +58,7 @@ static int clean_mr(struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* Wait until all page fault handlers using the mr complete. */ @@ -77,6 +78,40 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + static void reg_mr_callback(int status, void *context) { struct mlx5_ib_mr *mr = context; @@ -86,7 +121,7 @@ static void reg_mr_callback(int status, void *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; int err; spin_lock_irqsave(&ent->lock, flags); @@ -113,7 +148,7 @@ static void reg_mr_callback(int status, void *context) spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); - mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mr->mmkey.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; cache->last_add = jiffies; @@ -124,10 +159,10 @@ static void reg_mr_callback(int status, void *context) spin_unlock_irqrestore(&ent->lock, flags); write_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), - &mr->mmr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey); if (err) - pr_err("Error inserting to mr tree. 0x%x\n", -err); + pr_err("Error inserting to mkey tree. 0x%x\n", -err); write_unlock_irqrestore(&table->lock, flags); } @@ -168,7 +203,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), reg_mr_callback, mr, &mr->out); if (err) { @@ -657,14 +692,14 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_in; kfree(in); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; return &mr->ibmr; @@ -693,10 +728,40 @@ static int use_umr(int order) return order <= MLX5_MAX_UMR_SHIFT; } -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift, u64 virt_addr, u64 len, - int access_flags) +static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int npages, int page_shift, int *size, + __be64 **mr_pas, dma_addr_t *dma) +{ + __be64 *pas; + struct device *ddev = dev->ib_dev.dma_device; + + /* + * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. + */ + *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!(*mr_pas)) + return -ENOMEM; + + pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, *size - npages * sizeof(u64)); + + *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, *dma)) { + kfree(*mr_pas); + return -ENOMEM; + } + + return 0; +} + +static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_umr_wr *umrwr = umr_wr(wr); @@ -706,7 +771,6 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, sg->lkey = dev->umrc.pd->local_dma_lkey; wr->next = NULL; - wr->send_flags = 0; wr->sg_list = sg; if (n) wr->num_sge = 1; @@ -718,6 +782,19 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, umrwr->npages = n; umrwr->page_shift = page_shift; umrwr->mkey = key; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); + + wr->send_flags = 0; + umrwr->target.virt_addr = virt_addr; umrwr->length = len; umrwr->access_flags = access_flags; @@ -734,26 +811,45 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, umrwr->mkey = key; } -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, int *npages, + int *page_shift, int *ncont, int *order) { - struct mlx5_ib_umr_context *context; - struct ib_wc wc; - int err; - - while (1) { - err = ib_poll_cq(cq, 1, &wc); - if (err < 0) { - pr_warn("poll cq error %d\n", err); - return; - } - if (err == 0) - break; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); + if (IS_ERR(umem)) { + mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } - context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; - context->status = wc.status; - complete(&context->done); + mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(umem); + return ERR_PTR(-EINVAL); } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return umem; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); } static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, @@ -764,13 +860,12 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; int size; __be64 *mr_pas; - __be64 *pas; dma_addr_t dma; int err = 0; int i; @@ -790,33 +885,17 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. - * To avoid copying garbage after the pas array, we allocate - * a little more. */ - size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); - mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!mr_pas) { - err = -ENOMEM; + err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, + &dma); + if (err) goto free_mr; - } - pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); - /* Clear padding after the actual pages. */ - memset(pas + npages, 0, size - npages * sizeof(u64)); - - dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, dma)) { - err = -ENOMEM; - goto free_pas; - } + mlx5_ib_init_umr_context(&umr_context); - memset(&umrwr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key, + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -830,9 +909,9 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, } } - mr->mmr.iova = virt_addr; - mr->mmr.size = len; - mr->mmr.pd = to_mpd(pd)->pdn; + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; mr->live = 1; @@ -840,7 +919,6 @@ unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); -free_pas: kfree(mr_pas); free_mr: @@ -929,8 +1007,10 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + mlx5_ib_init_umr_context(&umr_context); + memset(&wr, 0, sizeof(wr)); - wr.wr.wr_id = (u64)(unsigned long)&umr_context; + wr.wr.wr_cqe = &umr_context.cqe; sg.addr = dma; sg.length = ALIGN(npages * sizeof(u64), @@ -944,10 +1024,9 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, wr.wr.opcode = MLX5_IB_WR_UMR; wr.npages = sg.length / sizeof(u64); wr.page_shift = PAGE_SHIFT; - wr.mkey = mr->mmr.key; + wr.mkey = mr->mmkey.key; wr.target.offset = start_page_index; - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &wr.wr, &bad); if (err) { @@ -974,10 +1053,14 @@ free_pas: } #endif -static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, - u64 length, struct ib_umem *umem, - int npages, int page_shift, - int access_flags) +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; @@ -986,7 +1069,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -1013,7 +1096,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen, NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); @@ -1024,7 +1107,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, mr->live = 1; kvfree(in); - mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); return mr; @@ -1032,11 +1115,23 @@ err_2: kvfree(in); err_1: - kfree(mr); + if (!ibmr) + kfree(mr); return ERR_PTR(err); } +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) +{ + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->access_flags = access_flags; +} + struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -1052,22 +1147,11 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); - umem = ib_umem_get(pd->uobject->context, start, length, access_flags, - 0); - if (IS_ERR(umem)) { - mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); - return (void *)umem; - } + umem = mr_umem_get(pd, start, length, access_flags, &npages, + &page_shift, &ncont, &order); - mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); - if (!npages) { - mlx5_ib_warn(dev, "avoid zero region\n"); - err = -EINVAL; - goto error; - } - - mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", - npages, ncont, order, page_shift); + if (IS_ERR(umem)) + return (void *)umem; if (use_umr(order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, @@ -1083,45 +1167,21 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } if (!mr) - mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, - access_flags); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto error; } - mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); mr->umem = umem; - mr->npages = npages; - atomic_add(npages, &dev->mdev->priv.reg_pages); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + set_mr_fileds(dev, mr, npages, length, access_flags); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem->odp_data) { - /* - * This barrier prevents the compiler from moving the - * setting of umem->odp_data->private to point to our - * MR, before reg_umr finished, to ensure that the MR - * initialization have finished before starting to - * handle invalidations. - */ - smp_wmb(); - mr->umem->odp_data->private = mr; - /* - * Make sure we will see the new - * umem->odp_data->private value in the invalidation - * routines, before we can get page faults on the - * MR. Page faults can happen once we put the MR in - * the tree, below this line. Without the barrier, - * there can be a fault handling and an invalidation - * before umem->odp_data->private == mr is visible to - * the invalidation handler. - */ - smp_wmb(); - } + update_odp_mr(mr); #endif return &mr->ibmr; @@ -1135,15 +1195,15 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; int err; - memset(&umrwr.wr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmr.key); - mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); + down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -1165,6 +1225,167 @@ error: return err; } +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, + u64 length, int npages, int page_shift, int order, + int access_flags, int flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr *bad; + struct mlx5_umr_wr umrwr = {}; + struct ib_sge sg; + struct umr_common *umrc = &dev->umrc; + dma_addr_t dma = 0; + __be64 *mr_pas = NULL; + int size; + int err; + + mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + if (flags & IB_MR_REREG_TRANS) { + err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, + &mr_pas, &dma); + if (err) + return err; + + umrwr.target.virt_addr = virt_addr; + umrwr.length = length; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } + + prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + page_shift); + + if (flags & IB_MR_REREG_PD) { + umrwr.pd = pd; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; + } + + if (flags & IB_MR_REREG_ACCESS) { + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; + } + + /* post send request to UMR QP */ + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + + up(&umrc->sem); + if (flags & IB_MR_REREG_TRANS) { + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + kfree(mr_pas); + } + return err; +} + +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; + u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; + int page_shift = 0; + int npages = 0; + int ncont = 0; + int order = 0; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages, + &page_shift, &ncont, &order); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + mr->umem = NULL; + return err; + } + } + + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. + */ + if (mr->umred) { + err = unreg_umr(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to unregister MR\n"); + } else { + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to destroy MKey\n"); + } + if (err) + return err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->umred = 0; + } else { + /* + * Send a UMR WQE + */ + err = rereg_umr(pd, mr, addr, len, npages, page_shift, + order, access_flags, flags); + if (err) { + mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + return err; + } + } + + if (flags & IB_MR_REREG_PD) { + ib_mr->pd = pd; + mr->mmkey.pd = to_mpd(pd)->pdn; + } + + if (flags & IB_MR_REREG_ACCESS) + mr->access_flags = access_flags; + + if (flags & IB_MR_REREG_TRANS) { + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + set_mr_fileds(dev, mr, npages, len, access_flags); + mr->mmkey.iova = addr; + mr->mmkey.size = len; + } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + + return 0; +} + static int mlx5_alloc_priv_descs(struct ib_device *device, struct mlx5_ib_mr *mr, @@ -1236,7 +1457,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmr.key, err); + mr->mmkey.key, err); return err; } } else { @@ -1300,8 +1521,8 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; struct mlx5_ib_mr *mr; - int access_mode, err; - int ndescs = roundup(max_num_sg, 4); + int ndescs = ALIGN(max_num_sg, 4); + int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) @@ -1319,7 +1540,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); if (mr_type == IB_MR_TYPE_MEM_REG) { - access_mode = MLX5_ACCESS_MODE_MTT; + mr->access_mode = MLX5_ACCESS_MODE_MTT; in->seg.log2_page_size = PAGE_SHIFT; err = mlx5_alloc_priv_descs(pd->device, mr, @@ -1329,6 +1550,15 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->desc_size = sizeof(u64); mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_ACCESS_MODE_KLM; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SIGNATURE) { u32 psv_index[2]; @@ -1347,7 +1577,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_free_sig; - access_mode = MLX5_ACCESS_MODE_KLM; + mr->access_mode = MLX5_ACCESS_MODE_KLM; mr->sig->psv_memory.psv_idx = psv_index[0]; mr->sig->psv_wire.psv_idx = psv_index[1]; @@ -1361,14 +1591,14 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; } - in->seg.flags = MLX5_PERM_UMR_EN | access_mode; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + in->seg.flags = MLX5_PERM_UMR_EN | mr->access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_destroy_psv; - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; kfree(in); @@ -1395,6 +1625,88 @@ err_free: return ERR_PTR(err); } +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in = NULL; + struct mlx5_ib_mw *mw = NULL; + int ndescs; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_KLM | + MLX5_PERM_LOCAL_READ; + if (type == IB_MW_TYPE_2) + in->seg.flags_pd |= cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto free; + + mw->ibmw.rkey = mw->mmkey.key; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status) { @@ -1436,6 +1748,32 @@ done: return ret; } +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.length = 0; + mr->ndescs = sg_nents; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i > mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg)); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg); + } + + return i; +} + static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); @@ -1463,7 +1801,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b8d76361a48d..34e79e709c67 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -142,13 +142,13 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, u32 key) { u32 base_key = mlx5_base_mkey(key); - struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); - struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!mmr || mmr->key != key || !mr->live) + if (!mmkey || mmkey->key != key || !mr->live) return NULL; - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, @@ -232,7 +232,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, io_virt += pfault->mpfault.bytes_committed; bcnt -= pfault->mpfault.bytes_committed; - start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; if (mr->umem->writable) access_mask |= ODP_WRITE_ALLOWED_BIT; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 34cb8e87c7b8..8dee8bc1e0fe 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -58,6 +58,7 @@ enum { static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, @@ -72,6 +73,9 @@ static const u32 mlx5_ib_opcode[] = { [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; static int is_qp0(enum ib_qp_type qp_type) { @@ -260,11 +264,11 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, return 0; } -static int sq_overhead(enum ib_qp_type qp_type) +static int sq_overhead(struct ib_qp_init_attr *attr) { int size = 0; - switch (qp_type) { + switch (attr->qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ @@ -287,8 +291,12 @@ static int sq_overhead(enum ib_qp_type qp_type) break; case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; @@ -311,7 +319,7 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) int inl_size = 0; int size; - size = sq_overhead(attr->qp_type); + size = sq_overhead(attr); if (size < 0) return size; @@ -348,8 +356,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, return -EINVAL; } - qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - - sizeof(struct mlx5_wqe_inline_seg); + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) @@ -590,7 +598,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; - case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -783,7 +791,10 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, int err; uuari = &dev->mdev->priv.uuari; - if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO | + mlx5_ib_create_qp_sqpn_qp1())) return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) @@ -828,6 +839,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { + (*in)->ctx.deth_sqpn = cpu_to_be32(1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + mlx5_fill_page_array(&qp->buf, (*in)->pas); err = mlx5_db_alloc(dev->mdev, &qp->db); @@ -1228,6 +1244,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) qp->flags |= MLX5_IB_QP_MANAGED_RECV; } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -1271,6 +1295,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); + return -EINVAL; + } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, base); if (err) @@ -1385,6 +1414,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* 0xffffff means we ask to work with cqe version 0 */ MLX5_SET(qpc, qpc, user_index, uidx); } + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; + } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; @@ -1494,7 +1530,7 @@ static void get_cqs(struct mlx5_ib_qp *qp, break; case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1657,7 +1693,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_UC: case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case MLX5_IB_QPT_REG_UMR: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) @@ -1686,6 +1722,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, break; + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_MAX: @@ -1704,6 +1743,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -2161,8 +2203,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); - if (err < 0) + if (err < 0) { + mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; + } context->flags = cpu_to_be32(err << 16); @@ -2182,7 +2226,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { @@ -2284,6 +2328,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -2363,11 +2409,18 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -2378,32 +2431,46 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } - if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - ll)) + if (qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); goto out; + } if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= - dev->mdev->port_caps[port - 1].pkey_table_len) + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); goto out; + } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); goto out; + } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; @@ -2442,6 +2509,59 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr_sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr_start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { @@ -2509,6 +2629,11 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, int ndescs = mr->ndescs; memset(umr, 0, sizeof(*umr)); + + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + umr->flags = MLX5_UMR_CHECK_NOT_FREE; umr->klm_octowords = get_klm_octo(ndescs); umr->mkey_mask = frwr_mkey_mask(); @@ -2558,6 +2683,44 @@ static __be64 get_umr_update_mtt_mask(void) return cpu_to_be64(result); } +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, struct ib_send_wr *wr) { @@ -2576,9 +2739,15 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = get_umr_update_mtt_mask(); umr->bsf_octowords = get_klm_octo(umrwr->target.offset); umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } else { - umr->mkey_mask = get_umr_reg_mr_mask(); } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) + umr->mkey_mask |= get_umr_update_access_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) + umr->mkey_mask |= get_umr_update_pd_mask(); + if (!umr->mkey_mask) + umr->mkey_mask = get_umr_reg_mr_mask(); } else { umr->mkey_mask = get_umr_unreg_mr_mask(); } @@ -2603,13 +2772,19 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, int ndescs = ALIGN(mr->ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); - seg->flags = get_umr_flags(access) | MLX5_ACCESS_MODE_MTT; + + if (mr->access_mode == MLX5_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); seg->start_addr = cpu_to_be64(mr->ibmr.iova); seg->len = cpu_to_be64(mr->ibmr.length); seg->xlt_oct_size = cpu_to_be32(ndescs); - seg->log2_page_size = ilog2(mr->ibmr.page_size); } static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) @@ -2630,7 +2805,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->flags = convert_access(umrwr->access_flags); if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); } seg->len = cpu_to_be64(umrwr->length); @@ -3196,13 +3372,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf = qp->bf; + struct mlx5_bf *bf; int uninitialized_var(size); - void *qend = qp->sq.qend; + void *qend; unsigned long flags; unsigned idx; int err = 0; @@ -3214,6 +3390,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = qp->bf; + qend = qp->sq.qend; + spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -3373,16 +3556,37 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; case MLX5_IB_QPT_REG_UMR: if (wr->opcode != MLX5_IB_WR_UMR) { err = -EINVAL; @@ -3502,6 +3706,9 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int i; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); @@ -3822,6 +4029,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int err = 0; u8 raw_packet_qp_state; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * Wait for any outstanding page faults, in case the user frees memory @@ -3874,6 +4085,8 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; if (qp->flags & MLX5_IB_QP_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 4659256cd95e..3b2ddd64a371 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -75,7 +75,8 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen) + struct ib_udata *udata, int buf_size, int *inlen, + int is_xrc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd = {}; @@ -87,13 +88,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, int ncont; u32 offset; u32 uidx = MLX5_IB_DEFAULT_UIDX; - int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - if (drv_data < 0) - return -EINVAL; - - ucmdlen = (drv_data < sizeof(ucmd)) ? - drv_data : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); @@ -103,15 +99,17 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; - if (drv_data > sizeof(ucmd) && + if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), - drv_data - sizeof(ucmd))) + udata->inlen - sizeof(ucmd))) return -EINVAL; - err = get_srq_user_index(to_mucontext(pd->uobject->context), - &ucmd, udata->inlen, &uidx); - if (err) - return err; + if (is_xrc) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); @@ -151,7 +149,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); @@ -170,7 +169,7 @@ err_umem: static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen) + int *inlen, int is_xrc) { int err; int i; @@ -224,7 +223,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); /* 0xffffff means we ask to work with cqe version 0 */ @@ -302,10 +302,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, srq->msrq.max_avail_gather); + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, + is_xrc); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, + is_xrc); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -313,7 +317,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, goto err_srq; } - is_xrc = (init_attr->srq_type == IB_SRQT_XRC); in->ctx.state_log_sz = ilog2(srq->msrq.max); flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; xrcdn = 0; diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index b94a55404a59..61bc308bb802 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -152,6 +152,13 @@ struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 8c4daf7f22ec..5af19b4cde51 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -56,7 +56,8 @@ static int nes_dereg_mr(struct ib_mr *ib_mr); /** * nes_alloc_mw */ -static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, + struct ib_udata *udata) { struct nes_pd *nespd = to_nespd(ibpd); struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index a6f3eab0f350..85be0de3ab26 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -244,6 +244,7 @@ struct ipoib_cm_tx { unsigned tx_tail; unsigned long flags; u32 mtu; + unsigned max_send_sge; }; struct ipoib_cm_rx_buf { @@ -390,6 +391,7 @@ struct ipoib_dev_priv { int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; + unsigned max_send_sge; }; struct ipoib_ah { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 917e46ea3bf6..c8ed53562c9b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; int rc; + unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -719,7 +720,23 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); return; } - + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", tx->tx_head, skb->len, tx->qp->qp_num); @@ -1031,7 +1048,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ struct ib_qp *tx_qp; if (dev->features & NETIF_F_SG) - attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + attr.cap.max_send_sge = + min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); tx_qp = ib_create_qp(priv->pd, &attr); if (PTR_ERR(tx_qp) == -EINVAL) { @@ -1040,6 +1058,7 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; tx_qp = ib_create_qp(priv->pd, &attr); } + tx->max_send_sge = attr.cap.max_send_sge; return tx_qp; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index fa9c42ff1fb0..899e6b7fb8a5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -538,6 +538,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; int hlen, rc; void *phead; + unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb); if (skb_is_gso(skb)) { hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); @@ -561,6 +562,23 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, phead = NULL; hlen = 0; } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", skb->len, address, qpn); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index d48c5bae7877..b809c373e40e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -206,7 +206,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; if (dev->features & NETIF_F_SG) - init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + init_attr.cap.max_send_sge = + min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { @@ -233,6 +234,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; + priv->max_send_sge = init_attr.cap.max_send_sge; + return 0; out_free_send_cq: diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index c827c93f46c5..80b6bedc172f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -969,7 +969,16 @@ static umode_t iser_attr_is_visible(int param_type, int param) static int iscsi_iser_slave_alloc(struct scsi_device *sdev) { - blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); + struct iscsi_session *session; + struct iser_conn *iser_conn; + struct ib_device *ib_dev; + + session = starget_to_session(scsi_target(sdev))->dd_data; + iser_conn = session->leadconn->dd_data; + ib_dev = iser_conn->ib_conn.device->ib_device; + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); return 0; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 95f0a64e076b..0351059783b1 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -458,9 +458,6 @@ struct iser_fr_pool { * @comp: iser completion context * @fr_pool: connection fast registration poool * @pi_support: Indicate device T10-PI support - * @last: last send wr to signal all flush errors were drained - * @last_cqe: cqe handler for last wr - * @last_comp: completes when all connection completions consumed */ struct ib_conn { struct rdma_cm_id *cma_id; @@ -472,10 +469,7 @@ struct ib_conn { struct iser_comp *comp; struct iser_fr_pool fr_pool; bool pi_support; - struct ib_send_wr last; - struct ib_cqe last_cqe; struct ib_cqe reg_cqe; - struct completion last_comp; }; /** @@ -617,7 +611,6 @@ void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); -void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_task_rdma_init(struct iscsi_iser_task *task); diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index ed54b388e7ad..81ae2e30dd12 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -729,13 +729,6 @@ void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) kmem_cache_free(ig.desc_cache, desc); } -void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_conn *ib_conn = wc->qp->qp_context; - - complete(&ib_conn->last_comp); -} - void iser_task_rdma_init(struct iscsi_iser_task *iser_task) { diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 40c0f4978e2f..1b4945367e4f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -252,14 +252,21 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) } static int -iser_alloc_reg_res(struct ib_device *ib_device, +iser_alloc_reg_res(struct iser_device *device, struct ib_pd *pd, struct iser_reg_resources *res, unsigned int size) { + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; int ret; - res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size); + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + res->mr = ib_alloc_mr(pd, mr_type, size); if (IS_ERR(res->mr)) { ret = PTR_ERR(res->mr); iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); @@ -277,7 +284,7 @@ iser_free_reg_res(struct iser_reg_resources *rsc) } static int -iser_alloc_pi_ctx(struct ib_device *ib_device, +iser_alloc_pi_ctx(struct iser_device *device, struct ib_pd *pd, struct iser_fr_desc *desc, unsigned int size) @@ -291,7 +298,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, pi_ctx = desc->pi_ctx; - ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size); + ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); if (ret) { iser_err("failed to allocate reg_resources\n"); goto alloc_reg_res_err; @@ -324,7 +331,7 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx) } static struct iser_fr_desc * -iser_create_fastreg_desc(struct ib_device *ib_device, +iser_create_fastreg_desc(struct iser_device *device, struct ib_pd *pd, bool pi_enable, unsigned int size) @@ -336,12 +343,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, if (!desc) return ERR_PTR(-ENOMEM); - ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size); + ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); if (ret) goto reg_res_alloc_failure; if (pi_enable) { - ret = iser_alloc_pi_ctx(ib_device, pd, desc, size); + ret = iser_alloc_pi_ctx(device, pd, desc, size); if (ret) goto pi_ctx_alloc_failure; } @@ -374,7 +381,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, spin_lock_init(&fr_pool->lock); fr_pool->size = 0; for (i = 0; i < cmds_max; i++) { - desc = iser_create_fastreg_desc(device->ib_device, device->pd, + desc = iser_create_fastreg_desc(device, device->pd, ib_conn->pi_support, size); if (IS_ERR(desc)) { ret = PTR_ERR(desc); @@ -663,7 +670,6 @@ void iser_conn_release(struct iser_conn *iser_conn) int iser_conn_terminate(struct iser_conn *iser_conn) { struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct ib_send_wr *bad_wr; int err = 0; /* terminate the iser conn only if the conn state is UP */ @@ -688,14 +694,8 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_err("Failed to disconnect, conn: 0x%p err %d\n", iser_conn, err); - /* post an indication that all flush errors were consumed */ - err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr); - if (err) { - iser_err("conn %p failed to post last wr", ib_conn); - return 1; - } - - wait_for_completion(&ib_conn->last_comp); + /* block until all flush errors are consumed */ + ib_drain_sq(ib_conn->qp); } return 1; @@ -954,10 +954,6 @@ void iser_conn_init(struct iser_conn *iser_conn) ib_conn->post_recv_buf_count = 0; ib_conn->reg_cqe.done = iser_reg_comp; - ib_conn->last_cqe.done = iser_last_comp; - ib_conn->last.wr_cqe = &ib_conn->last_cqe; - ib_conn->last.opcode = IB_WR_SEND; - init_completion(&ib_conn->last_comp); } /** diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 03022f6420d7..b6bf20496021 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -446,49 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) dev->max_pages_per_mr); } -static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct srp_rdma_ch *ch = cq->cq_context; - - complete(&ch->done); -} - -static struct ib_cqe srp_drain_cqe = { - .done = srp_drain_done, -}; - /** * srp_destroy_qp() - destroy an RDMA queue pair * @ch: SRP RDMA channel. * - * Change a queue pair into the error state and wait until all receive - * completions have been processed before destroying it. This avoids that - * the receive completion handler can access the queue pair while it is + * Drain the qp before destroying it. This avoids that the receive + * completion handler can access the queue pair while it is * being destroyed. */ static void srp_destroy_qp(struct srp_rdma_ch *ch) { - static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; - static struct ib_recv_wr wr = { 0 }; - struct ib_recv_wr *bad_wr; - int ret; - - wr.wr_cqe = &srp_drain_cqe; - /* Destroying a QP and reusing ch->done is only safe if not connected */ - WARN_ON_ONCE(ch->connected); - - ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE); - WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret); - if (ret) - goto out; - - init_completion(&ch->done); - ret = ib_post_recv(ch->qp, &wr, &bad_wr); - WARN_ONCE(ret, "ib_post_recv() returned %d\n", ret); - if (ret == 0) - wait_for_completion(&ch->done); - -out: + ib_drain_rq(ch->qp); ib_destroy_qp(ch->qp); } @@ -508,7 +476,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (!init_attr) return -ENOMEM; - /* queue_size + 1 for ib_drain_qp */ + /* queue_size + 1 for ib_drain_rq() */ recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, ch->comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(recv_cq)) { diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 0c37fee363b1..25bdaeef2520 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -91,76 +91,32 @@ MODULE_PARM_DESC(srpt_service_guid, " instead of using the node_guid of the first HCA."); static struct ib_client srpt_client; -static void srpt_release_channel(struct srpt_rdma_ch *ch); +static void srpt_release_cmd(struct se_cmd *se_cmd); +static void srpt_free_ch(struct kref *kref); static int srpt_queue_status(struct se_cmd *cmd); static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_process_wait_list(struct srpt_rdma_ch *ch); -/** - * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. - */ -static inline -enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir) -{ - switch (dir) { - case DMA_TO_DEVICE: return DMA_FROM_DEVICE; - case DMA_FROM_DEVICE: return DMA_TO_DEVICE; - default: return dir; - } -} - -/** - * srpt_sdev_name() - Return the name associated with the HCA. - * - * Examples are ib0, ib1, ... - */ -static inline const char *srpt_sdev_name(struct srpt_device *sdev) -{ - return sdev->device->name; -} - -static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch) -{ - unsigned long flags; - enum rdma_ch_state state; - - spin_lock_irqsave(&ch->spinlock, flags); - state = ch->state; - spin_unlock_irqrestore(&ch->spinlock, flags); - return state; -} - -static enum rdma_ch_state -srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state) -{ - unsigned long flags; - enum rdma_ch_state prev; - - spin_lock_irqsave(&ch->spinlock, flags); - prev = ch->state; - ch->state = new_state; - spin_unlock_irqrestore(&ch->spinlock, flags); - return prev; -} - -/** - * srpt_test_and_set_ch_state() - Test and set the channel state. - * - * Returns true if and only if the channel state has been set to the new state. +/* + * The only allowed channel state changes are those that change the channel + * state into a state with a higher numerical value. Hence the new > prev test. */ -static bool -srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old, - enum rdma_ch_state new) +static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) { unsigned long flags; enum rdma_ch_state prev; + bool changed = false; spin_lock_irqsave(&ch->spinlock, flags); prev = ch->state; - if (prev == old) + if (new > prev) { ch->state = new; + changed = true; + } spin_unlock_irqrestore(&ch->spinlock, flags); - return prev == old; + + return changed; } /** @@ -182,7 +138,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, return; pr_debug("ASYNC event= %d on device= %s\n", event->event, - srpt_sdev_name(sdev)); + sdev->device->name); switch (event->event) { case IB_EVENT_PORT_ERR: @@ -220,25 +176,39 @@ static void srpt_srq_event(struct ib_event *event, void *ctx) pr_info("SRQ event %d\n", event->event); } +static const char *get_ch_state_name(enum rdma_ch_state s) +{ + switch (s) { + case CH_CONNECTING: + return "connecting"; + case CH_LIVE: + return "live"; + case CH_DISCONNECTING: + return "disconnecting"; + case CH_DRAINING: + return "draining"; + case CH_DISCONNECTED: + return "disconnected"; + } + return "???"; +} + /** * srpt_qp_event() - QP event callback function. */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", - event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch)); + event->event, ch->cm_id, ch->sess_name, ch->state); switch (event->event) { case IB_EVENT_COMM_EST: ib_cm_notify(ch->cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: - if (srpt_test_and_set_ch_state(ch, CH_DRAINING, - CH_RELEASING)) - srpt_release_channel(ch); - else - pr_debug("%s: state %d - ignored LAST_WQE.\n", - ch->sess_name, srpt_get_ch_state(ch)); + pr_debug("%s-%d, state %s: received Last WQE event.\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); break; default: pr_err("received unrecognized IB QP event %d\n", event->event); @@ -281,7 +251,7 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) struct ib_class_port_info *cif; cif = (struct ib_class_port_info *)mad->data; - memset(cif, 0, sizeof *cif); + memset(cif, 0, sizeof(*cif)); cif->base_version = 1; cif->class_version = 1; cif->resp_time_value = 20; @@ -340,7 +310,7 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, return; } - memset(iocp, 0, sizeof *iocp); + memset(iocp, 0, sizeof(*iocp)); strcpy(iocp->id_string, SRPT_ID_STRING); iocp->guid = cpu_to_be64(srpt_service_guid); iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); @@ -390,7 +360,7 @@ static void srpt_get_svc_entries(u64 ioc_guid, } svc_entries = (struct ib_dm_svc_entries *)mad->data; - memset(svc_entries, 0, sizeof *svc_entries); + memset(svc_entries, 0, sizeof(*svc_entries)); svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); snprintf(svc_entries->service_entries[0].name, sizeof(svc_entries->service_entries[0].name), @@ -484,7 +454,7 @@ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, rsp->ah = ah; dm_mad = rsp->mad; - memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad); + memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof(*dm_mad)); dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; dm_mad->mad_hdr.status = 0; @@ -532,7 +502,7 @@ static int srpt_refresh_port(struct srpt_port *sport) struct ib_port_attr port_attr; int ret; - memset(&port_modify, 0, sizeof port_modify); + memset(&port_modify, 0, sizeof(port_modify)); port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; port_modify.clr_port_cap_mask = 0; @@ -553,7 +523,7 @@ static int srpt_refresh_port(struct srpt_port *sport) goto err_query_port; if (!sport->mad_agent) { - memset(®_req, 0, sizeof reg_req); + memset(®_req, 0, sizeof(reg_req)); reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); @@ -841,6 +811,39 @@ out: } /** + * srpt_zerolength_write() - Perform a zero-length RDMA write. + * + * A quote from the InfiniBand specification: C9-88: For an HCA responder + * using Reliable Connection service, for each zero-length RDMA READ or WRITE + * request, the R_Key shall not be validated, even if the request includes + * Immediate data. + */ +static int srpt_zerolength_write(struct srpt_rdma_ch *ch) +{ + struct ib_send_wr wr, *bad_wr; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_RDMA_WRITE; + wr.wr_cqe = &ch->zw_cqe; + wr.send_flags = IB_SEND_SIGNALED; + return ib_post_send(ch->qp, &wr, &bad_wr); +} + +static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + + if (wc->status == IB_WC_SUCCESS) { + srpt_process_wait_list(ch); + } else { + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + WARN_ONCE("%s-%d\n", ch->sess_name, ch->qp->qp_num); + } +} + +/** * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. * @ioctx: Pointer to the I/O context associated with the request. * @srp_cmd: Pointer to the SRP_CMD request data. @@ -903,14 +906,14 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, db = (struct srp_direct_buf *)(srp_cmd->add_data + add_cdb_offset); - memcpy(ioctx->rbufs, db, sizeof *db); + memcpy(ioctx->rbufs, db, sizeof(*db)); *data_len = be32_to_cpu(db->len); } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { idb = (struct srp_indirect_buf *)(srp_cmd->add_data + add_cdb_offset); - ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db; + ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db); if (ioctx->n_rbuf > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { @@ -929,7 +932,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, ioctx->rbufs = &ioctx->single_rbuf; else { ioctx->rbufs = - kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC); + kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC); if (!ioctx->rbufs) { ioctx->n_rbuf = 0; ret = -ENOMEM; @@ -938,7 +941,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, } db = idb->desc_list; - memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db); + memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db)); *data_len = be32_to_cpu(idb->len); } out: @@ -956,7 +959,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) struct ib_qp_attr *attr; int ret; - attr = kzalloc(sizeof *attr, GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; @@ -1070,7 +1073,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, dir = ioctx->cmd.data_direction; BUG_ON(dir == DMA_NONE); ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, - opposite_dma_dir(dir)); + target_reverse_dma_direction(&ioctx->cmd)); ioctx->mapped_sg_count = 0; } } @@ -1107,7 +1110,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, - opposite_dma_dir(dir)); + target_reverse_dma_direction(cmd)); if (unlikely(!count)) return -EAGAIN; @@ -1313,10 +1316,7 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) /* * If the command is in a state where the target core is waiting for - * the ib_srpt driver, change the state to the next state. Changing - * the state of the command from SRPT_STATE_NEED_DATA to - * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this - * function a second time. + * the ib_srpt driver, change the state to the next state. */ spin_lock_irqsave(&ioctx->spinlock, flags); @@ -1325,25 +1325,17 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) case SRPT_STATE_NEED_DATA: ioctx->state = SRPT_STATE_DATA_IN; break; - case SRPT_STATE_DATA_IN: case SRPT_STATE_CMD_RSP_SENT: case SRPT_STATE_MGMT_RSP_SENT: ioctx->state = SRPT_STATE_DONE; break; default: + WARN_ONCE(true, "%s: unexpected I/O context state %d\n", + __func__, state); break; } spin_unlock_irqrestore(&ioctx->spinlock, flags); - if (state == SRPT_STATE_DONE) { - struct srpt_rdma_ch *ch = ioctx->ch; - - BUG_ON(ch->sess == NULL); - - target_put_sess_cmd(&ioctx->cmd); - goto out; - } - pr_debug("Aborting cmd with state %d and tag %lld\n", state, ioctx->cmd.tag); @@ -1351,19 +1343,16 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) case SRPT_STATE_NEW: case SRPT_STATE_DATA_IN: case SRPT_STATE_MGMT: + case SRPT_STATE_DONE: /* * Do nothing - defer abort processing until * srpt_queue_response() is invoked. */ - WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false)); break; case SRPT_STATE_NEED_DATA: - /* DMA_TO_DEVICE (write) - RDMA read error. */ - - /* XXX(hch): this is a horrible layering violation.. */ - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); + pr_debug("tag %#llx: RDMA read error\n", ioctx->cmd.tag); + transport_generic_request_failure(&ioctx->cmd, + TCM_CHECK_CONDITION_ABORT_CMD); break; case SRPT_STATE_CMD_RSP_SENT: /* @@ -1371,18 +1360,16 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * not been received in time. */ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - target_put_sess_cmd(&ioctx->cmd); + transport_generic_free_cmd(&ioctx->cmd, 0); break; case SRPT_STATE_MGMT_RSP_SENT: - srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - target_put_sess_cmd(&ioctx->cmd); + transport_generic_free_cmd(&ioctx->cmd, 0); break; default: WARN(1, "Unexpected command state (%d)", state); break; } -out: return state; } @@ -1422,9 +1409,14 @@ static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); if (unlikely(wc->status != IB_WC_SUCCESS)) { + /* + * Note: if an RDMA write error completion is received that + * means that a SEND also has been posted. Defer further + * processing of the associated command until the send error + * completion has been received. + */ pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", ioctx, wc->status); - srpt_abort_cmd(ioctx); } } @@ -1464,7 +1456,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, sense_data_len = ioctx->cmd.scsi_sense_length; WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); - memset(srp_rsp, 0, sizeof *srp_rsp); + memset(srp_rsp, 0, sizeof(*srp_rsp)); srp_rsp->opcode = SRP_RSP; srp_rsp->req_lim_delta = cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); @@ -1514,7 +1506,7 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, srp_rsp = ioctx->ioctx.buf; BUG_ON(!srp_rsp); - memset(srp_rsp, 0, sizeof *srp_rsp); + memset(srp_rsp, 0, sizeof(*srp_rsp)); srp_rsp->opcode = SRP_RSP; srp_rsp->req_lim_delta = @@ -1528,80 +1520,6 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, return resp_len; } -#define NO_SUCH_LUN ((uint64_t)-1LL) - -/* - * SCSI LUN addressing method. See also SAM-2 and the section about - * eight byte LUNs. - */ -enum scsi_lun_addr_method { - SCSI_LUN_ADDR_METHOD_PERIPHERAL = 0, - SCSI_LUN_ADDR_METHOD_FLAT = 1, - SCSI_LUN_ADDR_METHOD_LUN = 2, - SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3, -}; - -/* - * srpt_unpack_lun() - Convert from network LUN to linear LUN. - * - * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte - * order (big endian) to a linear LUN. Supports three LUN addressing methods: - * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40). - */ -static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) -{ - uint64_t res = NO_SUCH_LUN; - int addressing_method; - - if (unlikely(len < 2)) { - pr_err("Illegal LUN length %d, expected 2 bytes or more\n", - len); - goto out; - } - - switch (len) { - case 8: - if ((*((__be64 *)lun) & - cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) - goto out_err; - break; - case 4: - if (*((__be16 *)&lun[2]) != 0) - goto out_err; - break; - case 6: - if (*((__be32 *)&lun[2]) != 0) - goto out_err; - break; - case 2: - break; - default: - goto out_err; - } - - addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */ - switch (addressing_method) { - case SCSI_LUN_ADDR_METHOD_PERIPHERAL: - case SCSI_LUN_ADDR_METHOD_FLAT: - case SCSI_LUN_ADDR_METHOD_LUN: - res = *(lun + 1) | (((*lun) & 0x3f) << 8); - break; - - case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: - default: - pr_err("Unimplemented LUN addressing method %u\n", - addressing_method); - break; - } - -out: - return res; - -out_err: - pr_err("Support for multi-level LUNs has not yet been implemented\n"); - goto out; -} - static int srpt_check_stop_free(struct se_cmd *cmd) { struct srpt_send_ioctx *ioctx = container_of(cmd, @@ -1613,16 +1531,14 @@ static int srpt_check_stop_free(struct se_cmd *cmd) /** * srpt_handle_cmd() - Process SRP_CMD. */ -static int srpt_handle_cmd(struct srpt_rdma_ch *ch, - struct srpt_recv_ioctx *recv_ioctx, - struct srpt_send_ioctx *send_ioctx) +static void srpt_handle_cmd(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) { struct se_cmd *cmd; struct srp_cmd *srp_cmd; - uint64_t unpacked_lun; u64 data_len; enum dma_data_direction dir; - sense_reason_t ret; int rc; BUG_ON(!send_ioctx); @@ -1650,65 +1566,23 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { pr_err("0x%llx: parsing SRP descriptor table failed.\n", srp_cmd->tag); - ret = TCM_INVALID_CDB_FIELD; - goto send_sense; + goto release_ioctx; } - unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, - sizeof(srp_cmd->lun)); rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, - &send_ioctx->sense_data[0], unpacked_lun, data_len, - TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + &send_ioctx->sense_data[0], + scsilun_to_int(&srp_cmd->lun), data_len, + TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); if (rc != 0) { - ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - goto send_sense; + pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc, + srp_cmd->tag); + goto release_ioctx; } - return 0; - -send_sense: - transport_send_check_condition_and_sense(cmd, ret, 0); - return -1; -} - -/** - * srpt_rx_mgmt_fn_tag() - Process a task management function by tag. - * @ch: RDMA channel of the task management request. - * @fn: Task management function to perform. - * @req_tag: Tag of the SRP task management request. - * @mgmt_ioctx: I/O context of the task management request. - * - * Returns zero if the target core will process the task management - * request asynchronously. - * - * Note: It is assumed that the initiator serializes tag-based task management - * requests. - */ -static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag) -{ - struct srpt_device *sdev; - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *target; - int ret, i; + return; - ret = -EINVAL; - ch = ioctx->ch; - BUG_ON(!ch); - BUG_ON(!ch->sport); - sdev = ch->sport->sdev; - BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - for (i = 0; i < ch->rq_size; ++i) { - target = ch->ioctx_ring[i]; - if (target->cmd.se_lun == ioctx->cmd.se_lun && - target->cmd.tag == tag && - srpt_get_cmd_state(target) != SRPT_STATE_DONE) { - ret = 0; - /* now let the target core abort &target->cmd; */ - break; - } - } - spin_unlock_irq(&sdev->spinlock); - return ret; +release_ioctx: + send_ioctx->state = SRPT_STATE_DONE; + srpt_release_cmd(cmd); } static int srp_tmr_to_tcm(int fn) @@ -1744,8 +1618,6 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, struct srp_tsk_mgmt *srp_tsk; struct se_cmd *cmd; struct se_session *sess = ch->sess; - uint64_t unpacked_lun; - uint32_t tag = 0; int tcm_tmr; int rc; @@ -1761,26 +1633,10 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); send_ioctx->cmd.tag = srp_tsk->tag; tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); - if (tcm_tmr < 0) { - send_ioctx->cmd.se_tmr_req->response = - TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; - goto fail; - } - unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, - sizeof(srp_tsk->lun)); - - if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { - rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); - if (rc < 0) { - send_ioctx->cmd.se_tmr_req->response = - TMR_TASK_DOES_NOT_EXIST; - goto fail; - } - tag = srp_tsk->task_tag; - } - rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, - srp_tsk, tcm_tmr, GFP_KERNEL, tag, - TARGET_SCF_ACK_KREF); + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, + scsilun_to_int(&srp_tsk->lun), srp_tsk, tcm_tmr, + GFP_KERNEL, srp_tsk->task_tag, + TARGET_SCF_ACK_KREF); if (rc != 0) { send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; goto fail; @@ -1800,7 +1656,6 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *send_ioctx) { struct srp_cmd *srp_cmd; - enum rdma_ch_state ch_state; BUG_ON(!ch); BUG_ON(!recv_ioctx); @@ -1809,13 +1664,12 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - ch_state = srpt_get_ch_state(ch); - if (unlikely(ch_state == CH_CONNECTING)) { + if (unlikely(ch->state == CH_CONNECTING)) { list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); goto out; } - if (unlikely(ch_state != CH_LIVE)) + if (unlikely(ch->state != CH_LIVE)) goto out; srp_cmd = recv_ioctx->ioctx.buf; @@ -1878,6 +1732,28 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) } } +/* + * This function must be called from the context in which RDMA completions are + * processed because it accesses the wait list without protection against + * access from other threads. + */ +static void srpt_process_wait_list(struct srpt_rdma_ch *ch) +{ + struct srpt_send_ioctx *ioctx; + + while (!list_empty(&ch->cmd_wait_list) && + ch->state >= CH_LIVE && + (ioctx = srpt_get_send_ioctx(ch)) != NULL) { + struct srpt_recv_ioctx *recv_ioctx; + + recv_ioctx = list_first_entry(&ch->cmd_wait_list, + struct srpt_recv_ioctx, + wait_list); + list_del(&recv_ioctx->wait_list); + srpt_handle_new_iu(ch, recv_ioctx, ioctx); + } +} + /** * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by @@ -1905,15 +1781,10 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) atomic_inc(&ch->sq_wr_avail); - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) pr_info("sending response for ioctx 0x%p failed" " with status %d\n", ioctx, wc->status); - atomic_dec(&ch->req_lim); - srpt_abort_cmd(ioctx); - goto out; - } - if (state != SRPT_STATE_DONE) { srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); @@ -1922,18 +1793,7 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) " wr_id = %u.\n", ioctx->ioctx.index); } -out: - while (!list_empty(&ch->cmd_wait_list) && - srpt_get_ch_state(ch) == CH_LIVE && - (ioctx = srpt_get_send_ioctx(ch)) != NULL) { - struct srpt_recv_ioctx *recv_ioctx; - - recv_ioctx = list_first_entry(&ch->cmd_wait_list, - struct srpt_recv_ioctx, - wait_list); - list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, ioctx); - } + srpt_process_wait_list(ch); } /** @@ -1950,7 +1810,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) WARN_ON(ch->rq_size < 1); ret = -ENOMEM; - qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL); + qp_init = kzalloc(sizeof(*qp_init), GFP_KERNEL); if (!qp_init) goto out; @@ -2017,168 +1877,102 @@ static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) } /** - * __srpt_close_ch() - Close an RDMA channel by setting the QP error state. + * srpt_close_ch() - Close an RDMA channel. * - * Reset the QP and make sure all resources associated with the channel will - * be deallocated at an appropriate time. + * Make sure all resources associated with the channel will be deallocated at + * an appropriate time. * - * Note: The caller must hold ch->sport->sdev->spinlock. + * Returns true if and only if the channel state has been modified into + * CH_DRAINING. */ -static void __srpt_close_ch(struct srpt_rdma_ch *ch) +static bool srpt_close_ch(struct srpt_rdma_ch *ch) { - enum rdma_ch_state prev_state; - unsigned long flags; + int ret; - spin_lock_irqsave(&ch->spinlock, flags); - prev_state = ch->state; - switch (prev_state) { - case CH_CONNECTING: - case CH_LIVE: - ch->state = CH_DISCONNECTING; - break; - default: - break; + if (!srpt_set_ch_state(ch, CH_DRAINING)) { + pr_debug("%s-%d: already closed\n", ch->sess_name, + ch->qp->qp_num); + return false; } - spin_unlock_irqrestore(&ch->spinlock, flags); - - switch (prev_state) { - case CH_CONNECTING: - ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0, - NULL, 0); - /* fall through */ - case CH_LIVE: - if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) - pr_err("sending CM DREQ failed.\n"); - break; - case CH_DISCONNECTING: - break; - case CH_DRAINING: - case CH_RELEASING: - break; - } -} - -/** - * srpt_close_ch() - Close an RDMA channel. - */ -static void srpt_close_ch(struct srpt_rdma_ch *ch) -{ - struct srpt_device *sdev; - sdev = ch->sport->sdev; - spin_lock_irq(&sdev->spinlock); - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); -} + kref_get(&ch->kref); -/** - * srpt_shutdown_session() - Whether or not a session may be shut down. - */ -static int srpt_shutdown_session(struct se_session *se_sess) -{ - struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; - unsigned long flags; + ret = srpt_ch_qp_err(ch); + if (ret < 0) + pr_err("%s-%d: changing queue pair into error state failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); - spin_lock_irqsave(&ch->spinlock, flags); - if (ch->in_shutdown) { - spin_unlock_irqrestore(&ch->spinlock, flags); - return true; + pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, + ch->qp->qp_num); + ret = srpt_zerolength_write(ch); + if (ret < 0) { + pr_err("%s-%d: queuing zero-length write failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + WARN_ON_ONCE(true); } - ch->in_shutdown = true; - target_sess_cmd_list_set_waiting(se_sess); - spin_unlock_irqrestore(&ch->spinlock, flags); + kref_put(&ch->kref, srpt_free_ch); return true; } -/** - * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. - * @cm_id: Pointer to the CM ID of the channel to be drained. - * - * Note: Must be called from inside srpt_cm_handler to avoid a race between - * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one() - * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one() - * waits until all target sessions for the associated IB device have been - * unregistered and target session registration involves a call to - * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until - * this function has finished). +/* + * Change the channel state into CH_DISCONNECTING. If a channel has not yet + * reached the connected state, close it. If a channel is in the connected + * state, send a DREQ. If a DREQ has been received, send a DREP. Note: it is + * the responsibility of the caller to ensure that this function is not + * invoked concurrently with the code that accepts a connection. This means + * that this function must either be invoked from inside a CM callback + * function or that it must be invoked with the srpt_port.mutex held. */ -static void srpt_drain_channel(struct ib_cm_id *cm_id) +static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) { - struct srpt_device *sdev; - struct srpt_rdma_ch *ch; int ret; - bool do_reset = false; - WARN_ON_ONCE(irqs_disabled()); + if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) + return -ENOTCONN; - sdev = cm_id->context; - BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - list_for_each_entry(ch, &sdev->rch_list, list) { - if (ch->cm_id == cm_id) { - do_reset = srpt_test_and_set_ch_state(ch, - CH_CONNECTING, CH_DRAINING) || - srpt_test_and_set_ch_state(ch, - CH_LIVE, CH_DRAINING) || - srpt_test_and_set_ch_state(ch, - CH_DISCONNECTING, CH_DRAINING); - break; - } - } - spin_unlock_irq(&sdev->spinlock); + ret = ib_send_cm_dreq(ch->cm_id, NULL, 0); + if (ret < 0) + ret = ib_send_cm_drep(ch->cm_id, NULL, 0); - if (do_reset) { - if (ch->sess) - srpt_shutdown_session(ch->sess); + if (ret < 0 && srpt_close_ch(ch)) + ret = 0; - ret = srpt_ch_qp_err(ch); - if (ret < 0) - pr_err("Setting queue pair in error state" - " failed: %d\n", ret); - } + return ret; } -/** - * srpt_find_channel() - Look up an RDMA channel. - * @cm_id: Pointer to the CM ID of the channel to be looked up. - * - * Return NULL if no matching RDMA channel has been found. - */ -static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev, - struct ib_cm_id *cm_id) +static void __srpt_close_all_ch(struct srpt_device *sdev) { struct srpt_rdma_ch *ch; - bool found; - WARN_ON_ONCE(irqs_disabled()); - BUG_ON(!sdev); + lockdep_assert_held(&sdev->mutex); - found = false; - spin_lock_irq(&sdev->spinlock); list_for_each_entry(ch, &sdev->rch_list, list) { - if (ch->cm_id == cm_id) { - found = true; - break; - } + if (srpt_disconnect_ch(ch) >= 0) + pr_info("Closing channel %s-%d because target %s has been disabled\n", + ch->sess_name, ch->qp->qp_num, + sdev->device->name); + srpt_close_ch(ch); } - spin_unlock_irq(&sdev->spinlock); - - return found ? ch : NULL; } /** - * srpt_release_channel() - Release channel resources. - * - * Schedules the actual release because: - * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would - * trigger a deadlock. - * - It is not safe to call TCM transport_* functions from interrupt context. + * srpt_shutdown_session() - Whether or not a session may be shut down. */ -static void srpt_release_channel(struct srpt_rdma_ch *ch) +static int srpt_shutdown_session(struct se_session *se_sess) +{ + return 1; +} + +static void srpt_free_ch(struct kref *kref) { - schedule_work(&ch->release_work); + struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); + + kfree(ch); } static void srpt_release_channel_work(struct work_struct *w) @@ -2188,8 +1982,8 @@ static void srpt_release_channel_work(struct work_struct *w) struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); - pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess, - ch->release_done); + pr_debug("%s: %s-%d; release_done = %p\n", __func__, ch->sess_name, + ch->qp->qp_num, ch->release_done); sdev = ch->sport->sdev; BUG_ON(!sdev); @@ -2197,6 +1991,7 @@ static void srpt_release_channel_work(struct work_struct *w) se_sess = ch->sess; BUG_ON(!se_sess); + target_sess_cmd_list_set_waiting(se_sess); target_wait_for_sess_cmds(se_sess); transport_deregister_session_configfs(se_sess); @@ -2211,16 +2006,15 @@ static void srpt_release_channel_work(struct work_struct *w) ch->sport->sdev, ch->rq_size, ch->rsp_size, DMA_TO_DEVICE); - spin_lock_irq(&sdev->spinlock); - list_del(&ch->list); - spin_unlock_irq(&sdev->spinlock); - + mutex_lock(&sdev->mutex); + list_del_init(&ch->list); if (ch->release_done) complete(ch->release_done); + mutex_unlock(&sdev->mutex); wake_up(&sdev->ch_releaseQ); - kfree(ch); + kref_put(&ch->kref, srpt_free_ch); } /** @@ -2266,9 +2060,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); - rsp = kzalloc(sizeof *rsp, GFP_KERNEL); - rej = kzalloc(sizeof *rej, GFP_KERNEL); - rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL); + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + rej = kzalloc(sizeof(*rej), GFP_KERNEL); + rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); if (!rsp || !rej || !rep_param) { ret = -ENOMEM; @@ -2297,7 +2091,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) @@ -2305,26 +2099,16 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, && param->port == ch->sport->port && param->listen_id == ch->sport->sdev->cm_id && ch->cm_id) { - enum rdma_ch_state ch_state; - - ch_state = srpt_get_ch_state(ch); - if (ch_state != CH_CONNECTING - && ch_state != CH_LIVE) + if (srpt_disconnect_ch(ch) < 0) continue; - - /* found an existing channel */ - pr_debug("Found existing channel %s" - " cm_id= %p state= %d\n", - ch->sess_name, ch->cm_id, ch_state); - - __srpt_close_ch(ch); - + pr_info("Relogin - closed existing channel %s\n", + ch->sess_name); rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; } } - spin_unlock_irq(&sdev->spinlock); + mutex_unlock(&sdev->mutex); } else rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; @@ -2340,7 +2124,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } - ch = kzalloc(sizeof *ch, GFP_KERNEL); + ch = kzalloc(sizeof(*ch), GFP_KERNEL); if (!ch) { rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); @@ -2349,11 +2133,14 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } + kref_init(&ch->kref); + ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); memcpy(ch->i_port_id, req->initiator_port_id, 16); memcpy(ch->t_port_id, req->target_port_id, 16); ch->sport = &sdev->port[param->port - 1]; ch->cm_id = cm_id; + cm_id->context = ch; /* * Avoid QUEUE_FULL conditions by limiting the number of buffers used * for the SRP protocol to the command queue size. @@ -2453,7 +2240,7 @@ try_again: /* create cm reply */ rep_param->qp_num = ch->qp->qp_num; rep_param->private_data = (void *)rsp; - rep_param->private_data_len = sizeof *rsp; + rep_param->private_data_len = sizeof(*rsp); rep_param->rnr_retry_count = 7; rep_param->flow_control = 1; rep_param->failover_accepted = 0; @@ -2468,14 +2255,14 @@ try_again: goto release_channel; } - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); list_add_tail(&ch->list, &sdev->rch_list); - spin_unlock_irq(&sdev->spinlock); + mutex_unlock(&sdev->mutex); goto out; release_channel: - srpt_set_ch_state(ch, CH_RELEASING); + srpt_disconnect_ch(ch); transport_deregister_session_configfs(ch->sess); transport_deregister_session(ch->sess); ch->sess = NULL; @@ -2497,7 +2284,7 @@ reject: | SRP_BUF_FORMAT_INDIRECT); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, - (void *)rej, sizeof *rej); + (void *)rej, sizeof(*rej)); out: kfree(rep_param); @@ -2507,10 +2294,23 @@ out: return ret; } -static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) +static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, + enum ib_cm_rej_reason reason, + const u8 *private_data, + u8 private_data_len) { - pr_info("Received IB REJ for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); + char *priv = NULL; + int i; + + if (private_data_len && (priv = kmalloc(private_data_len * 3 + 1, + GFP_KERNEL))) { + for (i = 0; i < private_data_len; i++) + sprintf(priv + 3 * i, " %02x", private_data[i]); + } + pr_info("Received CM REJ for ch %s-%d; reason %d%s%s.\n", + ch->sess_name, ch->qp->qp_num, reason, private_data_len ? + "; private data" : "", priv ? priv : " (?)"); + kfree(priv); } /** @@ -2519,87 +2319,23 @@ static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) * An IB_CM_RTU_RECEIVED message indicates that the connection is established * and that the recipient may begin transmitting (RTU = ready to use). */ -static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) +static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) { - struct srpt_rdma_ch *ch; int ret; - ch = srpt_find_channel(cm_id->context, cm_id); - BUG_ON(!ch); - - if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) { - struct srpt_recv_ioctx *ioctx, *ioctx_tmp; - + if (srpt_set_ch_state(ch, CH_LIVE)) { ret = srpt_ch_qp_rts(ch, ch->qp); - list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list, - wait_list) { - list_del(&ioctx->wait_list); - srpt_handle_new_iu(ch, ioctx, NULL); - } - if (ret) + if (ret == 0) { + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); + } else { srpt_close_ch(ch); + } } } -static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) -{ - pr_info("Received IB TimeWait exit for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - -static void srpt_cm_rep_error(struct ib_cm_id *cm_id) -{ - pr_info("Received IB REP error for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - -/** - * srpt_cm_dreq_recv() - Process reception of a DREQ message. - */ -static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) -{ - struct srpt_rdma_ch *ch; - unsigned long flags; - bool send_drep = false; - - ch = srpt_find_channel(cm_id->context, cm_id); - BUG_ON(!ch); - - pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch)); - - spin_lock_irqsave(&ch->spinlock, flags); - switch (ch->state) { - case CH_CONNECTING: - case CH_LIVE: - send_drep = true; - ch->state = CH_DISCONNECTING; - break; - case CH_DISCONNECTING: - case CH_DRAINING: - case CH_RELEASING: - WARN(true, "unexpected channel state %d\n", ch->state); - break; - } - spin_unlock_irqrestore(&ch->spinlock, flags); - - if (send_drep) { - if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) - pr_err("Sending IB DREP failed.\n"); - pr_info("Received DREQ and sent DREP for session %s.\n", - ch->sess_name); - } -} - -/** - * srpt_cm_drep_recv() - Process reception of a DREP message. - */ -static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) -{ - pr_info("Received InfiniBand DREP message for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - /** * srpt_cm_handler() - IB connection manager callback function. * @@ -2612,6 +2348,7 @@ static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) */ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { + struct srpt_rdma_ch *ch = cm_id->context; int ret; ret = 0; @@ -2621,32 +2358,39 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) event->private_data); break; case IB_CM_REJ_RECEIVED: - srpt_cm_rej_recv(cm_id); + srpt_cm_rej_recv(ch, event->param.rej_rcvd.reason, + event->private_data, + IB_CM_REJ_PRIVATE_DATA_SIZE); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: - srpt_cm_rtu_recv(cm_id); + srpt_cm_rtu_recv(ch); break; case IB_CM_DREQ_RECEIVED: - srpt_cm_dreq_recv(cm_id); + srpt_disconnect_ch(ch); break; case IB_CM_DREP_RECEIVED: - srpt_cm_drep_recv(cm_id); + pr_info("Received CM DREP message for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); break; case IB_CM_TIMEWAIT_EXIT: - srpt_cm_timewait_exit(cm_id); + pr_info("Received CM TimeWait exit for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); break; case IB_CM_REP_ERROR: - srpt_cm_rep_error(cm_id); + pr_info("Received CM REP error for ch %s-%d.\n", ch->sess_name, + ch->qp->qp_num); break; case IB_CM_DREQ_ERROR: - pr_info("Received IB DREQ ERROR event.\n"); + pr_info("Received CM DREQ ERROR event.\n"); break; case IB_CM_MRA_RECEIVED: - pr_info("Received IB MRA event\n"); + pr_info("Received CM MRA event\n"); break; default: - pr_err("received unrecognized IB CM event %d\n", event->event); + pr_err("received unrecognized CM event %d\n", event->event); break; } @@ -2755,41 +2499,14 @@ static int srpt_write_pending_status(struct se_cmd *se_cmd) */ static int srpt_write_pending(struct se_cmd *se_cmd) { - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *ioctx; + struct srpt_send_ioctx *ioctx = + container_of(se_cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; enum srpt_command_state new_state; - enum rdma_ch_state ch_state; - int ret; - - ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); WARN_ON(new_state == SRPT_STATE_DONE); - - ch = ioctx->ch; - BUG_ON(!ch); - - ch_state = srpt_get_ch_state(ch); - switch (ch_state) { - case CH_CONNECTING: - WARN(true, "unexpected channel state %d\n", ch_state); - ret = -EINVAL; - goto out; - case CH_LIVE: - break; - case CH_DISCONNECTING: - case CH_DRAINING: - case CH_RELEASING: - pr_debug("cmd with tag %lld: channel disconnecting\n", - ioctx->cmd.tag); - srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN); - ret = -EINVAL; - goto out; - } - ret = srpt_xfer_data(ch, ioctx); - -out: - return ret; + return srpt_xfer_data(ch, ioctx); } static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) @@ -2920,36 +2637,25 @@ static void srpt_refresh_port_work(struct work_struct *work) srpt_refresh_port(sport); } -static int srpt_ch_list_empty(struct srpt_device *sdev) -{ - int res; - - spin_lock_irq(&sdev->spinlock); - res = list_empty(&sdev->rch_list); - spin_unlock_irq(&sdev->spinlock); - - return res; -} - /** * srpt_release_sdev() - Free the channel resources associated with a target. */ static int srpt_release_sdev(struct srpt_device *sdev) { - struct srpt_rdma_ch *ch, *tmp_ch; - int res; + int i, res; WARN_ON_ONCE(irqs_disabled()); BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); + for (i = 0; i < ARRAY_SIZE(sdev->port); i++) + sdev->port[i].enabled = false; + __srpt_close_all_ch(sdev); + mutex_unlock(&sdev->mutex); res = wait_event_interruptible(sdev->ch_releaseQ, - srpt_ch_list_empty(sdev)); + list_empty_careful(&sdev->rch_list)); if (res) pr_err("%s: interrupted.\n", __func__); @@ -3003,14 +2709,14 @@ static void srpt_add_one(struct ib_device *device) pr_debug("device = %p, device->dma_ops = %p\n", device, device->dma_ops); - sdev = kzalloc(sizeof *sdev, GFP_KERNEL); + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) goto err; sdev->device = device; INIT_LIST_HEAD(&sdev->rch_list); init_waitqueue_head(&sdev->ch_releaseQ); - spin_lock_init(&sdev->spinlock); + mutex_init(&sdev->mutex); sdev->pd = ib_alloc_pd(device); if (IS_ERR(sdev->pd)) @@ -3082,7 +2788,7 @@ static void srpt_add_one(struct ib_device *device) if (srpt_refresh_port(sport)) { pr_err("MAD registration failed for %s-%d.\n", - srpt_sdev_name(sdev), i); + sdev->device->name, i); goto err_ring; } snprintf(sport->port_guid, sizeof(sport->port_guid), @@ -3231,24 +2937,26 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) static void srpt_close_session(struct se_session *se_sess) { DECLARE_COMPLETION_ONSTACK(release_done); - struct srpt_rdma_ch *ch; - struct srpt_device *sdev; - unsigned long res; - - ch = se_sess->fabric_sess_ptr; - WARN_ON(ch->sess != se_sess); + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + struct srpt_device *sdev = ch->sport->sdev; + bool wait; - pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch)); + pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, + ch->state); - sdev = ch->sport->sdev; - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); BUG_ON(ch->release_done); ch->release_done = &release_done; - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); + wait = !list_empty(&ch->list); + srpt_disconnect_ch(ch); + mutex_unlock(&sdev->mutex); - res = wait_for_completion_timeout(&release_done, 60 * HZ); - WARN_ON(res == 0); + if (!wait) + return; + + while (wait_for_completion_timeout(&release_done, 180 * HZ) == 0) + pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, + ch->sess_name, ch->qp->qp_num, ch->state); } /** @@ -3456,6 +3164,8 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, { struct se_portal_group *se_tpg = to_tpg(item); struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_device *sdev = sport->sdev; + struct srpt_rdma_ch *ch; unsigned long tmp; int ret; @@ -3469,11 +3179,24 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp); return -EINVAL; } - if (tmp == 1) - sport->enabled = true; - else - sport->enabled = false; + if (sport->enabled == tmp) + goto out; + sport->enabled = tmp; + if (sport->enabled) + goto out; + + mutex_lock(&sdev->mutex); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->sport == sport) { + pr_debug("%s: ch %p %s-%d\n", __func__, ch, + ch->sess_name, ch->qp->qp_num); + srpt_disconnect_ch(ch); + srpt_close_ch(ch); + } + } + mutex_unlock(&sdev->mutex); +out: return count; } @@ -3565,7 +3288,6 @@ static struct configfs_attribute *srpt_wwn_attrs[] = { static const struct target_core_fabric_ops srpt_template = { .module = THIS_MODULE, .name = "srpt", - .node_acl_size = sizeof(struct srpt_node_acl), .get_fabric_name = srpt_get_fabric_name, .tpg_get_wwn = srpt_get_fabric_wwn, .tpg_get_tag = srpt_get_tag, diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 09037f2b0b51..af9b8b527340 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -218,20 +218,20 @@ struct srpt_send_ioctx { /** * enum rdma_ch_state - SRP channel state. - * @CH_CONNECTING: QP is in RTR state; waiting for RTU. - * @CH_LIVE: QP is in RTS state. - * @CH_DISCONNECTING: DREQ has been received; waiting for DREP - * or DREQ has been send and waiting for DREP - * or . - * @CH_DRAINING: QP is in ERR state; waiting for last WQE event. - * @CH_RELEASING: Last WQE event has been received; releasing resources. + * @CH_CONNECTING: QP is in RTR state; waiting for RTU. + * @CH_LIVE: QP is in RTS state. + * @CH_DISCONNECTING: DREQ has been sent and waiting for DREP or DREQ has + * been received. + * @CH_DRAINING: DREP has been received or waiting for DREP timed out + * and last work request has been queued. + * @CH_DISCONNECTED: Last completion has been received. */ enum rdma_ch_state { CH_CONNECTING, CH_LIVE, CH_DISCONNECTING, CH_DRAINING, - CH_RELEASING + CH_DISCONNECTED, }; /** @@ -267,6 +267,8 @@ struct srpt_rdma_ch { struct ib_cm_id *cm_id; struct ib_qp *qp; struct ib_cq *cq; + struct ib_cqe zw_cqe; + struct kref kref; int rq_size; u32 rsp_size; atomic_t sq_wr_avail; @@ -286,7 +288,6 @@ struct srpt_rdma_ch { u8 sess_name[36]; struct work_struct release_work; struct completion *release_done; - bool in_shutdown; }; /** @@ -343,7 +344,7 @@ struct srpt_port { * @ioctx_ring: Per-HCA SRQ. * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. * @ch_releaseQ: Enables waiting for removal from rch_list. - * @spinlock: Protects rch_list and tpg. + * @mutex: Protects rch_list. * @port: Information about the ports owned by this HCA. * @event_handler: Per-HCA asynchronous IB event handler. * @list: Node in srpt_dev_list. @@ -357,18 +358,10 @@ struct srpt_device { struct srpt_recv_ioctx **ioctx_ring; struct list_head rch_list; wait_queue_head_t ch_releaseQ; - spinlock_t spinlock; + struct mutex mutex; struct srpt_port port[2]; struct ib_event_handler event_handler; struct list_head list; }; -/** - * struct srpt_node_acl - Per-initiator ACL data (managed via configfs). - * @nacl: Target core node ACL information. - */ -struct srpt_node_acl { - struct se_node_acl nacl; -}; - #endif /* IB_SRPT_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index d66c690a8597..e97094598b2d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -157,7 +157,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [29] = "802.1ad offload support", [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", - [33] = "RoCEv2 support" + [33] = "RoCEv2 support", + [34] = "DMFS Sniffer support (UC & MC)" }; int i; @@ -810,6 +811,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER; MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON; diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index 1d4e2e054647..42d8de892bfe 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -752,8 +752,10 @@ static const u8 __promisc_mode[] = { [MLX4_FS_REGULAR] = 0x0, [MLX4_FS_ALL_DEFAULT] = 0x1, [MLX4_FS_MC_DEFAULT] = 0x3, - [MLX4_FS_UC_SNIFFER] = 0x4, - [MLX4_FS_MC_SNIFFER] = 0x5, + [MLX4_FS_MIRROR_RX_PORT] = 0x4, + [MLX4_FS_MIRROR_SX_PORT] = 0x5, + [MLX4_FS_UC_SNIFFER] = 0x6, + [MLX4_FS_MC_SNIFFER] = 0x7, }; int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index aac071a7e830..6ef0bfded2ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -515,7 +515,7 @@ struct mlx5e_priv { struct mlx5_uar cq_uar; u32 pdn; u32 tdn; - struct mlx5_core_mr mr; + struct mlx5_core_mkey mkey; struct mlx5e_rq drop_rq; struct mlx5e_channel **channel; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d4e1c3045200..43a148939557 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -982,7 +982,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; - c->mkey_be = cpu_to_be32(priv->mr.key); + c->mkey_be = cpu_to_be32(priv->mkey.key); c->num_tc = priv->params.num_tc; mlx5e_build_channeltc_to_txq_map(priv, ix); @@ -2194,7 +2194,7 @@ static void mlx5e_build_netdev(struct net_device *netdev) } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, - struct mlx5_core_mr *mr) + struct mlx5_core_mkey *mkey) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_create_mkey_mbox_in *in; @@ -2210,7 +2210,7 @@ static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - err = mlx5_core_create_mkey(mdev, mr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL, NULL); kvfree(in); @@ -2259,7 +2259,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_dealloc_pd; } - err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); + err = mlx5e_create_mkey(priv, priv->pdn, &priv->mkey); if (err) { mlx5_core_err(mdev, "create mkey failed, %d\n", err); goto err_dealloc_transport_domain; @@ -2333,7 +2333,7 @@ err_destroy_tises: mlx5e_destroy_tises(priv); err_destroy_mkey: - mlx5_core_destroy_mkey(mdev, &priv->mr); + mlx5_core_destroy_mkey(mdev, &priv->mkey); err_dealloc_transport_domain: mlx5_core_dealloc_transport_domain(mdev, priv->tdn); @@ -2367,7 +2367,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5e_destroy_rqt(priv, MLX5E_INDIRECTION_RQT); mlx5e_close_drop_rq(priv); mlx5e_destroy_tises(priv); - mlx5_core_destroy_mkey(priv->mdev, &priv->mr); + mlx5_core_destroy_mkey(priv->mdev, &priv->mkey); mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 6f68dba8d7ed..bf3446794bd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -77,6 +77,9 @@ #define KERNEL_NUM_PRIOS 1 #define KENREL_MIN_LEVEL 2 +#define ANCHOR_MAX_FT 1 +#define ANCHOR_NUM_PRIOS 1 +#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) struct node_caps { size_t arr_sz; long *caps; @@ -92,7 +95,7 @@ static struct init_tree_node { int max_ft; } root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 3, + .ar_size = 4, .children = (struct init_tree_node[]) { ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), @@ -108,6 +111,8 @@ static struct init_tree_node { FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), FS_CAP(flow_table_properties_nic_receive.flow_table_modify)), ADD_NS(ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, LEFTOVERS_MAX_FT))), + ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, ANCHOR_MAX_FT))), } }; @@ -196,8 +201,10 @@ static void tree_put_node(struct fs_node *node) static int tree_remove_node(struct fs_node *node) { - if (atomic_read(&node->refcount) > 1) - return -EPERM; + if (atomic_read(&node->refcount) > 1) { + atomic_dec(&node->refcount); + return -EEXIST; + } tree_put_node(node); return 0; } @@ -360,6 +367,11 @@ static void del_rule(struct fs_node *node) memcpy(match_value, fte->val, sizeof(fte->val)); fs_get_obj(ft, fg->node.parent); list_del(&rule->node.list); + if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); + mutex_unlock(&rule->dest_attr.ft->lock); + } fte->dests_size--; if (fte->dests_size) { err = mlx5_cmd_update_fte(dev, ft, @@ -465,6 +477,8 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte, ft->node.type = FS_TYPE_FLOW_TABLE; ft->type = table_type; ft->max_fte = max_fte; + INIT_LIST_HEAD(&ft->fwd_rules); + mutex_init(&ft->lock); return ft; } @@ -601,9 +615,63 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio return err; } +static int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int err = 0; + + fs_get_obj(fte, rule->node.parent); + if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return -EINVAL; + lock_ref_node(&fte->node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + err = mlx5_cmd_update_fte(get_dev(&ft->node), + ft, fg->id, fte); + unlock_ref_node(&fte->node); + + return err; +} + +/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft */ +static int connect_fwd_rules(struct mlx5_core_dev *dev, + struct mlx5_flow_table *new_next_ft, + struct mlx5_flow_table *old_next_ft) +{ + struct mlx5_flow_destination dest; + struct mlx5_flow_rule *iter; + int err = 0; + + /* new_next_ft and old_next_ft could be NULL only + * when we create/destroy the anchor flow table. + */ + if (!new_next_ft || !old_next_ft) + return 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = new_next_ft; + + mutex_lock(&old_next_ft->lock); + list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); + mutex_unlock(&old_next_ft->lock); + list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + err = mlx5_modify_rule_destination(iter, &dest); + if (err) + pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", + new_next_ft->id); + } + return 0; +} + static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, struct fs_prio *prio) { + struct mlx5_flow_table *next_ft; int err = 0; /* Connect_prev_fts and update_root_ft_create are mutually exclusive */ @@ -612,6 +680,11 @@ static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table err = connect_prev_fts(dev, ft, prio); if (err) return err; + + next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, ft, next_ft); + if (err) + return err; } if (MLX5_CAP_FLOWTABLE(dev, @@ -762,6 +835,7 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest) if (!rule) return NULL; + INIT_LIST_HEAD(&rule->next_ft); rule->node.type = FS_TYPE_FLOW_DEST; memcpy(&rule->dest_attr, dest, sizeof(*dest)); @@ -782,9 +856,14 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte, return ERR_PTR(-ENOMEM); fs_get_obj(ft, fg->node.parent); - /* Add dest to dests list- added as first element after the head */ + /* Add dest to dests list- we need flow tables to be in the + * end of the list for forward to next prio rules. + */ tree_init_node(&rule->node, 1, del_rule); - list_add_tail(&rule->node.list, &fte->node.children); + if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + list_add(&rule->node.list, &fte->node.children); + else + list_add_tail(&rule->node.list, &fte->node.children); fte->dests_size++; if (fte->dests_size == 1) err = mlx5_cmd_create_fte(get_dev(&ft->node), @@ -903,6 +982,25 @@ out: return fg; } +static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + list_for_each_entry(rule, &fte->node.children, node.list) { + if (rule->dest_attr.type == dest->type) { + if ((dest->type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + dest->vport_num == rule->dest_attr.vport_num) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + dest->ft == rule->dest_attr.ft) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_TIR && + dest->tir_num == rule->dest_attr.tir_num)) + return rule; + } + } + return NULL; +} + static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, u32 *match_value, u8 action, @@ -919,6 +1017,13 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD); if (compare_match_value(&fg->mask, match_value, &fte->val) && action == fte->action && flow_tag == fte->flow_tag) { + rule = find_flow_rule(fte, dest); + if (rule) { + atomic_inc(&rule->node.refcount); + unlock_ref_node(&fte->node); + unlock_ref_node(&fg->node); + return rule; + } rule = add_rule_fte(fte, fg, dest); unlock_ref_node(&fte->node); if (IS_ERR(rule)) @@ -984,14 +1089,14 @@ static struct mlx5_flow_rule *add_rule_to_auto_fg(struct mlx5_flow_table *ft, return rule; } -struct mlx5_flow_rule * -mlx5_add_flow_rule(struct mlx5_flow_table *ft, - u8 match_criteria_enable, - u32 *match_criteria, - u32 *match_value, - u32 action, - u32 flow_tag, - struct mlx5_flow_destination *dest) +static struct mlx5_flow_rule * +_mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) { struct mlx5_flow_group *g; struct mlx5_flow_rule *rule; @@ -1014,6 +1119,63 @@ unlock: unlock_ref_node(&ft->node); return rule; } + +static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) +{ + return ((ft->type == FS_FT_NIC_RX) && + (MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs))); +} + +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_flow_destination gen_dest; + struct mlx5_flow_table *next_ft = NULL; + struct mlx5_flow_rule *rule = NULL; + u32 sw_action = action; + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + if (action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!fwd_next_prio_supported(ft)) + return ERR_PTR(-EOPNOTSUPP); + if (dest) + return ERR_PTR(-EINVAL); + mutex_lock(&root->chain_lock); + next_ft = find_next_chained_ft(prio); + if (next_ft) { + gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + gen_dest.ft = next_ft; + dest = &gen_dest; + action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + mutex_unlock(&root->chain_lock); + return ERR_PTR(-EOPNOTSUPP); + } + } + + rule = _mlx5_add_flow_rule(ft, match_criteria_enable, match_criteria, + match_value, action, flow_tag, dest); + + if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!IS_ERR_OR_NULL(rule) && + (list_empty(&rule->next_ft))) { + mutex_lock(&next_ft->lock); + list_add(&rule->next_ft, &next_ft->fwd_rules); + mutex_unlock(&next_ft->lock); + rule->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + mutex_unlock(&root->chain_lock); + } + return rule; +} EXPORT_SYMBOL(mlx5_add_flow_rule); void mlx5_del_flow_rule(struct mlx5_flow_rule *rule) @@ -1077,6 +1239,10 @@ static int disconnect_flow_table(struct mlx5_flow_table *ft) return 0; next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, next_ft, ft); + if (err) + return err; + err = connect_prev_fts(dev, next_ft, prio); if (err) mlx5_core_warn(dev, "Failed to disconnect flow table %d\n", @@ -1126,6 +1292,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, case MLX5_FLOW_NAMESPACE_BYPASS: case MLX5_FLOW_NAMESPACE_KERNEL: case MLX5_FLOW_NAMESPACE_LEFTOVERS: + case MLX5_FLOW_NAMESPACE_ANCHOR: prio = type; break; case MLX5_FLOW_NAMESPACE_FDB: @@ -1351,6 +1518,25 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) } } +#define ANCHOR_PRIO 0 +#define ANCHOR_SIZE 1 +static int create_anchor_flow_table(struct mlx5_core_dev + *dev) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table *ft; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ANCHOR); + if (!ns) + return -EINVAL; + ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE); + if (IS_ERR(ft)) { + mlx5_core_err(dev, "Failed to create last anchor flow table"); + return PTR_ERR(ft); + } + return 0; +} + static int init_root_ns(struct mlx5_core_dev *dev) { @@ -1363,6 +1549,9 @@ static int init_root_ns(struct mlx5_core_dev *dev) set_prio_attrs(dev->priv.root_ns); + if (create_anchor_flow_table(dev)) + goto cleanup; + return 0; cleanup: @@ -1392,6 +1581,15 @@ static void cleanup_single_prio_root_ns(struct mlx5_core_dev *dev, root_ns = NULL; } +static void destroy_flow_tables(struct fs_prio *prio) +{ + struct mlx5_flow_table *iter; + struct mlx5_flow_table *tmp; + + fs_for_each_ft_safe(iter, tmp, prio) + mlx5_destroy_flow_table(iter); +} + static void cleanup_root_ns(struct mlx5_core_dev *dev) { struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; @@ -1420,6 +1618,7 @@ static void cleanup_root_ns(struct mlx5_core_dev *dev) list); fs_get_obj(obj_iter_prio2, iter_prio2); + destroy_flow_tables(obj_iter_prio2); if (tree_remove_node(iter_prio2)) { mlx5_core_warn(dev, "Priority %d wasn't destroyed, refcount > 1\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 00245fd7e4bc..f37a6248a27b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -68,6 +68,11 @@ struct fs_node { struct mlx5_flow_rule { struct fs_node node; struct mlx5_flow_destination dest_attr; + /* next_ft should be accessed under chain_lock and only of + * destination type is FWD_NEXT_fT. + */ + struct list_head next_ft; + u32 sw_action; }; /* Type of children is mlx5_flow_group */ @@ -82,6 +87,10 @@ struct mlx5_flow_table { unsigned int required_groups; unsigned int num_groups; } autogroup; + /* Protect fwd_rules */ + struct mutex lock; + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; }; /* Type of children is mlx5_flow_rule */ @@ -142,6 +151,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_list_for_each_entry(pos, root) \ list_for_each_entry(pos, root, node.list) +#define fs_list_for_each_entry_safe(pos, tmp, root) \ + list_for_each_entry_safe(pos, tmp, root, node.list) + #define fs_for_each_ns_or_ft_reverse(pos, prio) \ list_for_each_entry_reverse(pos, &(prio)->node.children, list) @@ -157,6 +169,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_for_each_ft(pos, prio) \ fs_list_for_each_entry(pos, &(prio)->node.children) +#define fs_for_each_ft_safe(pos, tmp, prio) \ + fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children) + #define fs_for_each_fg(pos, ft) \ fs_list_for_each_entry(pos, &(ft)->node.children) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 1545a944c309..0916bbc69269 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1117,7 +1117,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_cq_table(dev); mlx5_init_qp_table(dev); mlx5_init_srq_table(dev); - mlx5_init_mr_table(dev); + mlx5_init_mkey_table(dev); err = mlx5_init_fs(dev); if (err) { @@ -1164,7 +1164,7 @@ err_sriov: err_reg_dev: mlx5_cleanup_fs(dev); err_fs: - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); @@ -1237,7 +1237,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) #endif mlx5_cleanup_fs(dev); - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 6fa22b51e460..77a7293921d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,25 +36,26 @@ #include <linux/mlx5/cmd.h> #include "mlx5_core.h" -void mlx5_init_mr_table(struct mlx5_core_dev *dev) +void mlx5_init_mkey_table(struct mlx5_core_dev *dev) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; memset(table, 0, sizeof(*table)); rwlock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); } -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev) +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) { } -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_create_mkey_mbox_out lout; int err; u8 key; @@ -83,34 +84,35 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, return mlx5_cmd_status_to_err(&lout.hdr); } - mr->iova = be64_to_cpu(in->seg.start_addr); - mr->size = be64_to_cpu(in->seg.len); - mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; - mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mkey->iova = be64_to_cpu(in->seg.start_addr); + mkey->size = be64_to_cpu(in->seg.len); + mkey->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mkey->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", - be32_to_cpu(lout.mkey), key, mr->key); + be32_to_cpu(lout.mkey), key, mkey->key); - /* connect to MR tree */ + /* connect to mkey tree */ write_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->key), mr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey); write_unlock_irq(&table->lock); if (err) { - mlx5_core_warn(dev, "failed radix tree insert of mr 0x%x, %d\n", - mlx5_base_mkey(mr->key), err); - mlx5_core_destroy_mkey(dev, mr); + mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mlx5_base_mkey(mkey->key), err); + mlx5_core_destroy_mkey(dev, mkey); } return err; } EXPORT_SYMBOL(mlx5_core_create_mkey); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_destroy_mkey_mbox_in in; struct mlx5_destroy_mkey_mbox_out out; - struct mlx5_core_mr *deleted_mr; + struct mlx5_core_mkey *deleted_mkey; unsigned long flags; int err; @@ -118,16 +120,16 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) memset(&out, 0, sizeof(out)); write_lock_irqsave(&table->lock, flags); - deleted_mr = radix_tree_delete(&table->tree, mlx5_base_mkey(mr->key)); + deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); write_unlock_irqrestore(&table->lock, flags); - if (!deleted_mr) { - mlx5_core_warn(dev, "failed radix tree delete of mr 0x%x\n", - mlx5_base_mkey(mr->key)); + if (!deleted_mkey) { + mlx5_core_warn(dev, "failed radix tree delete of mkey 0x%x\n", + mlx5_base_mkey(mkey->key)); return -ENOENT; } in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); if (err) return err; @@ -139,7 +141,7 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) } EXPORT_SYMBOL(mlx5_core_destroy_mkey); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen) { struct mlx5_query_mkey_mbox_in in; @@ -149,7 +151,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, memset(out, 0, outlen); in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); if (err) return err; @@ -161,7 +163,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, } EXPORT_SYMBOL(mlx5_core_query_mkey); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey) { struct mlx5_query_special_ctxs_mbox_in in; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index a87e773e93f3..5635ce7ad693 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -324,6 +324,29 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz) +{ + u32 *in; + int err; + + in = mlx5_vzalloc(sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(ppcnt_reg, in, local_port, port_num); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + err = mlx5_core_access_reg(dev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt); + int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) { u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index c7398b95aecd..90ab09e375b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -850,3 +850,43 @@ int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); } EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); + +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, void *out, size_t out_sz) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = mlx5_vzalloc(in_sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + MLX5_SET(query_vport_counter_in, in, vport_number, 0); + } else { + err = -EPERM; + goto free; + } + } + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_vport_counter_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto free; + err = mlx5_cmd_status_to_err_v2(out); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a0e8cc8dcc67..8541a913f6a3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -219,6 +219,7 @@ enum { MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, + MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, }; enum { @@ -1160,6 +1161,8 @@ enum mlx4_net_trans_promisc_mode { MLX4_FS_REGULAR = 1, MLX4_FS_ALL_DEFAULT, MLX4_FS_MC_DEFAULT, + MLX4_FS_MIRROR_RX_PORT, + MLX4_FS_MIRROR_SX_PORT, MLX4_FS_UC_SNIFFER, MLX4_FS_MC_SNIFFER, MLX4_FS_MODE_NUM, /* should be last */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 987764afa65c..9566b3b3b2c5 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -105,6 +105,29 @@ __mlx5_mask(typ, fld)) ___t; \ }) +/* Big endian getters */ +#define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ + __mlx5_64_off(typ, fld))) + +#define MLX5_GET_BE(type_t, typ, p, fld) ({ \ + type_t tmp; \ + switch (sizeof(tmp)) { \ + case sizeof(u8): \ + tmp = (__force type_t)MLX5_GET(typ, p, fld); \ + break; \ + case sizeof(u16): \ + tmp = (__force type_t)cpu_to_be16(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u32): \ + tmp = (__force type_t)cpu_to_be32(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u64): \ + tmp = (__force type_t)MLX5_GET64_BE(typ, p, fld); \ + break; \ + } \ + tmp; \ + }) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, @@ -1284,7 +1307,8 @@ enum { MLX5_RFC_3635_COUNTERS_GROUP = 0x3, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5, MLX5_PER_PRIORITY_COUNTERS_GROUP = 0x10, - MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11 + MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, }; static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) @@ -1294,6 +1318,11 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_BY_PASS_NUM_PRIOS 9 +#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 8 +#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 8 +#define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 +#define MLX5_BY_PASS_NUM_PRIOS (MLX5_BY_PASS_NUM_REGULAR_PRIOS +\ + MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS +\ + MLX5_BY_PASS_NUM_MULTICAST_PRIOS) #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1e3006dcf35d..9108904a6a56 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -338,7 +338,7 @@ struct mlx5_core_sig_ctx { u32 sigerr_count; }; -struct mlx5_core_mr { +struct mlx5_core_mkey { u64 iova; u64 size; u32 key; @@ -426,7 +426,7 @@ struct mlx5_srq_table { struct radix_tree_root tree; }; -struct mlx5_mr_table { +struct mlx5_mkey_table { /* protect radix tree */ rwlock_t lock; @@ -484,9 +484,9 @@ struct mlx5_priv { struct mlx5_cq_table cq_table; /* end: cq staff */ - /* start: mr staff */ - struct mlx5_mr_table mr_table; - /* end: mr staff */ + /* start: mkey staff */ + struct mlx5_mkey_table mkey_table; + /* end: mkey staff */ /* start: alloc staff */ /* protect buffer alocation according to numa node */ @@ -739,16 +739,18 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_query_srq_mbox_out *out); int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); -void mlx5_init_mr_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev); -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +void mlx5_init_mkey_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); @@ -847,6 +849,8 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz); static inline int fw_initializing(struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8230caa3fb6e..8dec5508d93d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,10 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +enum { + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, +}; + #define LEFTOVERS_RULE_NUM 2 static inline void build_leftovers_ft_param(int *priority, int *n_ent, @@ -52,6 +56,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, MLX5_FLOW_NAMESPACE_LEFTOVERS, + MLX5_FLOW_NAMESPACE_ANCHOR, MLX5_FLOW_NAMESPACE_FDB, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e540fc2b..a3cacab22849 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -458,7 +458,8 @@ struct mlx5_ifc_ads_bits { }; struct mlx5_ifc_flow_table_nic_cap_bits { - u8 reserved_at_0[0x200]; + u8 nic_rx_multi_path_tirs[0x1]; + u8 reserved_at_1[0x1ff]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; @@ -736,7 +737,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; - u8 reserved_at_200[0xe]; + u8 reserved_at_200[0x3]; + u8 ipoib_basic_offloads[0x1]; + u8 reserved_at_204[0xa]; u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; @@ -767,10 +770,13 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cd[0x1]; u8 reserved_at_22c[0x1]; u8 apm[0x1]; - u8 reserved_at_22e[0x7]; + u8 reserved_at_22e[0x2]; + u8 imaicl[0x1]; + u8 reserved_at_231[0x4]; u8 qkv[0x1]; u8 pkv[0x1]; - u8 reserved_at_237[0x4]; + u8 set_deth_sqpn[0x1]; + u8 reserved_at_239[0x3]; u8 xrc[0x1]; u8 ud[0x1]; u8 uc[0x1]; @@ -1208,6 +1214,36 @@ struct mlx5_ifc_phys_layer_cntrs_bits { u8 reserved_at_640[0x180]; }; +struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { + u8 symbol_error_counter[0x10]; + + u8 link_error_recovery_counter[0x8]; + + u8 link_downed_counter[0x8]; + + u8 port_rcv_errors[0x10]; + + u8 port_rcv_remote_physical_errors[0x10]; + + u8 port_rcv_switch_relay_errors[0x10]; + + u8 port_xmit_discards[0x10]; + + u8 port_xmit_constraint_errors[0x8]; + + u8 port_rcv_constraint_errors[0x8]; + + u8 reserved_at_70[0x8]; + + u8 link_overrun_errors[0x8]; + + u8 reserved_at_80[0x10]; + + u8 vl_15_dropped[0x10]; + + u8 reserved_at_a0[0xa0]; +}; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { u8 transmit_queue_high[0x20]; @@ -1780,7 +1816,7 @@ struct mlx5_ifc_qpc_bits { u8 log_sq_size[0x4]; u8 reserved_at_55[0x6]; u8 rlky[0x1]; - u8 reserved_at_5c[0x4]; + u8 ulp_stateless_offload_mode[0x4]; u8 counter_set_id[0x8]; u8 uar_page[0x18]; @@ -2618,6 +2654,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -3126,7 +3163,8 @@ struct mlx5_ifc_query_vport_counter_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_at_60[0x60]; @@ -6954,6 +6992,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_peir_reg_bits peir_reg; struct mlx5_ifc_pelc_reg_bits pelc_reg; struct mlx5_ifc_pfcc_reg_bits pfcc_reg; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_pifr_reg_bits pifr_reg; struct mlx5_ifc_pipg_reg_bits pipg_reg; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 5b8c89ffaa58..cf031a3f16c5 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -499,7 +499,8 @@ struct mlx5_qp_context { u8 reserved2[4]; __be32 next_send_psn; __be32 cqn_send; - u8 reserved3[8]; + __be32 deth_sqpn; + u8 reserved3[4]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -621,9 +622,9 @@ static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); } -static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) +static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) { - return radix_tree_lookup(&dev->priv.mr_table.tree, key); + return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } struct mlx5_page_fault_resume_mbox_in { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 123771003e68..a9f2bcc98cab 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -92,5 +92,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, void *out, size_t out_sz); #endif /* __MLX5_VPORT_H__ */ diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 0ff049bd9ad4..37dd534cbeab 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -424,11 +424,11 @@ typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, /** * ib_mad_snoop_handler - Callback handler for snooping sent MADs. * @mad_agent: MAD agent that snooped the MAD. - * @send_wr: Work request information on the sent MAD. + * @send_buf: send MAD data buffer. * @mad_send_wc: Work completion information on the sent MAD. Valid * only for snooping that occurs on a send completion. * - * Clients snooping MADs should not modify data referenced by the @send_wr + * Clients snooping MADs should not modify data referenced by the @send_buf * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 284b00c8fea4..3a03c1d18afa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -212,6 +212,7 @@ enum ib_device_cap_flags { IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), }; enum ib_signature_prot_cap { @@ -662,10 +663,15 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * @IB_MR_TYPE_SIGNATURE: memory region that is used for * signature operations (data-integrity * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, }; /** @@ -1487,6 +1493,11 @@ enum ib_flow_domain { IB_FLOW_DOMAIN_NUM /* Must be last */ }; +enum ib_flow_flags { + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ +}; + struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; @@ -1808,7 +1819,8 @@ struct ib_device { struct scatterlist *sg, int sg_nents); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, - enum ib_mw_type type); + enum ib_mw_type type, + struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, @@ -1846,6 +1858,8 @@ struct ib_device { int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); + void (*drain_rq)(struct ib_qp *qp); + void (*drain_sq)(struct ib_qp *qp); struct ib_dma_mapping_ops *dma_ops; @@ -3094,4 +3108,7 @@ int ib_sg_to_pages(struct ib_mr *mr, int sg_nents, int (*set_page)(struct ib_mr *, u64)); +void ib_drain_rq(struct ib_qp *qp); +void ib_drain_sq(struct ib_qp *qp); +void ib_drain_qp(struct ib_qp *qp); #endif /* IB_VERBS_H */ diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 52b4a2f993f2..1852e383afd6 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -109,14 +109,13 @@ struct p9_trans_rdma { /** * p9_rdma_context - Keeps track of in-process WR * - * @wc_op: The original WR op for when the CQE completes in error. * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { - enum ib_wc_opcode wc_op; + struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; @@ -284,9 +283,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) } static void -handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; @@ -295,7 +297,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); - if (status != IB_WC_SUCCESS) + if (wc->status != IB_WC_SUCCESS) goto err_out; err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); @@ -316,21 +318,32 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, req->rc = c->rc; p9_client_cb(client, req, REQ_STATUS_RCVD); + out: + up(&rdma->rq_sem); + kfree(c); return; err_out: - p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status); + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; + goto out; } static void -handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size, DMA_TO_DEVICE); + up(&rdma->sq_sem); + kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) @@ -339,42 +352,6 @@ static void qp_event_handler(struct ib_event *event, void *context) event->event, context); } -static void cq_comp_handler(struct ib_cq *cq, void *cq_context) -{ - struct p9_client *client = cq_context; - struct p9_trans_rdma *rdma = client->trans; - int ret; - struct ib_wc wc; - - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; - - switch (c->wc_op) { - case IB_WC_RECV: - handle_recv(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->rq_sem); - break; - - case IB_WC_SEND: - handle_send(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->sq_sem); - break; - - default: - pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n", - c->wc_op, wc.opcode, wc.status); - break; - } - kfree(c); - } -} - -static void cq_event_handler(struct ib_event *e, void *v) -{ - p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); -} - static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) @@ -387,7 +364,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) - ib_destroy_cq(rdma->cq); + ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); @@ -408,13 +385,14 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; + c->cqe.done = recv_done; + sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_RECV; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, &bad_wr); @@ -499,13 +477,14 @@ dont_need_post_recv: goto send_error; } + c->cqe.done = send_done; + sge.addr = c->busa; sge.length = c->req->tc->size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_SEND; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; @@ -642,7 +621,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); @@ -695,13 +673,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Completion Queue */ - cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; - rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, - cq_event_handler, client, - &cq_attr); + rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device); |