diff options
224 files changed, 24043 insertions, 17223 deletions
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt index 9028b025501a..3ecf0c3a133f 100644 --- a/Documentation/infiniband/sysfs.txt +++ b/Documentation/infiniband/sysfs.txt @@ -78,9 +78,10 @@ HFI1 chip_reset - diagnostic (root only) boardversion - board version ports/1/ - CMgtA/ + CCMgtA/ cc_settings_bin - CCA tables used by PSM2 cc_table_bin + cc_prescan - enable prescaning for faster BECN response sc2v/ - 32 files (0 - 31) used to translate sl->vl sl2sc/ - 32 files (0 - 31) used to translate sl->sc vl2mtu/ - 16 (0 - 15) files used to determine MTU for vl diff --git a/MAINTAINERS b/MAINTAINERS index 20e6a83b8d3f..c62557e6893e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9085,6 +9085,12 @@ L: rds-devel@oss.oracle.com (moderated for non-subscribers) S: Supported F: net/rds/ +RDMAVT - RDMA verbs software +M: Dennis Dalessandro <dennis.dalessandro@intel.com> +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/sw/rdmavt + READ-COPY UPDATE (RCU) M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> M: Josh Triplett <josh@joshtriplett.org> diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 0434760cd562..6425c0e5d18a 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -83,4 +83,6 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" +source "drivers/infiniband/sw/rdmavt/Kconfig" + endif # INFINIBAND diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index dc21836b5a8d..fad0b44c356f 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_INFINIBAND) += core/ obj-$(CONFIG_INFINIBAND) += hw/ obj-$(CONFIG_INFINIBAND) += ulp/ +obj-$(CONFIG_INFINIBAND) += sw/ diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 53343ffbff7a..cb00d59da456 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1043,8 +1043,8 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s\n", - ret, device->name); + pr_warn("ib_query_port failed (%d) for %s\n", + ret, device->name); goto err; } @@ -1067,8 +1067,8 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); + pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); goto err; } } @@ -1078,8 +1078,8 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_gid(device, port, i, gid_cache->table + i, NULL); if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + pr_warn("ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); goto err; } } @@ -1161,8 +1161,7 @@ int ib_cache_setup_one(struct ib_device *device) GFP_KERNEL); if (!device->cache.pkey_cache || !device->cache.lmc_cache) { - printk(KERN_WARNING "Couldn't allocate cache " - "for %s\n", device->name); + pr_warn("Couldn't allocate cache for %s\n", device->name); return -ENOMEM; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9729639df407..93ab0ae97208 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1206,6 +1206,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; @@ -1213,6 +1217,10 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; @@ -1713,7 +1721,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -2186,8 +2194,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " - "listening on device %s\n", ret, cma_dev->device->name); + pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", + ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -3239,7 +3247,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = 0; break; default: - printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", + pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } @@ -4003,8 +4011,8 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->name, &id_priv->id); + pr_info("RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -4287,7 +4295,7 @@ static int __init cma_init(void) goto err; if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) - printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + pr_warn("RDMA CMA: failed to add netlink callback\n"); cma_configfs_init(); return 0; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 00da80e02154..10979844026a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -115,8 +115,8 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - printk(KERN_WARNING "Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + pr_warn("Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); return -EINVAL; } } @@ -255,8 +255,8 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { - printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", - device->name, client->name); + pr_warn("Couldn't allocate client context for %s/%s\n", + device->name, client->name); return -ENOMEM; } @@ -343,28 +343,29 @@ int ib_register_device(struct ib_device *device, ret = read_port_immutable(device); if (ret) { - printk(KERN_WARNING "Couldn't create per port immutable data %s\n", - device->name); + pr_warn("Couldn't create per port immutable data %s\n", + device->name); goto out; } ret = ib_cache_setup_one(device); if (ret) { - printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); goto out; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - printk(KERN_WARNING "Couldn't query the device attributes\n"); + pr_warn("Couldn't query the device attributes\n"); + ib_cache_cleanup_one(device); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - printk(KERN_WARNING "Couldn't register device %s with driver model\n", - device->name); + pr_warn("Couldn't register device %s with driver model\n", + device->name); ib_cache_cleanup_one(device); goto out; } @@ -565,8 +566,8 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, goto out; } - printk(KERN_WARNING "No client context found for %s/%s\n", - device->name, client->name); + pr_warn("No client context found for %s/%s\n", + device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); @@ -649,10 +650,23 @@ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { + union ib_gid gid; + int err; + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return device->query_port(device, port_num, port_attr); + memset(port_attr, 0, sizeof(*port_attr)); + err = device->query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + err = ib_query_gid(device, port_num, 0, &gid, NULL); + if (err) + return err; + + port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); + return 0; } EXPORT_SYMBOL(ib_query_port); @@ -959,13 +973,13 @@ static int __init ib_core_init(void) ret = class_register(&ib_class); if (ret) { - printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); + pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp; } ret = ibnl_init(); if (ret) { - printk(KERN_WARNING "Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface\n"); goto err_sysfs; } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 6ac3683c144b..cdbb1f1a6d97 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -150,8 +150,8 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", - fmr, fmr->ref_count); + pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", + fmr, fmr->ref_count); } #endif } @@ -167,7 +167,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) ret = ib_unmap_fmr(&fmr_list); if (ret) - printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); @@ -222,8 +222,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - printk(KERN_INFO PFX "Device %s does not support FMRs\n", - device->name); + pr_info(PFX "Device %s does not support FMRs\n", device->name); return ERR_PTR(-ENOSYS); } @@ -233,13 +232,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING PFX "couldn't allocate pool struct\n"); + if (!pool) return ERR_PTR(-ENOMEM); - } pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; pool->flush_arg = params->flush_arg; @@ -251,7 +247,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket, GFP_KERNEL); if (!pool->cache_bucket) { - printk(KERN_WARNING PFX "Failed to allocate cache in pool\n"); + pr_warn(PFX "Failed to allocate cache in pool\n"); ret = -ENOMEM; goto out_free_pool; } @@ -275,7 +271,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, "ib_fmr(%s)", device->name); if (IS_ERR(pool->thread)) { - printk(KERN_WARNING PFX "couldn't start cleanup thread\n"); + pr_warn(PFX "couldn't start cleanup thread\n"); ret = PTR_ERR(pool->thread); goto out_free_pool; } @@ -294,11 +290,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, for (i = 0; i < params->pool_size; ++i) { fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) { - printk(KERN_WARNING PFX "failed to allocate fmr " - "struct for FMR %d\n", i); + if (!fmr) goto out_fail; - } fmr->pool = pool; fmr->remap_count = 0; @@ -307,8 +300,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { - printk(KERN_WARNING PFX "fmr_create failed " - "for FMR %d\n", i); + pr_warn(PFX "fmr_create failed for FMR %d\n", + i); kfree(fmr); goto out_fail; } @@ -363,8 +356,8 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) } if (i < pool->pool_size) - printk(KERN_WARNING PFX "pool still has %d regions registered\n", - pool->pool_size - i); + pr_warn(PFX "pool still has %d regions registered\n", + pool->pool_size - i); kfree(pool->cache_bucket); kfree(pool); @@ -463,7 +456,7 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, list_add(&fmr->list, &pool->free_list); spin_unlock_irqrestore(&pool->pool_lock, flags); - printk(KERN_WARNING PFX "fmr_map returns %d\n", result); + pr_warn(PFX "fmr_map returns %d\n", result); return ERR_PTR(result); } @@ -517,8 +510,8 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) #ifdef DEBUG if (fmr->ref_count < 0) - printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); + pr_warn(PFX "FMR %p has ref count %d < 0\n", + fmr, fmr->ref_count); #endif spin_unlock_irqrestore(&pool->pool_lock, flags); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index ff9163dc1596..e28a160cdab0 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -50,6 +50,8 @@ #include <rdma/iw_cm.h> #include <rdma/ib_addr.h> +#include <rdma/iw_portmap.h> +#include <rdma/rdma_netlink.h> #include "iwcm.h" @@ -57,6 +59,16 @@ MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); +static struct ibnl_client_cbs iwcm_nl_cb_table[] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} +}; + static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; @@ -402,6 +414,11 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) } spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (cm_id->mapped) { + iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); + iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); + } + (void)iwcm_deref_id(cm_id_priv); } @@ -426,6 +443,97 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id) } EXPORT_SYMBOL(iw_destroy_cm_id); +/** + * iw_cm_check_wildcard - If IP address is 0 then use original + * @pm_addr: sockaddr containing the ip to check for wildcard + * @cm_addr: sockaddr containing the actual IP address + * @cm_outaddr: sockaddr to set IP addr which leaving port + * + * Checks the pm_addr for wildcard and then sets cm_outaddr's + * IP to the actual (cm_addr). + */ +static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, + struct sockaddr_storage *cm_addr, + struct sockaddr_storage *cm_outaddr) +{ + if (pm_addr->ss_family == AF_INET) { + struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; + + if (pm4_addr->sin_addr.s_addr == INADDR_ANY) { + struct sockaddr_in *cm4_addr = + (struct sockaddr_in *)cm_addr; + struct sockaddr_in *cm4_outaddr = + (struct sockaddr_in *)cm_outaddr; + + cm4_outaddr->sin_addr = cm4_addr->sin_addr; + } + } else { + struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; + + if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { + struct sockaddr_in6 *cm6_addr = + (struct sockaddr_in6 *)cm_addr; + struct sockaddr_in6 *cm6_outaddr = + (struct sockaddr_in6 *)cm_outaddr; + + cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; + } + } +} + +/** + * iw_cm_map - Use portmapper to map the ports + * @cm_id: connection manager pointer + * @active: Indicates the active side when true + * returns nonzero for error only if iwpm_create_mapinfo() fails + * + * Tries to add a mapping for a port using the Portmapper. If + * successful in mapping the IP/Port it will check the remote + * mapped IP address for a wildcard IP address and replace the + * zero IP address with the remote_addr. + */ +static int iw_cm_map(struct iw_cm_id *cm_id, bool active) +{ + struct iwpm_dev_data pm_reg_msg; + struct iwpm_sa_data pm_msg; + int status; + + cm_id->m_local_addr = cm_id->local_addr; + cm_id->m_remote_addr = cm_id->remote_addr; + + memcpy(pm_reg_msg.dev_name, cm_id->device->name, + sizeof(pm_reg_msg.dev_name)); + memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, + sizeof(pm_reg_msg.if_name)); + + if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || + !iwpm_valid_pid()) + return 0; + + cm_id->mapped = true; + pm_msg.loc_addr = cm_id->local_addr; + pm_msg.rem_addr = cm_id->remote_addr; + if (active) + status = iwpm_add_and_query_mapping(&pm_msg, + RDMA_NL_IWCM); + else + status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM); + + if (!status) { + cm_id->m_local_addr = pm_msg.mapped_loc_addr; + if (active) { + cm_id->m_remote_addr = pm_msg.mapped_rem_addr; + iw_cm_check_wildcard(&pm_msg.mapped_rem_addr, + &cm_id->remote_addr, + &cm_id->m_remote_addr); + } + } + + return iwpm_create_mapinfo(&cm_id->local_addr, + &cm_id->m_local_addr, + RDMA_NL_IWCM); +} + /* * CM_ID <-- LISTEN * @@ -452,7 +560,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + ret = iw_cm_map(cm_id, false); + if (!ret) + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); @@ -582,39 +692,37 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_IDLE) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } /* Get the ib_qp given the QPN */ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - return -EINVAL; + ret = -EINVAL; + goto err; } cm_id->device->iwcm->add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->connect(cm_id, iw_param); - if (ret) { - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); - cm_id_priv->qp = NULL; - } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - cm_id_priv->state = IW_CM_STATE_IDLE; - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); - } + ret = iw_cm_map(cm_id, true); + if (!ret) + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (!ret) + return 0; /* success */ + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; +err: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_connect); @@ -656,8 +764,23 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, goto out; cm_id->provider_data = iw_event->provider_data; - cm_id->local_addr = iw_event->local_addr; - cm_id->remote_addr = iw_event->remote_addr; + cm_id->m_local_addr = iw_event->local_addr; + cm_id->m_remote_addr = iw_event->remote_addr; + cm_id->local_addr = listen_id_priv->id.local_addr; + + ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr, + &iw_event->remote_addr, + &cm_id->remote_addr, + RDMA_NL_IWCM); + if (ret) { + cm_id->remote_addr = iw_event->remote_addr; + } else { + iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr, + &iw_event->local_addr, + &cm_id->local_addr); + iw_event->local_addr = cm_id->local_addr; + iw_event->remote_addr = cm_id->remote_addr; + } cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; @@ -753,8 +876,10 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { - cm_id_priv->id.local_addr = iw_event->local_addr; - cm_id_priv->id.remote_addr = iw_event->remote_addr; + cm_id_priv->id.m_local_addr = iw_event->local_addr; + cm_id_priv->id.m_remote_addr = iw_event->remote_addr; + iw_event->local_addr = cm_id_priv->id.local_addr; + iw_event->remote_addr = cm_id_priv->id.remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ @@ -1044,6 +1169,17 @@ EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { + int ret; + + ret = iwpm_init(RDMA_NL_IWCM); + if (ret) + pr_err("iw_cm: couldn't init iwpm\n"); + + ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS, + iwcm_nl_cb_table); + if (ret) + pr_err("iw_cm: couldn't register netlink callbacks\n"); + iwcm_wq = create_singlethread_workqueue("iw_cm_wq"); if (!iwcm_wq) return -ENOMEM; @@ -1063,6 +1199,8 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); + ibnl_remove_client(RDMA_NL_IWCM); + iwpm_exit(RDMA_NL_IWCM); } module_init(iw_cm_init); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 22a3abee2a54..43e3fa27102b 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -88,8 +88,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); if (ret) goto pid_query_error; - ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE, - pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + ret = ibnl_put_attr(skb, nlh, IFNAMSIZ, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); if (ret) goto pid_query_error; ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, @@ -394,7 +394,7 @@ register_pid_response_exit: /* always for found nlmsg_request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); - wake_up(&nlmsg_request->waitq); + up(&nlmsg_request->sem); return 0; } EXPORT_SYMBOL(iwpm_register_pid_cb); @@ -463,7 +463,7 @@ add_mapping_response_exit: /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); - wake_up(&nlmsg_request->waitq); + up(&nlmsg_request->sem); return 0; } EXPORT_SYMBOL(iwpm_add_mapping_cb); @@ -555,7 +555,7 @@ query_mapping_response_exit: /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); - wake_up(&nlmsg_request->waitq); + up(&nlmsg_request->sem); return 0; } EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb); @@ -749,7 +749,7 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); - wake_up(&nlmsg_request->waitq); + up(&nlmsg_request->sem); return 0; } EXPORT_SYMBOL(iwpm_mapping_error_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 5fb089e91353..9b2bf2fb2b00 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -254,9 +254,9 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) } int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, - struct sockaddr_storage *mapped_rem_addr, - struct sockaddr_storage *remote_addr, - u8 nl_client) + struct sockaddr_storage *mapped_rem_addr, + struct sockaddr_storage *remote_addr, + u8 nl_client) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; @@ -322,6 +322,8 @@ struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, nlmsg_request->nl_client = nl_client; nlmsg_request->request_done = 0; nlmsg_request->err_code = 0; + sema_init(&nlmsg_request->sem, 1); + down(&nlmsg_request->sem); return nlmsg_request; } @@ -364,11 +366,9 @@ struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) { int ret; - init_waitqueue_head(&nlmsg_request->waitq); - ret = wait_event_timeout(nlmsg_request->waitq, - (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT); - if (!ret) { + ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); + if (ret) { ret = -EINVAL; pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index b7b9e194ce81..af1fc14a0d3d 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -69,7 +69,7 @@ struct iwpm_nlmsg_request { u8 nl_client; u8 request_done; u16 err_code; - wait_queue_head_t waitq; + struct semaphore sem; struct kref kref; }; diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c index 1b65986c0be3..19b1ee3279b4 100644 --- a/drivers/infiniband/core/packer.c +++ b/drivers/infiniband/core/packer.c @@ -44,7 +44,7 @@ static u64 value_read(int offset, int size, void *structure) case 4: return be32_to_cpup((__be32 *) (structure + offset)); case 8: return be64_to_cpup((__be64 *) (structure + offset)); default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); return 0; } } @@ -104,9 +104,8 @@ void ib_pack(const struct ib_field *desc, } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } if (desc[i].struct_size_bytes) @@ -132,7 +131,7 @@ static void value_write(int offset, int size, u64 val, void *structure) case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; default: - printk(KERN_WARNING "Field size %d bits not handled\n", size * 8); + pr_warn("Field size %d bits not handled\n", size * 8); } } @@ -188,9 +187,8 @@ void ib_unpack(const struct ib_field *desc, } else { if (desc[i].offset_bits % 8 || desc[i].size_bits % 8) { - printk(KERN_WARNING "Structure field %s of size %d " - "bits is not byte-aligned\n", - desc[i].field_name, desc[i].size_bits); + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); } memcpy(structure + desc[i].struct_offset_bytes, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f334090bb612..d2214a55ac4a 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -864,13 +864,12 @@ static void update_sm_ah(struct work_struct *work) struct ib_ah_attr ah_attr; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { - printk(KERN_WARNING "Couldn't query port\n"); + pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); if (!new_ah) { - printk(KERN_WARNING "Couldn't allocate new SM AH\n"); return; } @@ -880,16 +879,21 @@ static void update_sm_ah(struct work_struct *work) new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) - printk(KERN_ERR "Couldn't find index for default PKey\n"); + pr_err("Couldn't find index for default PKey\n"); memset(&ah_attr, 0, sizeof ah_attr); ah_attr.dlid = port_attr.sm_lid; ah_attr.sl = port_attr.sm_sl; ah_attr.port_num = port->port_num; + if (port_attr.grh_required) { + ah_attr.ah_flags = IB_AH_GRH; + ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix); + ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); + } new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { - printk(KERN_WARNING "Couldn't create new SM AH\n"); + pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } @@ -1221,7 +1225,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, rec.net = NULL; rec.ifindex = 0; rec.gid_type = IB_GID_TYPE_IB; - memset(rec.dmac, 0, ETH_ALEN); + eth_zero_addr(rec.dmac); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -1800,13 +1804,13 @@ static int __init ib_sa_init(void) ret = ib_register_client(&sa_client); if (ret) { - printk(KERN_ERR "Couldn't register ib_sa client\n"); + pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { - printk(KERN_ERR "Couldn't initialize multicast handling\n"); + pr_err("Couldn't initialize multicast handling\n"); goto err2; } diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 6b4e8a008bc0..4a9aa0433b07 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1234,7 +1234,7 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); + pr_err("ucm: couldn't register dynamic device number\n"); return ret; } } @@ -1329,19 +1329,19 @@ static int __init ib_ucm_init(void) ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, "infiniband_cm"); if (ret) { - printk(KERN_ERR "ucm: couldn't register device number\n"); + pr_err("ucm: couldn't register device number\n"); goto error1; } ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); + pr_err("ucm: couldn't create abi_version attribute\n"); goto error2; } ret = ib_register_client(&ucm_client); if (ret) { - printk(KERN_ERR "ucm: couldn't register client\n"); + pr_err("ucm: couldn't register client\n"); goto error3; } return 0; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 8b5a934e1133..dd3bcceadfde 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -314,7 +314,7 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) } } if (!event_found) - printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n"); + pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -1716,13 +1716,13 @@ static int __init ucma_init(void) ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { - printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); + pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); if (!ucma_ctl_table_hdr) { - printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n"); + pr_err("rdma_ucm: couldn't register sysctl paths\n"); ret = -ENOMEM; goto err2; } diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 2116132568e7..29a45d2f8898 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -479,8 +479,8 @@ int ib_ud_header_unpack(void *buf, buf += IB_LRH_BYTES; if (header->lrh.link_version != 0) { - printk(KERN_WARNING "Invalid LRH.link_version %d\n", - header->lrh.link_version); + pr_warn("Invalid LRH.link_version %d\n", + header->lrh.link_version); return -EINVAL; } @@ -496,20 +496,20 @@ int ib_ud_header_unpack(void *buf, buf += IB_GRH_BYTES; if (header->grh.ip_version != 6) { - printk(KERN_WARNING "Invalid GRH.ip_version %d\n", - header->grh.ip_version); + pr_warn("Invalid GRH.ip_version %d\n", + header->grh.ip_version); return -EINVAL; } if (header->grh.next_header != 0x1b) { - printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); return -EINVAL; } break; default: - printk(KERN_WARNING "Invalid LRH.link_next_header %d\n", - header->lrh.link_next_header); + pr_warn("Invalid LRH.link_next_header %d\n", + header->lrh.link_next_header); return -EINVAL; } @@ -525,14 +525,13 @@ int ib_ud_header_unpack(void *buf, header->immediate_present = 1; break; default: - printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n", - header->bth.opcode); + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); return -EINVAL; } if (header->bth.transport_header_version != 0) { - printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n", - header->bth.transport_header_version); + pr_warn("Invalid BTH.transport_header_version %d\n", + header->bth.transport_header_version); return -EINVAL; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6ffc9c4e93af..6fdc7ecdaca0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -402,7 +402,7 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file, resp->hw_ver = attr->hw_ver; resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = attr->device_cap_flags; + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); resp->max_sge = attr->max_sge; resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; @@ -1174,6 +1174,7 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; + struct ib_udata udata; int ret; if (out_len < sizeof(resp)) @@ -1195,7 +1196,12 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, goto err_free; } - mw = pd->device->alloc_mw(pd, cmd.mw_type); + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long)cmd.response + sizeof(resp), + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof(resp)); + + mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); goto err_put; @@ -1970,7 +1976,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, resp_size); INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + resp_size, - in_len - sizeof(cmd), out_len - resp_size); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -3085,6 +3092,14 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; @@ -3413,7 +3428,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3439,7 +3455,8 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) @@ -3583,9 +3600,9 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) { - struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device_resp resp = { {0} }; struct ib_uverbs_ex_query_device cmd; - struct ib_device_attr attr; + struct ib_device_attr attr = {0}; int err; if (ucore->inlen < sizeof(cmd)) @@ -3606,14 +3623,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, if (ucore->outlen < resp.response_length) return -ENOSPC; - memset(&attr, 0, sizeof(attr)); - err = ib_dev->query_device(ib_dev, &attr, uhw); if (err) return err; copy_query_dev_fields(file, ib_dev, &resp.base, &attr); - resp.comp_mask = 0; if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) goto end; @@ -3626,9 +3640,6 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; - resp.odp_caps.reserved = 0; -#else - memset(&resp.odp_caps, 0, sizeof(resp.odp_caps)); #endif resp.response_length += sizeof(resp.odp_caps); @@ -3646,8 +3657,5 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); - if (err) - return err; - - return 0; + return err; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 39680aed99dd..28ba2cc81535 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -683,12 +683,28 @@ out: return ev_file; } +static int verify_command_mask(struct ib_device *ib_dev, __u32 command) +{ + u64 mask; + + if (command <= IB_USER_VERBS_CMD_OPEN_QP) + mask = ib_dev->uverbs_cmd_mask; + else + mask = ib_dev->uverbs_ex_cmd_mask; + + if (mask & ((u64)1 << command)) + return 0; + + return -1; +} + static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; + __u32 command; __u32 flags; int srcu_key; ssize_t ret; @@ -707,37 +723,34 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - flags = (hdr.command & - IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) { + ret = -EINVAL; + goto out; + } - if (!flags) { - __u32 command; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + if (verify_command_mask(ib_dev, command)) { + ret = -EOPNOTSUPP; + goto out; + } - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) { - ret = -EINVAL; - goto out; - } + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + ret = -EINVAL; + goto out; + } - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; + if (!flags) { if (command >= ARRAY_SIZE(uverbs_cmd_table) || !uverbs_cmd_table[command]) { ret = -EINVAL; goto out; } - if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) { - ret = -EINVAL; - goto out; - } - - if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) { - ret = -ENOSYS; - goto out; - } - if (hdr.in_words * 4 != count) { ret = -EINVAL; goto out; @@ -749,21 +762,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, hdr.out_words * 4); } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { - __u32 command; - struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; size_t written_count = count; - if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) { - ret = -EINVAL; - goto out; - } - - command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || !uverbs_ex_cmd_table[command]) { ret = -ENOSYS; @@ -775,11 +778,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } - if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) { - ret = -ENOSYS; - goto out; - } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) { ret = -EINVAL; goto out; @@ -1058,7 +1056,7 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); + pr_err("user_verbs: couldn't register dynamic device number\n"); return ret; } } @@ -1279,14 +1277,14 @@ static int __init ib_uverbs_init(void) ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register device number\n"); + pr_err("user_verbs: couldn't register device number\n"); goto out; } uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); - printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n"); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } @@ -1294,13 +1292,13 @@ static int __init ib_uverbs_init(void) ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); + pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&uverbs_client); if (ret) { - printk(KERN_ERR "user_verbs: couldn't register client\n"); + pr_err("user_verbs: couldn't register client\n"); goto out_class; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5af6d024e053..15b8adbf39c0 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1551,6 +1551,46 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, } EXPORT_SYMBOL(ib_check_mr_status); +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state) +{ + if (!device->set_vf_link_state) + return -ENOSYS; + + return device->set_vf_link_state(device, vf, port, state); +} +EXPORT_SYMBOL(ib_set_vf_link_state); + +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + if (!device->get_vf_config) + return -ENOSYS; + + return device->get_vf_config(device, vf, port, info); +} +EXPORT_SYMBOL(ib_get_vf_config); + +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats) +{ + if (!device->get_vf_stats) + return -ENOSYS; + + return device->get_vf_stats(device, vf, port, stats); +} +EXPORT_SYMBOL(ib_get_vf_stats); + +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type) +{ + if (!device->set_vf_guid) + return -ENOSYS; + + return device->set_vf_guid(device, vf, port, guid, type); +} +EXPORT_SYMBOL(ib_set_vf_guid); + /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. @@ -1567,6 +1607,8 @@ EXPORT_SYMBOL(ib_check_mr_status); * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS_REG, non of these + * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. * @@ -1657,3 +1699,167 @@ next_page: return i; } EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. + */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_send_wr swr = {}, *bad_swr; + int ret; + + if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + ret = ib_post_send(qp, &swr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. + */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}, *bad_rwr; + int ret; + + if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) { + WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT, + "IB_POLL_DIRECT poll_ctx not supported for drain\n"); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + ret = ib_post_recv(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->drain_sq) + qp->device->drain_sq(qp); + else + __ib_drain_sq(qp); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->drain_rq) + qp->device->drain_rq(qp); + else + __ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be + * IB_POLL_DIRECT. + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index f504ba73e5dc..d403231a4aff 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -1877,7 +1877,7 @@ err: static int is_loopback_dst(struct iw_cm_id *cm_id) { struct net_device *dev; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr); if (!dev) @@ -1892,10 +1892,10 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct iwch_ep *ep; struct rtable *rt; int err = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; - if (cm_id->remote_addr.ss_family != PF_INET) { + if (cm_id->m_remote_addr.ss_family != PF_INET) { err = -ENOSYS; goto out; } @@ -1961,9 +1961,9 @@ int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) state_set(&ep->com, CONNECTING); ep->tos = IPTOS_LOWDELAY; - memcpy(&ep->com.local_addr, &cm_id->local_addr, + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, sizeof(ep->com.local_addr)); - memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + memcpy(&ep->com.remote_addr, &cm_id->m_remote_addr, sizeof(ep->com.remote_addr)); /* send connect request to rnic */ @@ -1992,7 +1992,7 @@ int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) might_sleep(); - if (cm_id->local_addr.ss_family != PF_INET) { + if (cm_id->m_local_addr.ss_family != PF_INET) { err = -ENOSYS; goto fail1; } @@ -2008,7 +2008,7 @@ int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; ep->backlog = backlog; - memcpy(&ep->com.local_addr, &cm_id->local_addr, + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, sizeof(ep->com.local_addr)); /* diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 2734820d291b..42a7b8952d13 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -657,7 +657,8 @@ err: return ERR_PTR(err); } -static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct iwch_dev *rhp; struct iwch_pd *php; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index cd2ff5f9518a..651711370d55 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -302,7 +302,7 @@ void _c4iw_free_ep(struct kref *kref) if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; cxgb4_clip_release( ep->com.dev->rdev.lldi.ports[0], @@ -314,12 +314,6 @@ void _c4iw_free_ep(struct kref *kref) dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); } - if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) { - print_addr(&ep->com, __func__, "remove_mapinfo/mapping"); - iwpm_remove_mapinfo(&ep->com.local_addr, - &ep->com.mapped_local_addr); - iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); - } kfree(ep); } @@ -455,7 +449,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) state_set(&ep->com, DEAD); if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + (struct sockaddr_in6 *)&ep->com.local_addr; cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, 1); } @@ -485,12 +479,19 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) unsigned int flowclen = 80; struct fw_flowc_wr *flowc; int i; + u16 vlan = ep->l2t->vlan; + int nparams; + + if (vlan == CPL_L2T_VLAN_NONE) + nparams = 8; + else + nparams = 9; skb = get_skb(skb, flowclen, GFP_KERNEL); flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen); flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) | - FW_FLOWC_WR_NPARAMS_V(8)); + FW_FLOWC_WR_NPARAMS_V(nparams)); flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen, 16)) | FW_WR_FLOWID_V(ep->hwtid)); @@ -511,9 +512,17 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[7].val = cpu_to_be32(ep->emss); - /* Pad WR to 16 byte boundary */ - flowc->mnemval[8].mnemonic = 0; - flowc->mnemval[8].val = 0; + if (nparams == 9) { + u16 pri; + + pri = (vlan & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; + flowc->mnemval[8].val = cpu_to_be32(pri); + } else { + /* Pad WR to 16 byte boundary */ + flowc->mnemval[8].mnemonic = 0; + flowc->mnemval[8].val = 0; + } for (i = 0; i < 9; i++) { flowc->mnemval[i].r4[0] = 0; flowc->mnemval[i].r4[1] = 0; @@ -568,54 +577,6 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -/* - * c4iw_form_pm_msg - Form a port mapper message with mapping info - */ -static void c4iw_form_pm_msg(struct c4iw_ep *ep, - struct iwpm_sa_data *pm_msg) -{ - memcpy(&pm_msg->loc_addr, &ep->com.local_addr, - sizeof(ep->com.local_addr)); - memcpy(&pm_msg->rem_addr, &ep->com.remote_addr, - sizeof(ep->com.remote_addr)); -} - -/* - * c4iw_form_reg_msg - Form a port mapper message with dev info - */ -static void c4iw_form_reg_msg(struct c4iw_dev *dev, - struct iwpm_dev_data *pm_msg) -{ - memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE); - memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name, - IWPM_IFNAME_SIZE); -} - -static void c4iw_record_pm_msg(struct c4iw_ep *ep, - struct iwpm_sa_data *pm_msg) -{ - memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr, - sizeof(ep->com.mapped_local_addr)); - memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr, - sizeof(ep->com.mapped_remote_addr)); -} - -static int get_remote_addr(struct c4iw_ep *parent_ep, struct c4iw_ep *child_ep) -{ - int ret; - - print_addr(&parent_ep->com, __func__, "get_remote_addr parent_ep "); - print_addr(&child_ep->com, __func__, "get_remote_addr child_ep "); - - ret = iwpm_get_remote_info(&parent_ep->com.mapped_local_addr, - &child_ep->com.mapped_remote_addr, - &child_ep->com.remote_addr, RDMA_NL_C4IW); - if (ret) - PDBG("Unable to find remote peer addr info - err %d\n", ret); - - return ret; -} - static void best_mtu(const unsigned short *mtus, unsigned short mtu, unsigned int *idx, int use_ts, int ipv6) { @@ -645,13 +606,13 @@ static int send_connect(struct c4iw_ep *ep) int wscale; int win, sizev4, sizev6, wrlen; struct sockaddr_in *la = (struct sockaddr_in *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; struct sockaddr_in *ra = (struct sockaddr_in *) - &ep->com.mapped_remote_addr; + &ep->com.remote_addr; struct sockaddr_in6 *la6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *) - &ep->com.mapped_remote_addr; + &ep->com.remote_addr; int ret; enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type; u32 isn = (prandom_u32() & ~7UL) - 1; @@ -710,7 +671,7 @@ static int send_connect(struct c4iw_ep *ep) L2T_IDX_V(ep->l2t->idx) | TX_CHAN_V(ep->tx_chan) | SMAC_SEL_V(ep->smac_idx) | - DSCP_V(ep->tos) | + DSCP_V(ep->tos >> 2) | ULP_MODE_V(ULP_MODE_TCPDDP) | RCV_BUFSIZ_V(win); opt2 = RX_CHANNEL_V(0) | @@ -1829,10 +1790,10 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) req->le.filter = cpu_to_be32(cxgb4_select_ntuple( ep->com.dev->rdev.lldi.ports[0], ep->l2t)); - sin = (struct sockaddr_in *)&ep->com.mapped_local_addr; + sin = (struct sockaddr_in *)&ep->com.local_addr; req->le.lport = sin->sin_port; req->le.u.ipv4.lip = sin->sin_addr.s_addr; - sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr; + sin = (struct sockaddr_in *)&ep->com.remote_addr; req->le.pport = sin->sin_port; req->le.u.ipv4.pip = sin->sin_addr.s_addr; req->tcb.t_state_to_astid = @@ -1864,7 +1825,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) L2T_IDX_V(ep->l2t->idx) | TX_CHAN_V(ep->tx_chan) | SMAC_SEL_V(ep->smac_idx) | - DSCP_V(ep->tos) | + DSCP_V(ep->tos >> 2) | ULP_MODE_V(ULP_MODE_TCPDDP) | RCV_BUFSIZ_V(win)); req->tcb.opt2 = (__force __be32) (PACE_V(1) | @@ -1928,7 +1889,7 @@ static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi) static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, struct dst_entry *dst, struct c4iw_dev *cdev, - bool clear_mpa_v1, enum chip_type adapter_type) + bool clear_mpa_v1, enum chip_type adapter_type, u8 tos) { struct neighbour *n; int err, step; @@ -1958,7 +1919,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, goto out; } ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, - n, pdev, 0); + n, pdev, rt_tos2priority(tos)); if (!ep->l2t) goto out; ep->mtu = pdev->mtu; @@ -2013,13 +1974,13 @@ static int c4iw_reconnect(struct c4iw_ep *ep) { int err = 0; struct sockaddr_in *laddr = (struct sockaddr_in *) - &ep->com.cm_id->local_addr; + &ep->com.cm_id->m_local_addr; struct sockaddr_in *raddr = (struct sockaddr_in *) - &ep->com.cm_id->remote_addr; + &ep->com.cm_id->m_remote_addr; struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *) - &ep->com.cm_id->local_addr; + &ep->com.cm_id->m_local_addr; struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *) - &ep->com.cm_id->remote_addr; + &ep->com.cm_id->m_remote_addr; int iptype; __u8 *ra; @@ -2038,10 +1999,10 @@ static int c4iw_reconnect(struct c4iw_ep *ep) insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid); /* find a route */ - if (ep->com.cm_id->local_addr.ss_family == AF_INET) { + if (ep->com.cm_id->m_local_addr.ss_family == AF_INET) { ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr, raddr->sin_addr.s_addr, laddr->sin_port, - raddr->sin_port, 0); + raddr->sin_port, ep->com.cm_id->tos); iptype = 4; ra = (__u8 *)&raddr->sin_addr; } else { @@ -2058,7 +2019,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep) goto fail3; } err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false, - ep->com.dev->rdev.lldi.adapter_type); + ep->com.dev->rdev.lldi.adapter_type, + ep->com.cm_id->tos); if (err) { pr_err("%s - cannot alloc l2e.\n", __func__); goto fail4; @@ -2069,7 +2031,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep) ep->l2t->idx); state_set(&ep->com, CONNECTING); - ep->tos = 0; + ep->tos = ep->com.cm_id->tos; /* send connect request to rnic */ err = send_connect(ep); @@ -2109,10 +2071,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct sockaddr_in6 *ra6; ep = lookup_atid(t, atid); - la = (struct sockaddr_in *)&ep->com.mapped_local_addr; - ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr; - la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; - ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr; + la = (struct sockaddr_in *)&ep->com.local_addr; + ra = (struct sockaddr_in *)&ep->com.remote_addr; + la6 = (struct sockaddr_in6 *)&ep->com.local_addr; + ra6 = (struct sockaddr_in6 *)&ep->com.remote_addr; PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid, status, status2errno(status)); @@ -2154,7 +2116,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; cxgb4_clip_release( ep->com.dev->rdev.lldi.ports[0], (const u32 *) @@ -2189,7 +2151,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + (struct sockaddr_in6 *)&ep->com.local_addr; cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, 1); } @@ -2391,6 +2353,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) u16 peer_mss = ntohs(req->tcpopt.mss); int iptype; unsigned short hdrs; + u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); parent_ep = lookup_stid(t, stid); if (!parent_ep) { @@ -2399,8 +2362,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) } if (state_read(&parent_ep->com) != LISTEN) { - printk(KERN_ERR "%s - listening ep not in LISTEN\n", - __func__); + PDBG("%s - listening ep not in LISTEN\n", __func__); goto reject; } @@ -2415,7 +2377,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) ntohs(peer_port), peer_mss); dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip, local_port, peer_port, - PASS_OPEN_TOS_G(ntohl(req->tos_stid))); + tos); } else { PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n" , __func__, parent_ep, hwtid, @@ -2441,7 +2403,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) } err = import_ep(child_ep, iptype, peer_ip, dst, dev, false, - parent_ep->com.dev->rdev.lldi.adapter_type); + parent_ep->com.dev->rdev.lldi.adapter_type, tos); if (err) { printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", __func__); @@ -2459,18 +2421,9 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) child_ep->com.dev = dev; child_ep->com.cm_id = NULL; - /* - * The mapped_local and mapped_remote addresses get setup with - * the actual 4-tuple. The local address will be based on the - * actual local address of the connection, but on the port number - * of the parent listening endpoint. The remote address is - * setup based on a query to the IWPM since we don't know what it - * originally was before mapping. If no mapping was done, then - * mapped_remote == remote, and mapped_local == local. - */ if (iptype == 4) { struct sockaddr_in *sin = (struct sockaddr_in *) - &child_ep->com.mapped_local_addr; + &child_ep->com.local_addr; sin->sin_family = PF_INET; sin->sin_port = local_port; @@ -2482,12 +2435,12 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) &parent_ep->com.local_addr)->sin_port; sin->sin_addr.s_addr = *(__be32 *)local_ip; - sin = (struct sockaddr_in *)&child_ep->com.mapped_remote_addr; + sin = (struct sockaddr_in *)&child_ep->com.remote_addr; sin->sin_family = PF_INET; sin->sin_port = peer_port; sin->sin_addr.s_addr = *(__be32 *)peer_ip; } else { - sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr; + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; sin6->sin6_family = PF_INET6; sin6->sin6_port = local_port; memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); @@ -2498,18 +2451,15 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) &parent_ep->com.local_addr)->sin6_port; memcpy(sin6->sin6_addr.s6_addr, local_ip, 16); - sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_remote_addr; + sin6 = (struct sockaddr_in6 *)&child_ep->com.remote_addr; sin6->sin6_family = PF_INET6; sin6->sin6_port = peer_port; memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16); } - memcpy(&child_ep->com.remote_addr, &child_ep->com.mapped_remote_addr, - sizeof(child_ep->com.remote_addr)); - get_remote_addr(parent_ep, child_ep); c4iw_get_ep(&parent_ep->com); child_ep->parent_ep = parent_ep; - child_ep->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + child_ep->tos = tos; child_ep->dst = dst; child_ep->hwtid = hwtid; @@ -2522,7 +2472,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) accept_cr(child_ep, skb, req); set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); if (iptype == 6) { - sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr; + sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, 1); } @@ -2765,7 +2715,7 @@ out: if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; cxgb4_clip_release( ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, @@ -3026,8 +2976,8 @@ static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) { struct in_device *ind; int found = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; ind = in_dev_get(dev->rdev.lldi.ports[0]); if (!ind) @@ -3072,8 +3022,8 @@ static int get_lladdr(struct net_device *dev, struct in6_addr *addr, static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id) { struct in6_addr uninitialized_var(addr); - struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr; - struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr; + struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->m_local_addr; + struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr; if (!get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) { memcpy(la6->sin6_addr.s6_addr, &addr, 16); @@ -3092,11 +3042,8 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in *raddr; struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; - struct iwpm_dev_data pm_reg_msg; - struct iwpm_sa_data pm_msg; __u8 *ra; int iptype; - int iwpm_err = 0; if ((conn_param->ord > cur_max_read_depth(dev)) || (conn_param->ird > cur_max_read_depth(dev))) { @@ -3144,47 +3091,17 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } insert_handle(dev, &dev->atid_idr, ep, ep->atid); - memcpy(&ep->com.local_addr, &cm_id->local_addr, + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, sizeof(ep->com.local_addr)); - memcpy(&ep->com.remote_addr, &cm_id->remote_addr, + memcpy(&ep->com.remote_addr, &cm_id->m_remote_addr, sizeof(ep->com.remote_addr)); - /* No port mapper available, go with the specified peer information */ - memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, - sizeof(ep->com.mapped_local_addr)); - memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr, - sizeof(ep->com.mapped_remote_addr)); - - c4iw_form_reg_msg(dev, &pm_reg_msg); - iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); - if (iwpm_err) { - PDBG("%s: Port Mapper reg pid fail (err = %d).\n", - __func__, iwpm_err); - } - if (iwpm_valid_pid() && !iwpm_err) { - c4iw_form_pm_msg(ep, &pm_msg); - iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW); - if (iwpm_err) - PDBG("%s: Port Mapper query fail (err = %d).\n", - __func__, iwpm_err); - else - c4iw_record_pm_msg(ep, &pm_msg); - } - if (iwpm_create_mapinfo(&ep->com.local_addr, - &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { - iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW); - err = -ENOMEM; - goto fail1; - } - print_addr(&ep->com, __func__, "add_query/create_mapinfo"); - set_bit(RELEASE_MAPINFO, &ep->com.flags); - - laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr; - raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr; - laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; - raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr; + laddr = (struct sockaddr_in *)&ep->com.local_addr; + raddr = (struct sockaddr_in *)&ep->com.remote_addr; + laddr6 = (struct sockaddr_in6 *)&ep->com.local_addr; + raddr6 = (struct sockaddr_in6 *) &ep->com.remote_addr; - if (cm_id->remote_addr.ss_family == AF_INET) { + if (cm_id->m_remote_addr.ss_family == AF_INET) { iptype = 4; ra = (__u8 *)&raddr->sin_addr; @@ -3203,7 +3120,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ra, ntohs(raddr->sin_port)); ep->dst = find_route(dev, laddr->sin_addr.s_addr, raddr->sin_addr.s_addr, laddr->sin_port, - raddr->sin_port, 0); + raddr->sin_port, cm_id->tos); } else { iptype = 6; ra = (__u8 *)&raddr6->sin6_addr; @@ -3234,7 +3151,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true, - ep->com.dev->rdev.lldi.adapter_type); + ep->com.dev->rdev.lldi.adapter_type, cm_id->tos); if (err) { printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__); goto fail3; @@ -3245,7 +3162,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->l2t->idx); state_set(&ep->com, CONNECTING); - ep->tos = 0; + ep->tos = cm_id->tos; /* send connect request to rnic */ err = send_connect(ep); @@ -3269,7 +3186,7 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) { err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], @@ -3302,7 +3219,7 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) { int err; struct sockaddr_in *sin = (struct sockaddr_in *) - &ep->com.mapped_local_addr; + &ep->com.local_addr; if (dev->rdev.lldi.enable_fw_ofld_conn) { do { @@ -3343,9 +3260,6 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) int err = 0; struct c4iw_dev *dev = to_c4iw_dev(cm_id->device); struct c4iw_listen_ep *ep; - struct iwpm_dev_data pm_reg_msg; - struct iwpm_sa_data pm_msg; - int iwpm_err = 0; might_sleep(); @@ -3360,7 +3274,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) ep->com.cm_id = cm_id; ep->com.dev = dev; ep->backlog = backlog; - memcpy(&ep->com.local_addr, &cm_id->local_addr, + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, sizeof(ep->com.local_addr)); /* @@ -3369,10 +3283,10 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) if (dev->rdev.lldi.enable_fw_ofld_conn && ep->com.local_addr.ss_family == AF_INET) ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids, - cm_id->local_addr.ss_family, ep); + cm_id->m_local_addr.ss_family, ep); else ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids, - cm_id->local_addr.ss_family, ep); + cm_id->m_local_addr.ss_family, ep); if (ep->stid == -1) { printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__); @@ -3381,36 +3295,9 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) } insert_handle(dev, &dev->stid_idr, ep, ep->stid); - /* No port mapper available, go with the specified info */ - memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr, - sizeof(ep->com.mapped_local_addr)); - - c4iw_form_reg_msg(dev, &pm_reg_msg); - iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW); - if (iwpm_err) { - PDBG("%s: Port Mapper reg pid fail (err = %d).\n", - __func__, iwpm_err); - } - if (iwpm_valid_pid() && !iwpm_err) { - memcpy(&pm_msg.loc_addr, &ep->com.local_addr, - sizeof(ep->com.local_addr)); - iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW); - if (iwpm_err) - PDBG("%s: Port Mapper query fail (err = %d).\n", - __func__, iwpm_err); - else - memcpy(&ep->com.mapped_local_addr, - &pm_msg.mapped_loc_addr, - sizeof(ep->com.mapped_local_addr)); - } - if (iwpm_create_mapinfo(&ep->com.local_addr, - &ep->com.mapped_local_addr, RDMA_NL_C4IW)) { - err = -ENOMEM; - goto fail3; - } - print_addr(&ep->com, __func__, "add_mapping/create_mapinfo"); + memcpy(&ep->com.local_addr, &cm_id->m_local_addr, + sizeof(ep->com.local_addr)); - set_bit(RELEASE_MAPINFO, &ep->com.flags); state_set(&ep->com, LISTEN); if (ep->com.local_addr.ss_family == AF_INET) err = create_server4(dev, ep); @@ -3421,7 +3308,6 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto out; } -fail3: cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: @@ -3456,7 +3342,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id) goto done; err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait, 0, 0, __func__); - sin6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, 1); } @@ -3580,7 +3466,7 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, state_set(&ep->com, DEAD); if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)&ep->com.mapped_local_addr; + (struct sockaddr_in6 *)&ep->com.local_addr; cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], (const u32 *)&sin6->sin6_addr.s6_addr, 1); } diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index cf21df4a8bf5..b4eeb783573c 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -815,8 +815,15 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) } } out: - if (wq) + if (wq) { + if (unlikely(qhp->attr.state != C4IW_QP_STATE_RTS)) { + if (t4_sq_empty(wq)) + complete(&qhp->sq_drained); + if (t4_rq_empty(wq)) + complete(&qhp->rq_drained); + } spin_unlock(&qhp->lock); + } return ret; } diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 8024ea4417b8..ae2e8b23d2dd 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -87,17 +87,6 @@ struct c4iw_debugfs_data { int pos; }; -/* registered cxgb4 netlink callbacks */ -static struct ibnl_client_cbs c4iw_nl_cb_table[] = { - [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, - [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, - [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, - [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, - [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, - [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} -}; - static int count_idrs(int id, void *p, void *data) { int *countp = data; @@ -242,13 +231,13 @@ static int dump_qp(int id, void *p, void *data) if (qp->ep) { if (qp->ep->com.local_addr.ss_family == AF_INET) { struct sockaddr_in *lsin = (struct sockaddr_in *) - &qp->ep->com.local_addr; + &qp->ep->com.cm_id->local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) - &qp->ep->com.remote_addr; + &qp->ep->com.cm_id->remote_addr; struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) - &qp->ep->com.mapped_local_addr; + &qp->ep->com.cm_id->m_local_addr; struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) - &qp->ep->com.mapped_remote_addr; + &qp->ep->com.cm_id->m_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " @@ -264,15 +253,15 @@ static int dump_qp(int id, void *p, void *data) ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) - &qp->ep->com.local_addr; + &qp->ep->com.cm_id->local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) - &qp->ep->com.remote_addr; + &qp->ep->com.cm_id->remote_addr; struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) - &qp->ep->com.mapped_local_addr; + &qp->ep->com.cm_id->m_local_addr; struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) - &qp->ep->com.mapped_remote_addr; + &qp->ep->com.cm_id->m_remote_addr; cc = snprintf(qpd->buf + qpd->pos, space, "rc qp sq id %u rq id %u state %u " @@ -545,13 +534,13 @@ static int dump_ep(int id, void *p, void *data) if (ep->com.local_addr.ss_family == AF_INET) { struct sockaddr_in *lsin = (struct sockaddr_in *) - &ep->com.local_addr; + &ep->com.cm_id->local_addr; struct sockaddr_in *rsin = (struct sockaddr_in *) - &ep->com.remote_addr; + &ep->com.cm_id->remote_addr; struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) - &ep->com.mapped_local_addr; + &ep->com.cm_id->m_local_addr; struct sockaddr_in *mapped_rsin = (struct sockaddr_in *) - &ep->com.mapped_remote_addr; + &ep->com.cm_id->m_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " @@ -569,13 +558,13 @@ static int dump_ep(int id, void *p, void *data) ntohs(mapped_rsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) - &ep->com.local_addr; + &ep->com.cm_id->local_addr; struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *) - &ep->com.remote_addr; + &ep->com.cm_id->remote_addr; struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.cm_id->m_local_addr; struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *) - &ep->com.mapped_remote_addr; + &ep->com.cm_id->m_remote_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p qp %p state %d flags 0x%lx " @@ -610,9 +599,9 @@ static int dump_listen_ep(int id, void *p, void *data) if (ep->com.local_addr.ss_family == AF_INET) { struct sockaddr_in *lsin = (struct sockaddr_in *) - &ep->com.local_addr; + &ep->com.cm_id->local_addr; struct sockaddr_in *mapped_lsin = (struct sockaddr_in *) - &ep->com.mapped_local_addr; + &ep->com.cm_id->m_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " @@ -623,9 +612,9 @@ static int dump_listen_ep(int id, void *p, void *data) ntohs(mapped_lsin->sin_port)); } else { struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *) - &ep->com.local_addr; + &ep->com.cm_id->local_addr; struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *) - &ep->com.mapped_local_addr; + &ep->com.cm_id->m_local_addr; cc = snprintf(epd->buf + epd->pos, space, "ep %p cm_id %p state %d flags 0x%lx stid %d " @@ -801,10 +790,9 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, rdev->lldi.vr->cq.size); - PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p " + PDBG("udb %pR db_reg %p gts_reg %p " "qpmask 0x%x cqmask 0x%x\n", - (unsigned)pci_resource_len(rdev->lldi.pdev, 2), - (void *)pci_resource_start(rdev->lldi.pdev, 2), + &rdev->lldi.pdev->resource[2], rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpmask, rdev->cqmask); @@ -1506,20 +1494,6 @@ static int __init c4iw_init_module(void) printk(KERN_WARNING MOD "could not create debugfs entry, continuing\n"); - if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS, - c4iw_nl_cb_table)) - pr_err("%s[%u]: Failed to add netlink callback\n" - , __func__, __LINE__); - - err = iwpm_init(RDMA_NL_C4IW); - if (err) { - pr_err("port mapper initialization failed with %d\n", err); - ibnl_remove_client(RDMA_NL_C4IW); - c4iw_cm_term(); - debugfs_remove_recursive(c4iw_debugfs_root); - return err; - } - cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info); return 0; @@ -1537,8 +1511,6 @@ static void __exit c4iw_exit_module(void) } mutex_unlock(&dev_mutex); cxgb4_unregister_uld(CXGB4_ULD_RDMA); - iwpm_exit(RDMA_NL_C4IW); - ibnl_remove_client(RDMA_NL_C4IW); c4iw_cm_term(); debugfs_remove_recursive(c4iw_debugfs_root); } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index fb2de75a0392..df43f871ab61 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -476,6 +476,8 @@ struct c4iw_qp { wait_queue_head_t wait; struct timer_list timer; int sq_sig_all; + struct completion rq_drained; + struct completion sq_drained; }; static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) @@ -753,7 +755,6 @@ enum c4iw_ep_flags { CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, - RELEASE_MAPINFO = 6, }; enum c4iw_ep_history { @@ -790,8 +791,6 @@ struct c4iw_ep_common { struct mutex mutex; struct sockaddr_storage local_addr; struct sockaddr_storage remote_addr; - struct sockaddr_storage mapped_local_addr; - struct sockaddr_storage mapped_remote_addr; struct c4iw_wr_wait wr_wait; unsigned long flags; unsigned long history; @@ -843,45 +842,6 @@ struct c4iw_ep { struct c4iw_ep_stats stats; }; -static inline void print_addr(struct c4iw_ep_common *epc, const char *func, - const char *msg) -{ - -#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr)) -#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port) -#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr)) -#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port) - - if (c4iw_debug) { - switch (epc->local_addr.ss_family) { - case AF_INET: - PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n", - func, msg, SINA(&epc->local_addr), - SINP(&epc->local_addr), - SINP(&epc->mapped_local_addr), - SINA(&epc->remote_addr), - SINP(&epc->remote_addr), - SINP(&epc->mapped_remote_addr)); - break; - case AF_INET6: - PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n", - func, msg, SIN6A(&epc->local_addr), - SIN6P(&epc->local_addr), - SIN6P(&epc->mapped_local_addr), - SIN6A(&epc->remote_addr), - SIN6P(&epc->remote_addr), - SIN6P(&epc->mapped_remote_addr)); - break; - default: - break; - } - } -#undef SINA -#undef SINP -#undef SIN6A -#undef SIN6P -} - static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) { return cm_id->provider_data; @@ -961,7 +921,8 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents); int c4iw_dealloc_mw(struct ib_mw *mw); -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); @@ -1016,6 +977,8 @@ extern int c4iw_wr_log; extern int db_fc_threshold; extern int db_coalescing_threshold; extern int use_dsgl; +void c4iw_drain_rq(struct ib_qp *qp); +void c4iw_drain_sq(struct ib_qp *qp); #endif diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 7849890c4781..008be07d5604 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -34,6 +34,7 @@ #include <linux/moduleparam.h> #include <rdma/ib_umem.h> #include <linux/atomic.h> +#include <rdma/ib_user_verbs.h> #include "iw_cxgb4.h" @@ -552,7 +553,8 @@ err: return ERR_PTR(err); } -struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_pd *php; @@ -617,12 +619,14 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, int ret = 0; int length = roundup(max_num_sg * sizeof(u64), 32); + php = to_c4iw_pd(pd); + rhp = php->rhp; + if (mr_type != IB_MR_TYPE_MEM_REG || - max_num_sg > t4_max_fr_depth(use_dsgl)) + max_num_sg > t4_max_fr_depth(&rhp->rdev.lldi.ulptx_memwrite_dsgl && + use_dsgl)) return ERR_PTR(-EINVAL); - php = to_c4iw_pd(pd); - rhp = php->rhp; mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); if (!mhp) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index ec04272fbdc2..124682dc5709 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -339,7 +339,8 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro props->max_mr = c4iw_num_stags(&dev->rdev); props->max_pd = T4_MAX_NUM_PD; props->local_ca_ack_delay = 0; - props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl); + props->max_fast_reg_page_list_len = + t4_max_fr_depth(dev->rdev.lldi.ulptx_memwrite_dsgl && use_dsgl); return 0; } @@ -564,6 +565,8 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.get_protocol_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; + dev->ibdev.drain_sq = c4iw_drain_sq; + dev->ibdev.drain_rq = c4iw_drain_rq; dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index e99345eb875a..e17fb5d5e033 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -606,7 +606,7 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, } static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, - struct ib_reg_wr *wr, u8 *len16, u8 t5dev) + struct ib_reg_wr *wr, u8 *len16, bool dsgl_supported) { struct c4iw_mr *mhp = to_c4iw_mr(wr->mr); struct fw_ri_immd *imdp; @@ -615,7 +615,7 @@ static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, int pbllen = roundup(mhp->mpl_len * sizeof(u64), 32); int rem; - if (mhp->mpl_len > t4_max_fr_depth(use_dsgl)) + if (mhp->mpl_len > t4_max_fr_depth(dsgl_supported && use_dsgl)) return -EINVAL; wqe->fr.qpbinde_to_dcacpu = 0; @@ -629,7 +629,7 @@ static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, wqe->fr.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff); - if (t5dev && use_dsgl && (pbllen > max_fr_immd)) { + if (dsgl_supported && use_dsgl && (pbllen > max_fr_immd)) { struct fw_ri_dsgl *sglp; for (i = 0; i < mhp->mpl_len; i++) @@ -808,9 +808,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_FR_NSMR_WR; swsqe->opcode = FW_RI_FAST_REGISTER; err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), &len16, - is_t5( - qhp->rhp->rdev.lldi.adapter_type) ? - 1 : 0); + qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); break; case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) @@ -1621,7 +1619,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, unsigned int sqsize, rqsize; struct c4iw_ucontext *ucontext; int ret; - struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL; + struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm; + struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL; PDBG("%s ib_pd %p\n", __func__, pd); @@ -1697,6 +1696,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.max_ird = 0; qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR; spin_lock_init(&qhp->lock); + init_completion(&qhp->sq_drained); + init_completion(&qhp->rq_drained); mutex_init(&qhp->mutex); init_waitqueue_head(&qhp->wait); atomic_set(&qhp->refcnt, 1); @@ -1706,29 +1707,30 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, goto err2; if (udata) { - mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); - if (!mm1) { + sq_key_mm = kmalloc(sizeof(*sq_key_mm), GFP_KERNEL); + if (!sq_key_mm) { ret = -ENOMEM; goto err3; } - mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); - if (!mm2) { + rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL); + if (!rq_key_mm) { ret = -ENOMEM; goto err4; } - mm3 = kmalloc(sizeof *mm3, GFP_KERNEL); - if (!mm3) { + sq_db_key_mm = kmalloc(sizeof(*sq_db_key_mm), GFP_KERNEL); + if (!sq_db_key_mm) { ret = -ENOMEM; goto err5; } - mm4 = kmalloc(sizeof *mm4, GFP_KERNEL); - if (!mm4) { + rq_db_key_mm = kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL); + if (!rq_db_key_mm) { ret = -ENOMEM; goto err6; } if (t4_sq_onchip(&qhp->wq.sq)) { - mm5 = kmalloc(sizeof *mm5, GFP_KERNEL); - if (!mm5) { + ma_sync_key_mm = kmalloc(sizeof(*ma_sync_key_mm), + GFP_KERNEL); + if (!ma_sync_key_mm) { ret = -ENOMEM; goto err7; } @@ -1743,7 +1745,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, uresp.rq_size = qhp->wq.rq.size; uresp.rq_memsize = qhp->wq.rq.memsize; spin_lock(&ucontext->mmap_lock); - if (mm5) { + if (ma_sync_key_mm) { uresp.ma_sync_key = ucontext->key; ucontext->key += PAGE_SIZE; } else { @@ -1761,28 +1763,29 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); if (ret) goto err8; - mm1->key = uresp.sq_key; - mm1->addr = qhp->wq.sq.phys_addr; - mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize); - insert_mmap(ucontext, mm1); - mm2->key = uresp.rq_key; - mm2->addr = virt_to_phys(qhp->wq.rq.queue); - mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize); - insert_mmap(ucontext, mm2); - mm3->key = uresp.sq_db_gts_key; - mm3->addr = (__force unsigned long)qhp->wq.sq.bar2_pa; - mm3->len = PAGE_SIZE; - insert_mmap(ucontext, mm3); - mm4->key = uresp.rq_db_gts_key; - mm4->addr = (__force unsigned long)qhp->wq.rq.bar2_pa; - mm4->len = PAGE_SIZE; - insert_mmap(ucontext, mm4); - if (mm5) { - mm5->key = uresp.ma_sync_key; - mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0) - + PCIE_MA_SYNC_A) & PAGE_MASK; - mm5->len = PAGE_SIZE; - insert_mmap(ucontext, mm5); + sq_key_mm->key = uresp.sq_key; + sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_mmap(ucontext, sq_key_mm); + rq_key_mm->key = uresp.rq_key; + rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_mmap(ucontext, rq_key_mm); + sq_db_key_mm->key = uresp.sq_db_gts_key; + sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + sq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, sq_db_key_mm); + rq_db_key_mm->key = uresp.rq_db_gts_key; + rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; + rq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, rq_db_key_mm); + if (ma_sync_key_mm) { + ma_sync_key_mm->key = uresp.ma_sync_key; + ma_sync_key_mm->addr = + (pci_resource_start(rhp->rdev.lldi.pdev, 0) + + PCIE_MA_SYNC_A) & PAGE_MASK; + ma_sync_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, ma_sync_key_mm); } } qhp->ibqp.qp_num = qhp->wq.sq.qid; @@ -1795,15 +1798,15 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->wq.rq.memsize, attrs->cap.max_recv_wr); return &qhp->ibqp; err8: - kfree(mm5); + kfree(ma_sync_key_mm); err7: - kfree(mm4); + kfree(rq_db_key_mm); err6: - kfree(mm3); + kfree(sq_db_key_mm); err5: - kfree(mm2); + kfree(rq_key_mm); err4: - kfree(mm1); + kfree(sq_key_mm); err3: remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); err2: @@ -1888,3 +1891,17 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; return 0; } + +void c4iw_drain_sq(struct ib_qp *ibqp) +{ + struct c4iw_qp *qp = to_c4iw_qp(ibqp); + + wait_for_completion(&qp->sq_drained); +} + +void c4iw_drain_rq(struct ib_qp *ibqp) +{ + struct c4iw_qp *qp = to_c4iw_qp(ibqp); + + wait_for_completion(&qp->rq_drained); +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 21cb41a60fe8..c74ef2620b85 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -310,7 +310,7 @@ static void aliasguid_query_handler(int status, if (status) { pr_debug("(port: %d) failed: status = %d\n", cb_ctx->port, status); - rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC; + rec->time_to_run = ktime_get_boot_ns() + 1 * NSEC_PER_SEC; goto out; } @@ -416,7 +416,7 @@ next_entry: be64_to_cpu((__force __be64)rec->guid_indexes), be64_to_cpu((__force __be64)applied_guid_indexes), be64_to_cpu((__force __be64)declined_guid_indexes)); - rec->time_to_run = ktime_get_real_ns() + + rec->time_to_run = ktime_get_boot_ns() + resched_delay_sec * NSEC_PER_SEC; } else { rec->status = MLX4_GUID_INFO_STATUS_SET; @@ -708,7 +708,7 @@ static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, } } if (resched_delay_sec) { - u64 curr_time = ktime_get_real_ns(); + u64 curr_time = ktime_get_boot_ns(); *resched_delay_sec = (low_record_time < curr_time) ? 0 : div_u64((low_record_time - curr_time), NSEC_PER_SEC); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1c7ab6cabbb8..914bc98e753f 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1643,6 +1643,56 @@ static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_ return err; } +static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, + struct ib_flow_attr *flow_attr, + enum mlx4_net_trans_promisc_mode *type) +{ + int err = 0; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER) || + (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC) || + (flow_attr->num_of_specs > 1) || (flow_attr->priority != 0)) { + return -EOPNOTSUPP; + } + + if (flow_attr->num_of_specs == 0) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + union ib_flow_spec *ib_spec; + + ib_spec = (union ib_flow_spec *)(flow_attr + 1); + if (ib_spec->type != IB_FLOW_SPEC_ETH) + return -EINVAL; + + /* if all is zero than MC and UC */ + if (is_zero_ether_addr(ib_spec->eth.mask.dst_mac)) { + type[0] = MLX4_FS_MC_SNIFFER; + type[1] = MLX4_FS_UC_SNIFFER; + } else { + u8 mac[ETH_ALEN] = {ib_spec->eth.mask.dst_mac[0] ^ 0x01, + ib_spec->eth.mask.dst_mac[1], + ib_spec->eth.mask.dst_mac[2], + ib_spec->eth.mask.dst_mac[3], + ib_spec->eth.mask.dst_mac[4], + ib_spec->eth.mask.dst_mac[5]}; + + /* Above xor was only on MC bit, non empty mask is valid + * only if this bit is set and rest are zero. + */ + if (!is_zero_ether_addr(&mac[0])) + return -EINVAL; + + if (is_multicast_ether_addr(ib_spec->eth.val.dst_mac)) + type[0] = MLX4_FS_MC_SNIFFER; + else + type[0] = MLX4_FS_UC_SNIFFER; + } + } + + return err; +} + static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) @@ -1653,6 +1703,10 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + (flow_attr->type != IB_FLOW_ATTR_NORMAL)) + return ERR_PTR(-EOPNOTSUPP); + memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); @@ -1663,7 +1717,19 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: - type[0] = MLX4_FS_REGULAR; + /* If dont trap flag (continue match) is set, under specific + * condition traffic be replicated to given qp, + * without stealing it + */ + if (unlikely(flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)) { + err = mlx4_ib_add_dont_trap_rule(dev, + flow_attr, + type); + if (err) + goto err_free; + } else { + type[0] = MLX4_FS_REGULAR; + } break; case IB_FLOW_ATTR_ALL_DEFAULT: @@ -1675,8 +1741,8 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, break; case IB_FLOW_ATTR_SNIFFER: - type[0] = MLX4_FS_UC_SNIFFER; - type[1] = MLX4_FS_MC_SNIFFER; + type[0] = MLX4_FS_MIRROR_RX_PORT; + type[1] = MLX4_FS_MIRROR_SX_PORT; break; default: diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 52ce7b000044..1eca01cebe51 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -711,7 +711,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 242b94ec105b..ce0b5aa8eb9b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -32,6 +32,7 @@ */ #include <linux/slab.h> +#include <rdma/ib_user_verbs.h> #include "mlx4_ib.h" @@ -334,7 +335,8 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } -struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_mw *mw; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 27a70159e2ea..7493a83acd28 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index fd1de31e0611..a00ba4418de9 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -207,7 +207,10 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; - wc->wc_flags = 0; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; break; case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; @@ -431,7 +434,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mr *mmr; + struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; @@ -536,17 +539,17 @@ repoll: case MLX5_CQE_SIG_ERR: sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; - read_lock(&dev->mdev->priv.mr_table.lock); - mmr = __mlx5_mr_lookup(dev->mdev, - mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - if (unlikely(!mmr)) { - read_unlock(&dev->mdev->priv.mr_table.lock); + read_lock(&dev->mdev->priv.mkey_table.lock); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + if (unlikely(!mmkey)) { + read_unlock(&dev->mdev->priv.mkey_table.lock); mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); return -EINVAL; } - mr = to_mibmr(mmr); + mr = to_mibmr(mmkey); get_sig_err_item(sig_err_cqe, &mr->sig->err_item); mr->sig->sig_err_exists = true; mr->sig->sigerr_count++; @@ -558,25 +561,51 @@ repoll: mr->sig->err_item.expected, mr->sig->err_item.actual); - read_unlock(&dev->mdev->priv.mr_table.lock); + read_unlock(&dev->mdev->priv.mkey_table.lock); goto repoll; } return 0; } +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx5_ib_cq *cq = to_mcq(ibcq); struct mlx5_ib_qp *cur_qp = NULL; unsigned long flags; + int soft_polled = 0; int npolled; int err = 0; spin_lock_irqsave(&cq->lock, flags); - for (npolled = 0; npolled < num_entries; npolled++) { - err = mlx5_poll_one(cq, &cur_qp, wc + npolled); + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + err = mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled); if (err) break; } @@ -587,7 +616,7 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) - return npolled; + return soft_polled + npolled; else return err; } @@ -595,16 +624,27 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; - mlx5_cq_arm(&to_mcq(ibcq)->mcq, + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, uar_page, MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), to_mcq(ibcq)->mcq.cons_index); - return 0; + return ret; } static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, @@ -757,6 +797,14 @@ static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) mlx5_db_free(dev->mdev, &cq->db); } +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -807,6 +855,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, &index, &inlen); if (err) goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } cq->cqe_size = cqe_size; @@ -832,6 +882,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; + INIT_LIST_HEAD(&cq->wc_list); + if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { err = -EFAULT; @@ -1219,3 +1271,27 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) cq = to_mcq(ibcq); return cq->cqe_size; } + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c new file mode 100644 index 000000000000..53e03c8ede79 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct ib_wc wc; + int send_flags; + bool completed:1; +}; + +struct mlx5_ib_gsi_qp { + struct ib_qp ibqp; + struct ib_qp *rx_qp; + u8 port_num; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + /* Serialize qp state modifications */ + struct mutex mutex; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index) +{ + return ++index % gsi->cap.max_send_wr; +} + +#define for_each_outstanding_wr(gsi, index) \ + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \ + index = next_outstanding(gsi, index)) + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for_each_outstanding_wr(gsi, index) { + wr = &gsi->outstanding_wrs[index]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); +err_free: + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int qp_index; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + kfree(gsi); + + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi]; + gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi); + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp *tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi = (gsi->outstanding_pi - 1) % + gsi->cap.max_send_wr; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c new file mode 100644 index 000000000000..c1b9de800fe5 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/mlx5/vport.h> +#include "mlx5_ib.h" + +static inline u32 mlx_to_net_policy(enum port_state_policy mlx_policy) +{ + switch (mlx_policy) { + case MLX5_POLICY_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_POLICY_UP: + return IFLA_VF_LINK_STATE_ENABLE; + case MLX5_POLICY_FOLLOW: + return IFLA_VF_LINK_STATE_AUTO; + default: + return __IFLA_VF_LINK_STATE_MAX; + } +} + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(mdev, 1, 1, vf + 1, rep); + if (err) { + mlx5_ib_warn(dev, "failed to query port policy for vf %d (%d)\n", + vf, err); + goto free; + } + memset(info, 0, sizeof(*info)); + info->linkstate = mlx_to_net_policy(rep->policy); + if (info->linkstate == __IFLA_VF_LINK_STATE_MAX) + err = -EINVAL; + +free: + kfree(rep); + return err; +} + +static inline enum port_state_policy net_to_mlx_policy(int policy) +{ + switch (policy) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_POLICY_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_POLICY_UP; + case IFLA_VF_LINK_STATE_AUTO: + return MLX5_POLICY_FOLLOW; + default: + return MLX5_POLICY_INVALID; + } +} + +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->policy = net_to_mlx_policy(state); + if (in->policy == MLX5_POLICY_INVALID) { + err = -EINVAL; + goto out; + } + in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + +out: + kfree(in); + return err; +} + +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *dev; + void *out; + int err; + + dev = to_mdev(device); + mdev = dev->mdev; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz); + if (err) + goto ex; + + stats->rx_packets = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.packets); + stats->tx_packets = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.packets); + stats->rx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.octets); + stats->tx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.octets); + stats->multicast = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_multicast.packets); + +ex: + kfree(out); + return err; +} + +static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; + in->node_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + kfree(in); + return err; +} + +static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; + in->port_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + kfree(in); + return err; +} + +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type) +{ + if (type == IFLA_VF_IB_NODE_GUID) + return set_vf_node_guid(device, vf, port, guid); + else if (type == IFLA_VF_IB_PORT_GUID) + return set_vf_port_guid(device, vf, port, guid); + + return -EINVAL; +} diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index b84d13a487cc..1534af113058 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -31,8 +31,10 @@ */ #include <linux/mlx5/cmd.h> +#include <linux/mlx5/vport.h> #include <rdma/ib_mad.h> #include <rdma/ib_smi.h> +#include <rdma/ib_pma.h> #include "mlx5_ib.h" enum { @@ -57,20 +59,12 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port); } -int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - const struct ib_wc *in_wc, const struct ib_grh *in_grh, - const struct ib_mad_hdr *in, size_t in_mad_size, - struct ib_mad_hdr *out, size_t *out_mad_size, - u16 *out_mad_pkey_index) +static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid; int err; - const struct ib_mad *in_mad = (const struct ib_mad *)in; - struct ib_mad *out_mad = (struct ib_mad *)out; - - if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || - *out_mad_size != sizeof(*out_mad))) - return IB_MAD_RESULT_FAILURE; slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); @@ -117,6 +111,156 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + int err; + void *out_cnt; + + /* Decalring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_vport_counter(dev->mdev, 0, 0, + port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + + kvfree(out_cnt); + if (err) + return IB_MAD_RESULT_FAILURE; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + memset(out_mad->data, 0, sizeof(out_mad->data)); + + if (MLX5_CAP_GEN(mdev, vport_counters) && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { + return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + } else { + return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + in_mad, out_mad); + } +} + int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) { struct ib_smp *in_mad = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 03c418ccbc98..e305990b73f6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -283,7 +283,7 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { - return !dev->mdev->issi; + return !MLX5_CAP_GEN(dev->mdev, ib_virt); } enum { @@ -487,6 +487,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; @@ -504,6 +511,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, (MLX5_CAP_ETH(dev->mdev, csum_cap))) props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -529,7 +541,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; - props->max_fast_reg_page_list_len = (unsigned int)-1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); @@ -549,6 +562,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, cd)) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + if (!mlx5_core_is_pf(mdev)) + props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + return 0; } @@ -686,6 +702,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, props->qkey_viol_cntr = rep->qkey_violation_counter; props->subnet_timeout = rep->subnet_timeout; props->init_type_reply = rep->init_type_reply; + props->grh_required = rep->grh_required; err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port); if (err) @@ -1369,11 +1386,20 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) return 0; } +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + #define MLX5_FS_MAX_TYPES 10 #define MLX5_FS_MAX_ENTRIES 32000UL static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr) { + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_table *ft; @@ -1383,10 +1409,12 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int err = 0; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (flow_is_multicast_only(flow_attr)) + if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = flow_attr->priority; + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); num_entries = MLX5_FS_MAX_ENTRIES; @@ -1434,6 +1462,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, unsigned int spec_index; u32 *match_c; u32 *match_v; + u32 action; int err = 0; if (!is_valid_attr(flow_attr)) @@ -1459,9 +1488,11 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, /* Outer header support only */ match_criteria_enable = (!outer_header_zero(match_c)) << 0; + action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable, match_c, match_v, - MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + action, MLX5_FS_DEFAULT_FLOW_TAG, dst); @@ -1481,6 +1512,29 @@ free: return err ? ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rule(handler->rule); + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} enum { LEFTOVERS_MC, LEFTOVERS_UC, @@ -1558,7 +1612,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || - flow_attr->flags) + (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); dst = kzalloc(sizeof(*dst), GFP_KERNEL); @@ -1577,8 +1631,13 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn; if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + handler = create_flow_rule(dev, ft_prio, flow_attr, + dst); + } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { handler = create_leftovers_rule(dev, ft_prio, flow_attr, @@ -1716,6 +1775,17 @@ static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_reg_pages, }; +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -1752,6 +1822,8 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; + + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: @@ -1838,7 +1910,7 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); mlx5_ib_destroy_qp(dev->umrc.qp); - ib_destroy_cq(dev->umrc.cq); + ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } @@ -1853,7 +1925,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev) struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - struct ib_cq_init_attr cq_attr = {}; int ret; attr = kzalloc(sizeof(*attr), GFP_KERNEL); @@ -1870,15 +1941,12 @@ static int create_umr_res(struct mlx5_ib_dev *dev) goto error_0; } - cq_attr.cqe = 128; - cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, - &cq_attr); + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); if (IS_ERR(cq)) { mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); ret = PTR_ERR(cq); goto error_2; } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); init_attr->send_cq = cq; init_attr->recv_cq = cq; @@ -1945,7 +2013,7 @@ error_4: mlx5_ib_destroy_qp(qp); error_3: - ib_destroy_cq(cq); + ib_free_cq(cq); error_2: ib_dealloc_pd(pd); @@ -1961,10 +2029,13 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); + mutex_init(&devr->mutex); + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); @@ -2052,6 +2123,12 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } + return 0; error5: @@ -2070,12 +2147,20 @@ error0: static void destroy_dev_resources(struct mlx5_ib_resources *devr) { + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) @@ -2198,6 +2283,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2258,6 +2344,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; @@ -2266,9 +2353,23 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; + if (mlx5_core_is_pf(mdev)) { + dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; + dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; + dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; + dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; + } mlx5_ib_internal_fill_odp_caps(dev); + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d2b9737baa36..f16c818ad2e6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -43,6 +43,7 @@ #include <linux/mlx5/srq.h> #include <linux/types.h> #include <linux/mlx5/transobj.h> +#include <rdma/ib_user_verbs.h> #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -126,7 +127,7 @@ struct mlx5_ib_pd { }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) -#define MLX5_IB_FLOW_LAST_PRIO (MLX5_IB_FLOW_MCAST_PRIO - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) #error "Invalid number of bypass priorities" #endif @@ -162,9 +163,31 @@ struct mlx5_ib_flow_db { #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) + +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END + #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + struct wr_list { u16 opcode; u16 next; @@ -325,11 +348,14 @@ struct mlx5_ib_cq_buf { }; enum mlx5_ib_qp_flags { - MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, - MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, - MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, - MLX5_IB_QP_MANAGED_SEND = 1 << 3, - MLX5_IB_QP_MANAGED_RECV = 1 << 4, + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, }; struct mlx5_umr_wr { @@ -373,6 +399,14 @@ struct mlx5_ib_cq { struct ib_umem *resize_umem; int cqe_size; u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; }; struct mlx5_ib_srq { @@ -413,7 +447,8 @@ struct mlx5_ib_mr { int ndescs; int max_descs; int desc_size; - struct mlx5_core_mr mmr; + int access_mode; + struct mlx5_core_mkey mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; @@ -425,19 +460,20 @@ struct mlx5_ib_mr { struct mlx5_core_sig_ctx *sig; int live; void *descs_alloc; + int access_flags; /* Needed for rereg MR */ +}; + +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mkey mmkey; }; struct mlx5_ib_umr_context { + struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; -static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) -{ - context->status = -1; - init_completion(&context->done); -} - struct umr_common { struct ib_pd *pd; struct ib_cq *cq; @@ -487,6 +523,14 @@ struct mlx5_mr_cache { unsigned long last_add; }; +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct ib_xrcd *x0; @@ -494,6 +538,9 @@ struct mlx5_ib_resources { struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; }; struct mlx5_roce { @@ -558,9 +605,9 @@ static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } -static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey) { - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) @@ -588,6 +635,11 @@ static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx5_ib_mr, ibmr); } +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; @@ -648,8 +700,14 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, int zap); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, @@ -700,7 +758,6 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); @@ -719,6 +776,14 @@ void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, + u8 port, struct ifla_vf_info *info); +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state); +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats); +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) @@ -739,6 +804,23 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -758,7 +840,7 @@ static inline u8 convert_access(int acc) static inline int is_qp1(enum ib_qp_type qp_type) { - return qp_type == IB_QPT_GSI; + return qp_type == MLX5_IB_QPT_HW_GSI; } #define MLX5_MAX_UMR_SHIFT 16 diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 6000f7aeede9..4d5bff151cdf 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -40,6 +40,7 @@ #include <rdma/ib_umem_odp.h> #include <rdma/ib_verbs.h> #include "mlx5_ib.h" +#include "user.h" enum { MAX_PENDING_REG_MR = 8, @@ -57,7 +58,7 @@ static int clean_mr(struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* Wait until all page fault handlers using the mr complete. */ @@ -77,6 +78,40 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + static void reg_mr_callback(int status, void *context) { struct mlx5_ib_mr *mr = context; @@ -86,7 +121,7 @@ static void reg_mr_callback(int status, void *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; int err; spin_lock_irqsave(&ent->lock, flags); @@ -113,7 +148,7 @@ static void reg_mr_callback(int status, void *context) spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); - mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + mr->mmkey.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; cache->last_add = jiffies; @@ -124,10 +159,10 @@ static void reg_mr_callback(int status, void *context) spin_unlock_irqrestore(&ent->lock, flags); write_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key), - &mr->mmr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey); if (err) - pr_err("Error inserting to mr tree. 0x%x\n", -err); + pr_err("Error inserting to mkey tree. 0x%x\n", -err); write_unlock_irqrestore(&table->lock, flags); } @@ -168,7 +203,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), reg_mr_callback, mr, &mr->out); if (err) { @@ -657,14 +692,14 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_in; kfree(in); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; return &mr->ibmr; @@ -693,10 +728,40 @@ static int use_umr(int order) return order <= MLX5_MAX_UMR_SHIFT; } -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift, u64 virt_addr, u64 len, - int access_flags) +static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int npages, int page_shift, int *size, + __be64 **mr_pas, dma_addr_t *dma) +{ + __be64 *pas; + struct device *ddev = dev->ib_dev.dma_device; + + /* + * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. + */ + *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!(*mr_pas)) + return -ENOMEM; + + pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, *size - npages * sizeof(u64)); + + *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, *dma)) { + kfree(*mr_pas); + return -ENOMEM; + } + + return 0; +} + +static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_umr_wr *umrwr = umr_wr(wr); @@ -706,7 +771,6 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, sg->lkey = dev->umrc.pd->local_dma_lkey; wr->next = NULL; - wr->send_flags = 0; wr->sg_list = sg; if (n) wr->num_sge = 1; @@ -718,6 +782,19 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, umrwr->npages = n; umrwr->page_shift = page_shift; umrwr->mkey = key; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); + + wr->send_flags = 0; + umrwr->target.virt_addr = virt_addr; umrwr->length = len; umrwr->access_flags = access_flags; @@ -734,26 +811,45 @@ static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, umrwr->mkey = key; } -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, int *npages, + int *page_shift, int *ncont, int *order) { - struct mlx5_ib_umr_context *context; - struct ib_wc wc; - int err; - - while (1) { - err = ib_poll_cq(cq, 1, &wc); - if (err < 0) { - pr_warn("poll cq error %d\n", err); - return; - } - if (err == 0) - break; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); + if (IS_ERR(umem)) { + mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } - context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id; - context->status = wc.status; - complete(&context->done); + mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(umem); + return ERR_PTR(-EINVAL); } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return umem; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); } static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, @@ -764,13 +860,12 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, struct device *ddev = dev->ib_dev.dma_device; struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; struct mlx5_ib_mr *mr; struct ib_sge sg; int size; __be64 *mr_pas; - __be64 *pas; dma_addr_t dma; int err = 0; int i; @@ -790,33 +885,17 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. - * To avoid copying garbage after the pas array, we allocate - * a little more. */ - size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); - mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!mr_pas) { - err = -ENOMEM; + err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, + &dma); + if (err) goto free_mr; - } - pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); - /* Clear padding after the actual pages. */ - memset(pas + npages, 0, size - npages * sizeof(u64)); - - dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, dma)) { - err = -ENOMEM; - goto free_pas; - } + mlx5_ib_init_umr_context(&umr_context); - memset(&umrwr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmr.key, + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, page_shift, virt_addr, len, access_flags); - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -830,9 +909,9 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, } } - mr->mmr.iova = virt_addr; - mr->mmr.size = len; - mr->mmr.pd = to_mpd(pd)->pdn; + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; mr->live = 1; @@ -840,7 +919,6 @@ unmap_dma: up(&umrc->sem); dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); -free_pas: kfree(mr_pas); free_mr: @@ -929,8 +1007,10 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + mlx5_ib_init_umr_context(&umr_context); + memset(&wr, 0, sizeof(wr)); - wr.wr.wr_id = (u64)(unsigned long)&umr_context; + wr.wr.wr_cqe = &umr_context.cqe; sg.addr = dma; sg.length = ALIGN(npages * sizeof(u64), @@ -944,10 +1024,9 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, wr.wr.opcode = MLX5_IB_WR_UMR; wr.npages = sg.length / sizeof(u64); wr.page_shift = PAGE_SHIFT; - wr.mkey = mr->mmr.key; + wr.mkey = mr->mmkey.key; wr.target.offset = start_page_index; - mlx5_ib_init_umr_context(&umr_context); down(&umrc->sem); err = ib_post_send(umrc->qp, &wr.wr, &bad); if (err) { @@ -974,10 +1053,14 @@ free_pas: } #endif -static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, - u64 length, struct ib_umem *umem, - int npages, int page_shift, - int access_flags) +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; @@ -986,7 +1069,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -1013,7 +1096,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen, NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); @@ -1024,7 +1107,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, mr->live = 1; kvfree(in); - mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); return mr; @@ -1032,11 +1115,23 @@ err_2: kvfree(in); err_1: - kfree(mr); + if (!ibmr) + kfree(mr); return ERR_PTR(err); } +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) +{ + mr->npages = npages; + atomic_add(npages, &dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->access_flags = access_flags; +} + struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -1052,22 +1147,11 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); - umem = ib_umem_get(pd->uobject->context, start, length, access_flags, - 0); - if (IS_ERR(umem)) { - mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); - return (void *)umem; - } + umem = mr_umem_get(pd, start, length, access_flags, &npages, + &page_shift, &ncont, &order); - mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); - if (!npages) { - mlx5_ib_warn(dev, "avoid zero region\n"); - err = -EINVAL; - goto error; - } - - mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", - npages, ncont, order, page_shift); + if (IS_ERR(umem)) + return (void *)umem; if (use_umr(order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, @@ -1083,45 +1167,21 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } if (!mr) - mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, - access_flags); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto error; } - mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); mr->umem = umem; - mr->npages = npages; - atomic_add(npages, &dev->mdev->priv.reg_pages); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + set_mr_fileds(dev, mr, npages, length, access_flags); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem->odp_data) { - /* - * This barrier prevents the compiler from moving the - * setting of umem->odp_data->private to point to our - * MR, before reg_umr finished, to ensure that the MR - * initialization have finished before starting to - * handle invalidations. - */ - smp_wmb(); - mr->umem->odp_data->private = mr; - /* - * Make sure we will see the new - * umem->odp_data->private value in the invalidation - * routines, before we can get page faults on the - * MR. Page faults can happen once we put the MR in - * the tree, below this line. Without the barrier, - * there can be a fault handling and an invalidation - * before umem->odp_data->private == mr is visible to - * the invalidation handler. - */ - smp_wmb(); - } + update_odp_mr(mr); #endif return &mr->ibmr; @@ -1135,15 +1195,15 @@ static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct umr_common *umrc = &dev->umrc; struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr; + struct mlx5_umr_wr umrwr = {}; struct ib_send_wr *bad; int err; - memset(&umrwr.wr, 0, sizeof(umrwr)); - umrwr.wr.wr_id = (u64)(unsigned long)&umr_context; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmr.key); - mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); + down(&umrc->sem); err = ib_post_send(umrc->qp, &umrwr.wr, &bad); if (err) { @@ -1165,6 +1225,167 @@ error: return err; } +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, + u64 length, int npages, int page_shift, int order, + int access_flags, int flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr *bad; + struct mlx5_umr_wr umrwr = {}; + struct ib_sge sg; + struct umr_common *umrc = &dev->umrc; + dma_addr_t dma = 0; + __be64 *mr_pas = NULL; + int size; + int err; + + mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + if (flags & IB_MR_REREG_TRANS) { + err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, + &mr_pas, &dma); + if (err) + return err; + + umrwr.target.virt_addr = virt_addr; + umrwr.length = length; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } + + prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + page_shift); + + if (flags & IB_MR_REREG_PD) { + umrwr.pd = pd; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; + } + + if (flags & IB_MR_REREG_ACCESS) { + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; + } + + /* post send request to UMR QP */ + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + + up(&umrc->sem); + if (flags & IB_MR_REREG_TRANS) { + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + kfree(mr_pas); + } + return err; +} + +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; + u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; + int page_shift = 0; + int npages = 0; + int ncont = 0; + int order = 0; + int err; + + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, virt_addr, length, access_flags); + + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages, + &page_shift, &ncont, &order); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + mr->umem = NULL; + return err; + } + } + + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. + */ + if (mr->umred) { + err = unreg_umr(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to unregister MR\n"); + } else { + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to destroy MKey\n"); + } + if (err) + return err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->umred = 0; + } else { + /* + * Send a UMR WQE + */ + err = rereg_umr(pd, mr, addr, len, npages, page_shift, + order, access_flags, flags); + if (err) { + mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + return err; + } + } + + if (flags & IB_MR_REREG_PD) { + ib_mr->pd = pd; + mr->mmkey.pd = to_mpd(pd)->pdn; + } + + if (flags & IB_MR_REREG_ACCESS) + mr->access_flags = access_flags; + + if (flags & IB_MR_REREG_TRANS) { + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + set_mr_fileds(dev, mr, npages, len, access_flags); + mr->mmkey.iova = addr; + mr->mmkey.size = len; + } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif + + return 0; +} + static int mlx5_alloc_priv_descs(struct ib_device *device, struct mlx5_ib_mr *mr, @@ -1236,7 +1457,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmr.key, err); + mr->mmkey.key, err); return err; } } else { @@ -1300,8 +1521,8 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_create_mkey_mbox_in *in; struct mlx5_ib_mr *mr; - int access_mode, err; - int ndescs = roundup(max_num_sg, 4); + int ndescs = ALIGN(max_num_sg, 4); + int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) @@ -1319,7 +1540,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); if (mr_type == IB_MR_TYPE_MEM_REG) { - access_mode = MLX5_ACCESS_MODE_MTT; + mr->access_mode = MLX5_ACCESS_MODE_MTT; in->seg.log2_page_size = PAGE_SHIFT; err = mlx5_alloc_priv_descs(pd->device, mr, @@ -1329,6 +1550,15 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->desc_size = sizeof(u64); mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_ACCESS_MODE_KLM; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SIGNATURE) { u32 psv_index[2]; @@ -1347,7 +1577,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_free_sig; - access_mode = MLX5_ACCESS_MODE_KLM; + mr->access_mode = MLX5_ACCESS_MODE_KLM; mr->sig->psv_memory.psv_idx = psv_index[0]; mr->sig->psv_wire.psv_idx = psv_index[1]; @@ -1361,14 +1591,14 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, goto err_free_in; } - in->seg.flags = MLX5_PERM_UMR_EN | access_mode; - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), + in->seg.flags = MLX5_PERM_UMR_EN | mr->access_mode; + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, sizeof(*in), NULL, NULL, NULL); if (err) goto err_destroy_psv; - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; kfree(in); @@ -1395,6 +1625,88 @@ err_free: return ERR_PTR(err); } +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_create_mkey_mbox_in *in = NULL; + struct mlx5_ib_mw *mw = NULL; + int ndescs; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + in->seg.status = MLX5_MKEY_STATUS_FREE; + in->seg.xlt_oct_size = cpu_to_be32(ndescs); + in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); + in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_KLM | + MLX5_PERM_LOCAL_READ; + if (type == IB_MW_TYPE_2) + in->seg.flags_pd |= cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, sizeof(*in), + NULL, NULL, NULL); + if (err) + goto free; + + mw->ibmw.rkey = mw->mmkey.key; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status) { @@ -1436,6 +1748,32 @@ done: return ret; } +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.length = 0; + mr->ndescs = sg_nents; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i > mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg)); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg); + } + + return i; +} + static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); @@ -1463,7 +1801,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, mr->desc_size * mr->max_descs, DMA_TO_DEVICE); - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b8d76361a48d..34e79e709c67 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -142,13 +142,13 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, u32 key) { u32 base_key = mlx5_base_mkey(key); - struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); - struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); + struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); + struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!mmr || mmr->key != key || !mr->live) + if (!mmkey || mmkey->key != key || !mr->live) return NULL; - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, @@ -232,7 +232,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, io_virt += pfault->mpfault.bytes_committed; bcnt -= pfault->mpfault.bytes_committed; - start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; + start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; if (mr->umem->writable) access_mask |= ODP_WRITE_ALLOWED_BIT; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 34cb8e87c7b8..8dee8bc1e0fe 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -58,6 +58,7 @@ enum { static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, @@ -72,6 +73,9 @@ static const u32 mlx5_ib_opcode[] = { [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; static int is_qp0(enum ib_qp_type qp_type) { @@ -260,11 +264,11 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, return 0; } -static int sq_overhead(enum ib_qp_type qp_type) +static int sq_overhead(struct ib_qp_init_attr *attr) { int size = 0; - switch (qp_type) { + switch (attr->qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ @@ -287,8 +291,12 @@ static int sq_overhead(enum ib_qp_type qp_type) break; case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; @@ -311,7 +319,7 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) int inl_size = 0; int size; - size = sq_overhead(attr->qp_type); + size = sq_overhead(attr); if (size < 0) return size; @@ -348,8 +356,8 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, return -EINVAL; } - qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - - sizeof(struct mlx5_wqe_inline_seg); + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) @@ -590,7 +598,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; - case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -783,7 +791,10 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, int err; uuari = &dev->mdev->priv.uuari; - if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO | + mlx5_ib_create_qp_sqpn_qp1())) return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) @@ -828,6 +839,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); + if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) { + (*in)->ctx.deth_sqpn = cpu_to_be32(1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + mlx5_fill_page_array(&qp->buf, (*in)->pas); err = mlx5_db_alloc(dev->mdev, &qp->db); @@ -1228,6 +1244,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) qp->flags |= MLX5_IB_QP_MANAGED_RECV; } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -1271,6 +1295,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); + return -EINVAL; + } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, base); if (err) @@ -1385,6 +1414,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* 0xffffff means we ask to work with cqe version 0 */ MLX5_SET(qpc, qpc, user_index, uidx); } + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; + } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; @@ -1494,7 +1530,7 @@ static void get_cqs(struct mlx5_ib_qp *qp, break; case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1657,7 +1693,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_UC: case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: case MLX5_IB_QPT_REG_UMR: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) @@ -1686,6 +1722,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, break; + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: case IB_QPT_MAX: @@ -1704,6 +1743,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -2161,8 +2203,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); - if (err < 0) + if (err < 0) { + mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; + } context->flags = cpu_to_be32(err << 16); @@ -2182,7 +2226,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { @@ -2284,6 +2328,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -2363,11 +2409,18 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -2378,32 +2431,46 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } - if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - ll)) + if (qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); goto out; + } if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= - dev->mdev->port_caps[port - 1].pkey_table_len) + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); goto out; + } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); goto out; + } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; @@ -2442,6 +2509,59 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr_sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr_start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { @@ -2509,6 +2629,11 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, int ndescs = mr->ndescs; memset(umr, 0, sizeof(*umr)); + + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + umr->flags = MLX5_UMR_CHECK_NOT_FREE; umr->klm_octowords = get_klm_octo(ndescs); umr->mkey_mask = frwr_mkey_mask(); @@ -2558,6 +2683,44 @@ static __be64 get_umr_update_mtt_mask(void) return cpu_to_be64(result); } +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, struct ib_send_wr *wr) { @@ -2576,9 +2739,15 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = get_umr_update_mtt_mask(); umr->bsf_octowords = get_klm_octo(umrwr->target.offset); umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } else { - umr->mkey_mask = get_umr_reg_mr_mask(); } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) + umr->mkey_mask |= get_umr_update_access_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) + umr->mkey_mask |= get_umr_update_pd_mask(); + if (!umr->mkey_mask) + umr->mkey_mask = get_umr_reg_mr_mask(); } else { umr->mkey_mask = get_umr_unreg_mr_mask(); } @@ -2603,13 +2772,19 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, int ndescs = ALIGN(mr->ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); - seg->flags = get_umr_flags(access) | MLX5_ACCESS_MODE_MTT; + + if (mr->access_mode == MLX5_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); seg->start_addr = cpu_to_be64(mr->ibmr.iova); seg->len = cpu_to_be64(mr->ibmr.length); seg->xlt_oct_size = cpu_to_be32(ndescs); - seg->log2_page_size = ilog2(mr->ibmr.page_size); } static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) @@ -2630,7 +2805,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->flags = convert_access(umrwr->access_flags); if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); } seg->len = cpu_to_be64(umrwr->length); @@ -3196,13 +3372,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf = qp->bf; + struct mlx5_bf *bf; int uninitialized_var(size); - void *qend = qp->sq.qend; + void *qend; unsigned long flags; unsigned idx; int err = 0; @@ -3214,6 +3390,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = qp->bf; + qend = qp->sq.qend; + spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -3373,16 +3556,37 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; case MLX5_IB_QPT_REG_UMR: if (wr->opcode != MLX5_IB_WR_UMR) { err = -EINVAL; @@ -3502,6 +3706,9 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int i; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); @@ -3822,6 +4029,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int err = 0; u8 raw_packet_qp_state; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* * Wait for any outstanding page faults, in case the user frees memory @@ -3874,6 +4085,8 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; if (qp->flags & MLX5_IB_QP_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 4659256cd95e..3b2ddd64a371 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -75,7 +75,8 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen) + struct ib_udata *udata, int buf_size, int *inlen, + int is_xrc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd = {}; @@ -87,13 +88,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, int ncont; u32 offset; u32 uidx = MLX5_IB_DEFAULT_UIDX; - int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - if (drv_data < 0) - return -EINVAL; - - ucmdlen = (drv_data < sizeof(ucmd)) ? - drv_data : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); @@ -103,15 +99,17 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; - if (drv_data > sizeof(ucmd) && + if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), - drv_data - sizeof(ucmd))) + udata->inlen - sizeof(ucmd))) return -EINVAL; - err = get_srq_user_index(to_mucontext(pd->uobject->context), - &ucmd, udata->inlen, &uidx); - if (err) - return err; + if (is_xrc) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); @@ -151,7 +149,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); @@ -170,7 +169,7 @@ err_umem: static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen) + int *inlen, int is_xrc) { int err; int i; @@ -224,7 +223,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); /* 0xffffff means we ask to work with cqe version 0 */ @@ -302,10 +302,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, srq->msrq.max_avail_gather); + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, + is_xrc); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, + is_xrc); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -313,7 +317,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, goto err_srq; } - is_xrc = (init_attr->srq_type == IB_SRQT_XRC); in->ctx.state_log_sz = ilog2(srq->msrq.max); flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; xrcdn = 0; diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index b94a55404a59..61bc308bb802 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -152,6 +152,13 @@ struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, struct mlx5_ib_create_qp *ucmd, int inlen, diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig index 846dc97cf260..7964eba8e7ed 100644 --- a/drivers/infiniband/hw/nes/Kconfig +++ b/drivers/infiniband/hw/nes/Kconfig @@ -2,7 +2,6 @@ config INFINIBAND_NES tristate "NetEffect RNIC Driver" depends on PCI && INET && INFINIBAND select LIBCRC32C - select INET_LRO ---help--- This is the RDMA Network Interface Card (RNIC) driver for NetEffect Ethernet Cluster Server Adapters. diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 9f9d5c563a61..35cbb17bec12 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -111,17 +111,6 @@ static struct pci_device_id nes_pci_table[] = { MODULE_DEVICE_TABLE(pci, nes_pci_table); -/* registered nes netlink callbacks */ -static struct ibnl_client_cbs nes_nl_cb_table[] = { - [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, - [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, - [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, - [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, - [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, - [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} -}; - static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *); static int nes_net_event(struct notifier_block *, unsigned long, void *); static int nes_notifiers_registered; @@ -682,17 +671,6 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) } nes_notifiers_registered++; - if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table)) - printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n", - __func__, __LINE__); - - ret = iwpm_init(RDMA_NL_NES); - if (ret) { - printk(KERN_ERR PFX "%s: port mapper initialization failed\n", - pci_name(pcidev)); - goto bail7; - } - INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status); /* Initialize network devices */ @@ -731,7 +709,6 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n", nesdev->netdev_count, nesdev->nesadapter->netdev_count); - ibnl_remove_client(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { @@ -795,8 +772,6 @@ static void nes_remove(struct pci_dev *pcidev) nesdev->nesadapter->netdev_count--; } } - ibnl_remove_client(RDMA_NL_NES); - iwpm_exit(RDMA_NL_NES); nes_notifiers_registered--; if (nes_notifiers_registered == 0) { diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index cb9f0f27308d..7f0aa23aef9d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -482,11 +482,11 @@ static void form_cm_frame(struct sk_buff *skb, iph->ttl = 0x40; iph->protocol = 0x06; /* IPPROTO_TCP */ - iph->saddr = htonl(cm_node->mapped_loc_addr); - iph->daddr = htonl(cm_node->mapped_rem_addr); + iph->saddr = htonl(cm_node->loc_addr); + iph->daddr = htonl(cm_node->rem_addr); - tcph->source = htons(cm_node->mapped_loc_port); - tcph->dest = htons(cm_node->mapped_rem_port); + tcph->source = htons(cm_node->loc_port); + tcph->dest = htons(cm_node->rem_port); tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num); if (flags & SET_ACK) { @@ -525,125 +525,6 @@ static void form_cm_frame(struct sk_buff *skb, cm_packets_created++; } -/* - * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct - */ -static void nes_create_sockaddr(__be32 ip_addr, __be16 port, - struct sockaddr_storage *addr) -{ - struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr; - nes_sockaddr->sin_family = AF_INET; - memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32)); - nes_sockaddr->sin_port = port; -} - -/* - * nes_create_mapinfo - Create a mapinfo object in the port mapper data base - */ -static int nes_create_mapinfo(struct nes_cm_info *cm_info) -{ - struct sockaddr_storage local_sockaddr; - struct sockaddr_storage mapped_sockaddr; - - nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), - &local_sockaddr); - nes_create_sockaddr(htonl(cm_info->mapped_loc_addr), - htons(cm_info->mapped_loc_port), &mapped_sockaddr); - - return iwpm_create_mapinfo(&local_sockaddr, - &mapped_sockaddr, RDMA_NL_NES); -} - -/* - * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base - * and send a remove mapping op message to - * the userspace port mapper - */ -static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port, - u32 mapped_loc_addr, u16 mapped_loc_port) -{ - struct sockaddr_storage local_sockaddr; - struct sockaddr_storage mapped_sockaddr; - - nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr); - nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port), - &mapped_sockaddr); - - iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr); - return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES); -} - -/* - * nes_form_pm_msg - Form a port mapper message with mapping info - */ -static void nes_form_pm_msg(struct nes_cm_info *cm_info, - struct iwpm_sa_data *pm_msg) -{ - nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port), - &pm_msg->loc_addr); - nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port), - &pm_msg->rem_addr); -} - -/* - * nes_form_reg_msg - Form a port mapper message with dev info - */ -static void nes_form_reg_msg(struct nes_vnic *nesvnic, - struct iwpm_dev_data *pm_msg) -{ - memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name, - IWPM_DEVNAME_SIZE); - memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE); -} - -static void record_sockaddr_info(struct sockaddr_storage *addr_info, - nes_addr_t *ip_addr, u16 *port_num) -{ - struct sockaddr_in *in_addr = (struct sockaddr_in *)addr_info; - - if (in_addr->sin_family == AF_INET) { - *ip_addr = ntohl(in_addr->sin_addr.s_addr); - *port_num = ntohs(in_addr->sin_port); - } -} - -/* - * nes_record_pm_msg - Save the received mapping info - */ -static void nes_record_pm_msg(struct nes_cm_info *cm_info, - struct iwpm_sa_data *pm_msg) -{ - record_sockaddr_info(&pm_msg->mapped_loc_addr, - &cm_info->mapped_loc_addr, &cm_info->mapped_loc_port); - - record_sockaddr_info(&pm_msg->mapped_rem_addr, - &cm_info->mapped_rem_addr, &cm_info->mapped_rem_port); -} - -/* - * nes_get_reminfo - Get the address info of the remote connecting peer - */ -static int nes_get_remote_addr(struct nes_cm_node *cm_node) -{ - struct sockaddr_storage mapped_loc_addr, mapped_rem_addr; - struct sockaddr_storage remote_addr; - int ret; - - nes_create_sockaddr(htonl(cm_node->mapped_loc_addr), - htons(cm_node->mapped_loc_port), &mapped_loc_addr); - nes_create_sockaddr(htonl(cm_node->mapped_rem_addr), - htons(cm_node->mapped_rem_port), &mapped_rem_addr); - - ret = iwpm_get_remote_info(&mapped_loc_addr, &mapped_rem_addr, - &remote_addr, RDMA_NL_NES); - if (ret) - nes_debug(NES_DBG_CM, "Unable to find remote peer address info\n"); - else - record_sockaddr_info(&remote_addr, &cm_node->rem_addr, - &cm_node->rem_port); - return ret; -} - /** * print_core - dump a cm core */ @@ -1266,11 +1147,10 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, loc_addr, loc_port, cm_node->rem_addr, cm_node->rem_port, rem_addr, rem_port); - if ((cm_node->mapped_loc_addr == loc_addr) && - (cm_node->mapped_loc_port == loc_port) && - (cm_node->mapped_rem_addr == rem_addr) && - (cm_node->mapped_rem_port == rem_port)) { - + if ((cm_node->loc_addr == loc_addr) && + (cm_node->loc_port == loc_port) && + (cm_node->rem_addr == rem_addr) && + (cm_node->rem_port == rem_port)) { add_ref_cm_node(cm_node); spin_unlock_irqrestore(&cm_core->ht_lock, flags); return cm_node; @@ -1287,8 +1167,8 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core, * find_listener - find a cm node listening on this addr-port pair */ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, - nes_addr_t dst_addr, u16 dst_port, - enum nes_cm_listener_state listener_state, int local) + nes_addr_t dst_addr, u16 dst_port, + enum nes_cm_listener_state listener_state) { unsigned long flags; struct nes_cm_listener *listen_node; @@ -1298,13 +1178,9 @@ static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core, /* walk list and find cm_node associated with this session ID */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_list.list, list) { - if (local) { - listen_addr = listen_node->loc_addr; - listen_port = listen_node->loc_port; - } else { - listen_addr = listen_node->mapped_loc_addr; - listen_port = listen_node->mapped_loc_port; - } + listen_addr = listen_node->loc_addr; + listen_port = listen_node->loc_port; + /* compare node pair, return node handle if a match */ if (((listen_addr == dst_addr) || listen_addr == 0x00000000) && @@ -1443,17 +1319,13 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core, if (listener->nesvnic) { nes_manage_apbvt(listener->nesvnic, - listener->mapped_loc_port, + listener->loc_port, PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); - nes_remove_mapinfo(listener->loc_addr, - listener->loc_port, - listener->mapped_loc_addr, - listener->mapped_loc_port); nes_debug(NES_DBG_NLMSG, - "Delete APBVT mapped_loc_port = %04X\n", - listener->mapped_loc_port); + "Delete APBVT loc_port = %04X\n", + listener->loc_port); } nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener); @@ -1602,11 +1474,6 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->rem_addr = cm_info->rem_addr; cm_node->rem_port = cm_info->rem_port; - cm_node->mapped_loc_addr = cm_info->mapped_loc_addr; - cm_node->mapped_rem_addr = cm_info->mapped_rem_addr; - cm_node->mapped_loc_port = cm_info->mapped_loc_port; - cm_node->mapped_rem_port = cm_info->mapped_rem_port; - cm_node->mpa_frame_rev = mpa_version; cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO; cm_node->mpav2_ird_ord = 0; @@ -1655,10 +1522,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->loopbackpartner = NULL; /* get the mac addr for the remote node */ - oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr, - NULL, NES_ARP_RESOLVE); - arpindex = nes_addr_resolve_neigh(nesvnic, - cm_node->mapped_rem_addr, oldarpindex); + oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr, + NULL, NES_ARP_RESOLVE); + arpindex = nes_addr_resolve_neigh(nesvnic, cm_node->rem_addr, + oldarpindex); if (arpindex < 0) { kfree(cm_node); return NULL; @@ -1720,14 +1587,12 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core, mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0); } else { if (cm_node->apbvt_set && cm_node->nesvnic) { - nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port, + nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port, PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); } - nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n", - cm_node->mapped_loc_port); - nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port, - cm_node->mapped_loc_addr, cm_node->mapped_loc_port); + nes_debug(NES_DBG_NLMSG, "Delete APBVT loc_port = %04X\n", + cm_node->loc_port); } atomic_dec(&cm_core->node_cnt); @@ -2184,7 +2049,6 @@ static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, cm_node->state = NES_CM_STATE_ESTABLISHED; if (datasize) { cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize; - nes_get_remote_addr(cm_node); handle_rcv_mpa(cm_node, skb); } else { /* rcvd ACK only */ dev_kfree_skb_any(skb); @@ -2399,17 +2263,14 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, struct nes_vnic *nesvnic, struct nes_cm_info *cm_info) { struct nes_cm_listener *listener; - struct iwpm_dev_data pm_reg_msg; - struct iwpm_sa_data pm_msg; unsigned long flags; - int iwpm_err = 0; nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n", cm_info->loc_addr, cm_info->loc_port); /* cannot have multiple matching listeners */ listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port, - NES_CM_LISTENER_EITHER_STATE, 1); + NES_CM_LISTENER_EITHER_STATE); if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) { /* find automatically incs ref count ??? */ @@ -2419,22 +2280,6 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, } if (!listener) { - nes_form_reg_msg(nesvnic, &pm_reg_msg); - iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); - if (iwpm_err) { - nes_debug(NES_DBG_NLMSG, - "Port Mapper reg pid fail (err = %d).\n", iwpm_err); - } - if (iwpm_valid_pid() && !iwpm_err) { - nes_form_pm_msg(cm_info, &pm_msg); - iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES); - if (iwpm_err) - nes_debug(NES_DBG_NLMSG, - "Port Mapper query fail (err = %d).\n", iwpm_err); - else - nes_record_pm_msg(cm_info, &pm_msg); - } - /* create a CM listen node (1/2 node to compare incoming traffic to) */ listener = kzalloc(sizeof(*listener), GFP_ATOMIC); if (!listener) { @@ -2444,8 +2289,6 @@ static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core, listener->loc_addr = cm_info->loc_addr; listener->loc_port = cm_info->loc_port; - listener->mapped_loc_addr = cm_info->mapped_loc_addr; - listener->mapped_loc_port = cm_info->mapped_loc_port; listener->reused_node = 0; atomic_set(&listener->ref_count, 1); @@ -2507,18 +2350,18 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (cm_info->loc_addr == cm_info->rem_addr) { loopbackremotelistener = find_listener(cm_core, - cm_node->mapped_loc_addr, cm_node->mapped_rem_port, - NES_CM_LISTENER_ACTIVE_STATE, 0); + cm_node->loc_addr, cm_node->rem_port, + NES_CM_LISTENER_ACTIVE_STATE); if (loopbackremotelistener == NULL) { create_event(cm_node, NES_CM_EVENT_ABORTED); } else { loopback_cm_info = *cm_info; loopback_cm_info.loc_port = cm_info->rem_port; loopback_cm_info.rem_port = cm_info->loc_port; - loopback_cm_info.mapped_loc_port = - cm_info->mapped_rem_port; - loopback_cm_info.mapped_rem_port = - cm_info->mapped_loc_port; + loopback_cm_info.loc_port = + cm_info->rem_port; + loopback_cm_info.rem_port = + cm_info->loc_port; loopback_cm_info.cm_id = loopbackremotelistener->cm_id; loopbackremotenode = make_cm_node(cm_core, nesvnic, &loopback_cm_info, loopbackremotelistener); @@ -2747,12 +2590,6 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, nfo.rem_addr = ntohl(iph->saddr); nfo.rem_port = ntohs(tcph->source); - /* If port mapper is available these should be mapped address info */ - nfo.mapped_loc_addr = ntohl(iph->daddr); - nfo.mapped_loc_port = ntohs(tcph->dest); - nfo.mapped_rem_addr = ntohl(iph->saddr); - nfo.mapped_rem_port = ntohs(tcph->source); - tmp_daddr = cpu_to_be32(iph->daddr); tmp_saddr = cpu_to_be32(iph->saddr); @@ -2761,8 +2598,8 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, do { cm_node = find_node(cm_core, - nfo.mapped_rem_port, nfo.mapped_rem_addr, - nfo.mapped_loc_port, nfo.mapped_loc_addr); + nfo.rem_port, nfo.rem_addr, + nfo.loc_port, nfo.loc_addr); if (!cm_node) { /* Only type of packet accepted are for */ @@ -2771,9 +2608,9 @@ static int mini_cm_recv_pkt(struct nes_cm_core *cm_core, skb_handled = 0; break; } - listener = find_listener(cm_core, nfo.mapped_loc_addr, - nfo.mapped_loc_port, - NES_CM_LISTENER_ACTIVE_STATE, 0); + listener = find_listener(cm_core, nfo.loc_addr, + nfo.loc_port, + NES_CM_LISTENER_ACTIVE_STATE); if (!listener) { nfo.cm_id = NULL; nfo.conn_type = 0; @@ -2856,12 +2693,22 @@ static struct nes_cm_core *nes_cm_alloc_core(void) nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n"); cm_core->event_wq = create_singlethread_workqueue("nesewq"); + if (!cm_core->event_wq) + goto out_free_cmcore; cm_core->post_event = nes_cm_post_event; nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n"); cm_core->disconn_wq = create_singlethread_workqueue("nesdwq"); + if (!cm_core->disconn_wq) + goto out_free_wq; print_core(cm_core); return cm_core; + +out_free_wq: + destroy_workqueue(cm_core->event_wq); +out_free_cmcore: + kfree(cm_core); + return NULL; } @@ -3121,8 +2968,8 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) atomic_inc(&cm_disconnects); cm_event.event = IW_CM_EVENT_DISCONNECT; cm_event.status = disconn_status; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; @@ -3148,8 +2995,8 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) cm_event.event = IW_CM_EVENT_CLOSE; cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; @@ -3240,8 +3087,8 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) u8 *start_ptr = &start_addr; u8 **start_buff = &start_ptr; u16 buff_len = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; ibqp = nes_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3378,11 +3225,11 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_cm_init_tsa_conn(nesqp, cm_node); nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(cm_node->mapped_loc_port); + cpu_to_le16(cm_node->loc_port); nesqp->nesqp_context->tcpPorts[1] = - cpu_to_le16(cm_node->mapped_rem_port); + cpu_to_le16(cm_node->rem_port); - nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3406,9 +3253,9 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) memset(&nes_quad, 0, sizeof(nes_quad)); nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); - nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); - nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + nes_quad.SrcIpadr = htonl(cm_node->rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3437,8 +3284,8 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_event.event = IW_CM_EVENT_ESTABLISHED; cm_event.status = 0; cm_event.provider_data = (void *)nesqp; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; cm_event.ird = cm_node->ird_size; @@ -3508,11 +3355,8 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct nes_cm_node *cm_node; struct nes_cm_info cm_info; int apbvt_set = 0; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; - struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; - struct iwpm_dev_data pm_reg_msg; - struct iwpm_sa_data pm_msg; - int iwpm_err = 0; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; if (cm_id->remote_addr.ss_family != AF_INET) return -ENOSYS; @@ -3558,37 +3402,13 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_info.cm_id = cm_id; cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; - /* No port mapper available, go with the specified peer information */ - cm_info.mapped_loc_addr = cm_info.loc_addr; - cm_info.mapped_loc_port = cm_info.loc_port; - cm_info.mapped_rem_addr = cm_info.rem_addr; - cm_info.mapped_rem_port = cm_info.rem_port; - - nes_form_reg_msg(nesvnic, &pm_reg_msg); - iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES); - if (iwpm_err) { - nes_debug(NES_DBG_NLMSG, - "Port Mapper reg pid fail (err = %d).\n", iwpm_err); - } - if (iwpm_valid_pid() && !iwpm_err) { - nes_form_pm_msg(&cm_info, &pm_msg); - iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES); - if (iwpm_err) - nes_debug(NES_DBG_NLMSG, - "Port Mapper query fail (err = %d).\n", iwpm_err); - else - nes_record_pm_msg(&cm_info, &pm_msg); - } - if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) { - nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, - PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); + nes_manage_apbvt(nesvnic, cm_info.loc_port, + PCI_FUNC(nesdev->pcidev->devfn), + NES_MANAGE_APBVT_ADD); apbvt_set = 1; } - if (nes_create_mapinfo(&cm_info)) - return -ENOMEM; - cm_id->add_ref(cm_id); /* create a connect CM node connection */ @@ -3597,14 +3417,12 @@ int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (!cm_node) { if (apbvt_set) - nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port, + nes_manage_apbvt(nesvnic, cm_info.loc_port, PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_DEL); - nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n", - cm_info.mapped_loc_port); - nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port, - cm_info.mapped_loc_addr, cm_info.mapped_loc_port); + nes_debug(NES_DBG_NLMSG, "Delete loc_port = %04X\n", + cm_info.loc_port); cm_id->rem_ref(cm_id); return -ENOMEM; } @@ -3633,12 +3451,12 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) struct nes_cm_listener *cm_node; struct nes_cm_info cm_info; int err; - struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr; nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n", cm_id, ntohs(laddr->sin_port)); - if (cm_id->local_addr.ss_family != AF_INET) + if (cm_id->m_local_addr.ss_family != AF_INET) return -ENOSYS; nesvnic = to_nesvnic(cm_id->device); if (!nesvnic) @@ -3658,10 +3476,6 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_info.conn_type = NES_CM_IWARP_CONN_TYPE; - /* No port mapper available, go with the specified info */ - cm_info.mapped_loc_addr = cm_info.loc_addr; - cm_info.mapped_loc_port = cm_info.loc_port; - cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info); if (!cm_node) { printk(KERN_ERR "%s[%u] Error returned from listen API call\n", @@ -3673,10 +3487,7 @@ int nes_create_listen(struct iw_cm_id *cm_id, int backlog) cm_node->tos = cm_id->tos; if (!cm_node->reused_node) { - if (nes_create_mapinfo(&cm_info)) - return -ENOMEM; - - err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port, + err = nes_manage_apbvt(nesvnic, cm_node->loc_port, PCI_FUNC(nesvnic->nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD); if (err) { @@ -3786,8 +3597,8 @@ static void cm_event_connected(struct nes_cm_event *event) nesvnic = to_nesvnic(nesqp->ibqp.device); nesdev = nesvnic->nesdev; nesadapter = nesdev->nesadapter; - laddr = (struct sockaddr_in *)&cm_id->local_addr; - raddr = (struct sockaddr_in *)&cm_id->remote_addr; + laddr = (struct sockaddr_in *)&cm_id->m_local_addr; + raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr; if (nesqp->destroyed) @@ -3802,10 +3613,10 @@ static void cm_event_connected(struct nes_cm_event *event) /* set the QP tsa context */ nesqp->nesqp_context->tcpPorts[0] = - cpu_to_le16(cm_node->mapped_loc_port); + cpu_to_le16(cm_node->loc_port); nesqp->nesqp_context->tcpPorts[1] = - cpu_to_le16(cm_node->mapped_rem_port); - nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr); + cpu_to_le16(cm_node->rem_port); + nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr); nesqp->nesqp_context->misc2 |= cpu_to_le32( (u32)PCI_FUNC(nesdev->pcidev->devfn) << @@ -3835,9 +3646,9 @@ static void cm_event_connected(struct nes_cm_event *event) nes_quad.DstIpAdrIndex = cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24); - nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr); - nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port); - nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port); + nes_quad.SrcIpadr = htonl(cm_node->rem_addr); + nes_quad.TcpPorts[0] = htons(cm_node->rem_port); + nes_quad.TcpPorts[1] = htons(cm_node->loc_port); /* Produce hash key */ crc_value = get_crc_value(&nes_quad); @@ -3858,14 +3669,14 @@ static void cm_event_connected(struct nes_cm_event *event) cm_event.provider_data = cm_id->provider_data; cm_event_laddr->sin_family = AF_INET; cm_event_laddr->sin_port = laddr->sin_port; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = (void *)event->cm_node->mpa_frame_buf; cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size; cm_event.ird = cm_node->ird_size; cm_event.ord = cm_node->ord_size; - cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr); + cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr); ret = cm_id->event_handler(cm_id, &cm_event); nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret); @@ -3913,8 +3724,8 @@ static void cm_event_connect_error(struct nes_cm_event *event) cm_event.event = IW_CM_EVENT_CONNECT_REPLY; cm_event.status = -ECONNRESET; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; @@ -3970,8 +3781,8 @@ static void cm_event_reset(struct nes_cm_event *event) cm_event.event = IW_CM_EVENT_DISCONNECT; cm_event.status = -ECONNRESET; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; @@ -3981,8 +3792,8 @@ static void cm_event_reset(struct nes_cm_event *event) cm_event.event = IW_CM_EVENT_CLOSE; cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; - cm_event.local_addr = cm_id->local_addr; - cm_event.remote_addr = cm_id->remote_addr; + cm_event.local_addr = cm_id->m_local_addr; + cm_event.remote_addr = cm_id->m_remote_addr; cm_event.private_data = NULL; cm_event.private_data_len = 0; nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 147c2c884227..d827d03e3941 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -293,8 +293,8 @@ struct nes_cm_listener { struct list_head list; struct nes_cm_core *cm_core; u8 loc_mac[ETH_ALEN]; - nes_addr_t loc_addr, mapped_loc_addr; - u16 loc_port, mapped_loc_port; + nes_addr_t loc_addr; + u16 loc_port; struct iw_cm_id *cm_id; enum nes_cm_conn_type conn_type; atomic_t ref_count; @@ -309,9 +309,7 @@ struct nes_cm_listener { /* per connection node and node state information */ struct nes_cm_node { nes_addr_t loc_addr, rem_addr; - nes_addr_t mapped_loc_addr, mapped_rem_addr; u16 loc_port, rem_port; - u16 mapped_loc_port, mapped_rem_port; u8 loc_mac[ETH_ALEN]; u8 rem_mac[ETH_ALEN]; @@ -368,11 +366,6 @@ struct nes_cm_info { u16 rem_port; nes_addr_t loc_addr; nes_addr_t rem_addr; - u16 mapped_loc_port; - u16 mapped_rem_port; - nes_addr_t mapped_loc_addr; - nes_addr_t mapped_rem_addr; - enum nes_cm_conn_type conn_type; int backlog; }; diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 4713dd7ed764..a1c6481d8038 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -35,18 +35,11 @@ #include <linux/moduleparam.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> -#include <linux/ip.h> -#include <linux/tcp.h> #include <linux/if_vlan.h> -#include <linux/inet_lro.h> #include <linux/slab.h> #include "nes.h" -static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR; -module_param(nes_lro_max_aggr, uint, 0444); -MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation"); - static int wide_ppm_offset; module_param(wide_ppm_offset, int, 0644); MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm"); @@ -1642,25 +1635,6 @@ static void nes_rq_wqes_timeout(unsigned long parm) } -static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr, - void **tcph, u64 *hdr_flags, void *priv) -{ - unsigned int ip_len; - struct iphdr *iph; - skb_reset_network_header(skb); - iph = ip_hdr(skb); - if (iph->protocol != IPPROTO_TCP) - return -1; - ip_len = ip_hdrlen(skb); - skb_set_transport_header(skb, ip_len); - *tcph = tcp_hdr(skb); - - *hdr_flags = LRO_IPV4 | LRO_TCP; - *iphdr = iph; - return 0; -} - - /** * nes_init_nic_qp */ @@ -1895,14 +1869,6 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) return -ENOMEM; } - nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr; - nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS; - nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc; - nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr; - nesvnic->lro_mgr.features = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID; - nesvnic->lro_mgr.dev = netdev; - nesvnic->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY; - nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; return 0; } @@ -2809,13 +2775,10 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) u16 pkt_type; u16 rqes_processed = 0; u8 sq_cqes = 0; - u8 nes_use_lro = 0; head = cq->cq_head; cq_size = cq->cq_size; cq->cqes_pending = 1; - if (nesvnic->netdev->features & NETIF_F_LRO) - nes_use_lro = 1; do { if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & NES_NIC_CQE_VALID) { @@ -2950,10 +2913,7 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag); } - if (nes_use_lro) - lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); - else - netif_receive_skb(rx_skb); + napi_gro_receive(&nesvnic->napi, rx_skb); skip_rx_indicate0: ; @@ -2984,8 +2944,6 @@ skip_rx_indicate0: } while (1); - if (nes_use_lro) - lro_flush_all(&nesvnic->lro_mgr); if (sq_cqes) { barrier(); /* restart the queue if it had been stopped */ diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index c9080208aad2..1b66ef1e9937 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -33,8 +33,6 @@ #ifndef __NES_HW_H #define __NES_HW_H -#include <linux/inet_lro.h> - #define NES_PHY_TYPE_CX4 1 #define NES_PHY_TYPE_1G 2 #define NES_PHY_TYPE_ARGUS 4 @@ -1049,8 +1047,6 @@ struct nes_hw_tune_timer { #define NES_TIMER_ENABLE_LIMIT 4 #define NES_MAX_LINK_INTERRUPTS 128 #define NES_MAX_LINK_CHECK 200 -#define NES_MAX_LRO_DESCRIPTORS 32 -#define NES_LRO_MAX_AGGR 64 struct nes_adapter { u64 fw_ver; @@ -1263,9 +1259,6 @@ struct nes_vnic { u8 next_qp_nic_index; u8 of_device_registered; u8 rdma_enabled; - u32 lro_max_aggr; - struct net_lro_mgr lro_mgr; - struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS]; struct timer_list event_timer; enum ib_event_type delayed_event; enum ib_event_type last_dispatched_event; diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 6a0bdfa0ce2e..3ea9e055fdd3 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1085,9 +1085,6 @@ static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { "Free 4Kpbls", "Free 256pbls", "Timer Inits", - "LRO aggregated", - "LRO flushed", - "LRO no_desc", "PAU CreateQPs", "PAU DestroyQPs", }; @@ -1302,9 +1299,6 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev, target_stat_values[++index] = nesadapter->free_4kpbl; target_stat_values[++index] = nesadapter->free_256pbl; target_stat_values[++index] = int_mod_timer_init; - target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated; - target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed; - target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc; target_stat_values[++index] = atomic_read(&pau_qps_created); target_stat_values[++index] = atomic_read(&pau_qps_destroyed); } @@ -1709,7 +1703,6 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev, netdev->hw_features |= NETIF_F_TSO; netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX; - netdev->hw_features |= NETIF_F_LRO; nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d," " nic_index = %d, logical_port = %d, mac_index = %d.\n", diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 8c4daf7f22ec..fba69a39a7eb 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -56,7 +56,8 @@ static int nes_dereg_mr(struct ib_mr *ib_mr); /** * nes_alloc_mw */ -static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type) +static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, + struct ib_udata *udata) { struct nes_pd *nespd = to_nespd(ibpd); struct nes_vnic *nesvnic = to_nesvnic(ibpd->device); @@ -3768,6 +3769,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.iwcm->create_listen = nes_create_listen; nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen; nesibdev->ibdev.get_port_immutable = nes_port_immutable; + memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name, + sizeof(nesibdev->ibdev.iwcm->ifname)); return nesibdev; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 12503f15fbd6..45bdfa0e3b2b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -114,6 +114,7 @@ struct ocrdma_dev_attr { u8 local_ca_ack_delay; u8 ird; u8 num_ird_pages; + u8 udp_encap; }; struct ocrdma_dma_mem { @@ -356,6 +357,7 @@ struct ocrdma_ah { struct ocrdma_av *av; u16 sgid_index; u32 id; + u8 hdr_type; }; struct ocrdma_qp_hwq_info { @@ -598,4 +600,10 @@ static inline u8 ocrdma_get_ae_link_state(u32 ae_state) return ((ae_state & OCRDMA_AE_LSC_LS_MASK) >> OCRDMA_AE_LSC_LS_SHIFT); } +static inline bool ocrdma_is_udp_encap_supported(struct ocrdma_dev *dev) +{ + return (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV4) || + (dev->attr.udp_encap & OCRDMA_L3_TYPE_IPV6); +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 3790771f2baa..797362a297b2 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -55,18 +55,46 @@ #define OCRDMA_VID_PCP_SHIFT 0xD +static u16 ocrdma_hdr_type_to_proto_num(int devid, u8 hdr_type) +{ + switch (hdr_type) { + case OCRDMA_L3_TYPE_IB_GRH: + return (u16)0x8915; + case OCRDMA_L3_TYPE_IPV4: + return (u16)0x0800; + case OCRDMA_L3_TYPE_IPV6: + return (u16)0x86dd; + default: + pr_err("ocrdma%d: Invalid network header\n", devid); + return 0; + } +} + static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, union ib_gid *sgid, int pdid, bool *isvlan, u16 vlan_tag) { - int status = 0; + int status; struct ocrdma_eth_vlan eth; struct ocrdma_grh grh; int eth_sz; + u16 proto_num = 0; + u8 nxthdr = 0x11; + struct iphdr ipv4; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; memset(ð, 0, sizeof(eth)); memset(&grh, 0, sizeof(grh)); + /* Protocol Number */ + proto_num = ocrdma_hdr_type_to_proto_num(dev->id, ah->hdr_type); + if (!proto_num) + return -EINVAL; + nxthdr = (proto_num == 0x8915) ? 0x1b : 0x11; /* VLAN */ if (!vlan_tag || (vlan_tag > 0xFFF)) vlan_tag = dev->pvid; @@ -78,13 +106,13 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, dev->id); } eth.eth_type = cpu_to_be16(0x8100); - eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth.roce_eth_type = cpu_to_be16(proto_num); vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; eth.vlan_tag = cpu_to_be16(vlan_tag); eth_sz = sizeof(struct ocrdma_eth_vlan); *isvlan = true; } else { - eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); + eth.eth_type = cpu_to_be16(proto_num); eth_sz = sizeof(struct ocrdma_eth_basic); } /* MAC */ @@ -93,18 +121,33 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, if (status) return status; ah->sgid_index = attr->grh.sgid_index; - memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); - memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); - - grh.tclass_flow = cpu_to_be32((6 << 28) | - (attr->grh.traffic_class << 24) | - attr->grh.flow_label); - /* 0x1b is next header value in GRH */ - grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | - (0x1b << 8) | attr->grh.hop_limit); /* Eth HDR */ memcpy(&ah->av->eth_hdr, ð, eth_sz); - memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + if (ah->hdr_type == RDMA_NETWORK_IPV4) { + *((__be16 *)&ipv4) = htons((4 << 12) | (5 << 8) | + attr->grh.traffic_class); + ipv4.id = cpu_to_be16(pdid); + ipv4.frag_off = htons(IP_DF); + ipv4.tot_len = htons(0); + ipv4.ttl = attr->grh.hop_limit; + ipv4.protocol = nxthdr; + rdma_gid2ip(&sgid_addr._sockaddr, sgid); + ipv4.saddr = sgid_addr._sockaddr_in.sin_addr.s_addr; + rdma_gid2ip(&dgid_addr._sockaddr, &attr->grh.dgid); + ipv4.daddr = dgid_addr._sockaddr_in.sin_addr.s_addr; + memcpy((u8 *)ah->av + eth_sz, &ipv4, sizeof(struct iphdr)); + } else { + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + grh.tclass_flow = cpu_to_be32((6 << 28) | + (attr->grh.traffic_class << 24) | + attr->grh.flow_label); + memcpy(&grh.dgid[0], attr->grh.dgid.raw, + sizeof(attr->grh.dgid.raw)); + grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | + (nxthdr << 8) | + attr->grh.hop_limit); + memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); + } if (*isvlan) ah->av->valid |= OCRDMA_AV_VLAN_VALID; ah->av->valid = cpu_to_le32(ah->av->valid); @@ -128,6 +171,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if (atomic_cmpxchg(&dev->update_sl, 1, 0)) ocrdma_init_service_level(dev); + ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); @@ -148,6 +192,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); dev_put(sgid_attr.ndev); } + /* Get network header type for this GID */ + ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); if ((pd->uctx) && (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) && @@ -172,6 +218,11 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) ahid_addr = pd->uctx->ah_tbl.va + attr->dlid; *ahid_addr = 0; *ahid_addr |= ah->id & OCRDMA_AH_ID_MASK; + if (ocrdma_is_udp_encap_supported(dev)) { + *ahid_addr |= ((u32)ah->hdr_type & + OCRDMA_AH_L3_TYPE_MASK) << + OCRDMA_AH_L3_TYPE_SHIFT; + } if (isvlan) *ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK << OCRDMA_AH_VLAN_VALID_SHIFT); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h index 04a30ae67473..3856dd4c7e3d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -46,9 +46,10 @@ enum { OCRDMA_AH_ID_MASK = 0x3FF, OCRDMA_AH_VLAN_VALID_MASK = 0x01, - OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F + OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F, + OCRDMA_AH_L3_TYPE_MASK = 0x03, + OCRDMA_AH_L3_TYPE_SHIFT = 0x1D /* 29 bits */ }; - struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *); int ocrdma_destroy_ah(struct ib_ah *); int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 283ca842ff74..16740dcb876b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1113,7 +1113,7 @@ mbx_err: static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe, void *payload_va) { - int status = 0; + int status; struct ocrdma_mbx_rsp *rsp = payload_va; if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >> @@ -1144,6 +1144,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_pd = (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT; + attr->udp_encap = (rsp->max_pd_ca_ack_delay & + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK) >> + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT; attr->max_dpp_pds = (rsp->max_dpp_pds_credits & OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK) >> OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET; @@ -2138,7 +2141,6 @@ int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, enum ib_qp_state *old_ib_state) { unsigned long flags; - int status = 0; enum ocrdma_qp_state new_state; new_state = get_ocrdma_qp_state(new_ib_state); @@ -2163,7 +2165,7 @@ int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state, qp->state = new_state; spin_unlock_irqrestore(&qp->q_lock, flags); - return status; + return 0; } static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp) @@ -2501,7 +2503,12 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, union ib_gid sgid, zgid; struct ib_gid_attr sgid_attr; u32 vlan_id = 0xFFFF; - u8 mac_addr[6]; + u8 mac_addr[6], hdr_type; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device); if ((ah_attr->ah_flags & IB_AH_GRH) == 0) @@ -2516,6 +2523,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.hop_lmt_rq_psn |= (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT); cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID; + + /* GIDs */ memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0], sizeof(cmd->params.dgid)); @@ -2538,6 +2547,16 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, return status; cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | (mac_addr[2] << 16) | (mac_addr[3] << 24); + + hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + if (hdr_type == RDMA_NETWORK_IPV4) { + rdma_gid2ip(&sgid_addr._sockaddr, &sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &ah_attr->grh.dgid); + memcpy(&cmd->params.dgid[0], + &dgid_addr._sockaddr_in.sin_addr.s_addr, 4); + memcpy(&cmd->params.sgid[0], + &sgid_addr._sockaddr_in.sin_addr.s_addr, 4); + } /* convert them to LE format. */ ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid)); ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); @@ -2558,7 +2577,9 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.rnt_rc_sl_fl |= (dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; } - + cmd->params.max_sge_recv_flags |= ((hdr_type << + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT) & + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK); return 0; } @@ -2871,7 +2892,7 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, struct ocrdma_dcbx_cfg *dcbxcfg) { - int status = 0; + int status; dma_addr_t pa; struct ocrdma_mqe cmd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index f38743018cb4..3d75f65ce87e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -89,8 +89,10 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; + struct ocrdma_dev *dev; int err; + dev = get_ocrdma_dev(ibdev); err = ocrdma_query_port(ibdev, port_num, &attr); if (err) return err; @@ -98,6 +100,8 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (ocrdma_is_udp_encap_supported(dev)) + immutable->core_cap_flags |= RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 99dd6fdf06d7..0efc9662c6d8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -140,7 +140,11 @@ enum { OCRDMA_DB_RQ_SHIFT = 24 }; -#define OCRDMA_ROUDP_FLAGS_SHIFT 0x03 +enum { + OCRDMA_L3_TYPE_IB_GRH = 0x00, + OCRDMA_L3_TYPE_IPV4 = 0x01, + OCRDMA_L3_TYPE_IPV6 = 0x02 +}; #define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ #define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid at 12-11 */ @@ -546,7 +550,8 @@ enum { OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8, OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF << OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT, - + OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT = 3, + OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK = 0x18, OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0, OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF, OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16, @@ -1107,6 +1112,8 @@ enum { OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7), OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8), OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9), + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT = 11, + OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK = BIT(11) | BIT(12) | BIT(13), OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16, OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF << OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT, @@ -1735,8 +1742,11 @@ enum { /* w1 */ OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_UD_XFER_LEN_MASK = 0x1FFF, OCRDMA_CQE_PKEY_SHIFT = 0, OCRDMA_CQE_PKEY_MASK = 0xFFFF, + OCRDMA_CQE_UD_L3TYPE_SHIFT = 29, + OCRDMA_CQE_UD_L3TYPE_MASK = 0x07, /* w2 */ OCRDMA_CQE_QPN_SHIFT = 0, @@ -1861,7 +1871,7 @@ struct ocrdma_ewqe_ud_hdr { u32 rsvd_dest_qpn; u32 qkey; u32 rsvd_ahid; - u32 rsvd; + u32 hdr_type; }; /* extended wqe followed by hdr_wqe for Fast Memory register */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 255f774080a4..8bef09a8c49f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -610,7 +610,7 @@ static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev) static void ocrdma_update_stats(struct ocrdma_dev *dev) { ulong now = jiffies, secs; - int status = 0; + int status; struct ocrdma_rdma_stats_resp *rdma_stats = (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va; struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats; @@ -641,7 +641,7 @@ static ssize_t ocrdma_dbgfs_ops_write(struct file *filp, { char tmp_str[32]; long reset; - int status = 0; + int status; struct ocrdma_stats *pstats = filp->private_data; struct ocrdma_dev *dev = pstats->dev; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 12420e4ecf3d..a8496a18e20d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -419,7 +419,7 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ib_udata *udata) { struct ocrdma_pd *pd = NULL; - int status = 0; + int status; pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) @@ -468,7 +468,7 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { - int status = 0; + int status; if (dev->pd_mgr->pd_prealloc_valid) status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); @@ -596,7 +596,7 @@ map_err: int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) { - int status = 0; + int status; struct ocrdma_mm *mm, *tmp; struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); @@ -623,7 +623,7 @@ int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT; u64 unmapped_db = (u64) dev->nic_info.unmapped_db; unsigned long len = (vma->vm_end - vma->vm_start); - int status = 0; + int status; bool found; if (vma->vm_start & (PAGE_SIZE - 1)) @@ -1285,7 +1285,7 @@ static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp, struct ib_udata *udata, int dpp_offset, int dpp_credit_lmt, int srq) { - int status = 0; + int status; u64 usr_db; struct ocrdma_create_qp_uresp uresp; struct ocrdma_pd *pd = qp->pd; @@ -1494,9 +1494,7 @@ int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, */ if (status < 0) return status; - status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); - - return status; + return ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask); } int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, @@ -1949,7 +1947,7 @@ int ocrdma_modify_srq(struct ib_srq *ibsrq, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata) { - int status = 0; + int status; struct ocrdma_srq *srq; srq = get_ocrdma_srq(ibsrq); @@ -2005,6 +2003,7 @@ static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, else ud_hdr->qkey = ud_wr(wr)->remote_qkey; ud_hdr->rsvd_ahid = ah->id; + ud_hdr->hdr_type = ah->hdr_type; if (ah->av->valid & OCRDMA_AV_VLAN_VALID) hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT); } @@ -2717,9 +2716,11 @@ static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, return expand; } -static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) +static int ocrdma_update_ud_rcqe(struct ocrdma_dev *dev, struct ib_wc *ibwc, + struct ocrdma_cqe *cqe) { int status; + u16 hdr_type = 0; status = (le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT; @@ -2728,7 +2729,17 @@ static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe) ibwc->pkey_index = 0; ibwc->wc_flags = IB_WC_GRH; ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >> - OCRDMA_CQE_UD_XFER_LEN_SHIFT); + OCRDMA_CQE_UD_XFER_LEN_SHIFT) & + OCRDMA_CQE_UD_XFER_LEN_MASK; + + if (ocrdma_is_udp_encap_supported(dev)) { + hdr_type = (le32_to_cpu(cqe->ud.rxlen_pkey) >> + OCRDMA_CQE_UD_L3TYPE_SHIFT) & + OCRDMA_CQE_UD_L3TYPE_MASK; + ibwc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + ibwc->network_hdr_type = hdr_type; + } + return status; } @@ -2791,12 +2802,15 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, struct ib_wc *ibwc) { + struct ocrdma_dev *dev; + + dev = get_ocrdma_dev(qp->ibqp.device); ibwc->opcode = IB_WC_RECV; ibwc->qp = &qp->ibqp; ibwc->status = IB_WC_SUCCESS; if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) - ocrdma_update_ud_rcqe(ibwc, cqe); + ocrdma_update_ud_rcqe(dev, ibwc, cqe); else ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen); diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig index 495be09781b1..e0fdb9201423 100644 --- a/drivers/infiniband/hw/qib/Kconfig +++ b/drivers/infiniband/hw/qib/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_QIB tristate "Intel PCIe HCA support" - depends on 64BIT + depends on 64BIT && INFINIBAND_RDMAVT ---help--- This is a low-level driver for Intel PCIe QLE InfiniBand host channel adapters. This driver does not support the Intel diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile index 57f8103e51f8..79ebd79e8405 100644 --- a/drivers/infiniband/hw/qib/Makefile +++ b/drivers/infiniband/hw/qib/Makefile @@ -1,11 +1,11 @@ obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o -ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \ - qib_file_ops.o qib_fs.o qib_init.o qib_intr.o qib_keys.o \ - qib_mad.o qib_mmap.o qib_mr.o qib_pcie.o qib_pio_copy.o \ - qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \ +ib_qib-y := qib_diag.o qib_driver.o qib_eeprom.o \ + qib_file_ops.o qib_fs.o qib_init.o qib_intr.o \ + qib_mad.o qib_pcie.o qib_pio_copy.o \ + qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o \ qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \ - qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \ + qib_user_pages.o qib_user_sdma.o qib_iba7220.o \ qib_sd7220.o qib_iba7322.o qib_verbs.o # 6120 has no fallback if no MSI interrupts, others can do INTx diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 7df16f74bb45..bbf0a163aeab 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -52,6 +52,7 @@ #include <linux/kref.h> #include <linux/sched.h> #include <linux/kthread.h> +#include <rdma/rdma_vt.h> #include "qib_common.h" #include "qib_verbs.h" @@ -229,9 +230,6 @@ struct qib_ctxtdata { u8 redirect_seq_cnt; /* ctxt rcvhdrq head offset */ u32 head; - /* lookaside fields */ - struct qib_qp *lookaside_qp; - u32 lookaside_qpn; /* QPs waiting for context processing */ struct list_head qp_wait_list; #ifdef CONFIG_DEBUG_FS @@ -240,7 +238,7 @@ struct qib_ctxtdata { #endif }; -struct qib_sge_state; +struct rvt_sge_state; struct qib_sdma_txreq { int flags; @@ -258,14 +256,14 @@ struct qib_sdma_desc { struct qib_verbs_txreq { struct qib_sdma_txreq txreq; - struct qib_qp *qp; - struct qib_swqe *wqe; + struct rvt_qp *qp; + struct rvt_swqe *wqe; u32 dwords; u16 hdr_dwords; u16 hdr_inx; struct qib_pio_header *align_buf; - struct qib_mregion *mr; - struct qib_sge_state *ss; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; }; #define QIB_SDMA_TXREQ_F_USELARGEBUF 0x1 @@ -1096,8 +1094,6 @@ struct qib_devdata { u16 psxmitwait_check_rate; /* high volume overflow errors defered to tasklet */ struct tasklet_struct error_tasklet; - /* per device cq worker */ - struct kthread_worker *worker; int assigned_node_id; /* NUMA node closest to HCA */ }; @@ -1135,8 +1131,9 @@ extern spinlock_t qib_devs_lock; extern struct qib_devdata *qib_lookup(int unit); extern u32 qib_cpulist_count; extern unsigned long *qib_cpulist; - +extern u16 qpt_mask; extern unsigned qib_cc_table_size; + int qib_init(struct qib_devdata *, int); int init_chip_wc_pat(struct qib_devdata *dd, u32); int qib_enable_wc(struct qib_devdata *dd); @@ -1323,7 +1320,7 @@ void __qib_sdma_intr(struct qib_pportdata *); void qib_sdma_intr(struct qib_pportdata *); void qib_user_sdma_send_desc(struct qib_pportdata *dd, struct list_head *pktlist); -int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *, +int qib_sdma_verbs_send(struct qib_pportdata *, struct rvt_sge_state *, u32, struct qib_verbs_txreq *); /* ppd->sdma_lock should be locked before calling this. */ int qib_sdma_make_progress(struct qib_pportdata *dd); @@ -1454,6 +1451,8 @@ u64 qib_sps_ints(void); dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, size_t, int); const char *qib_get_unit_name(int unit); +const char *qib_get_card_name(struct rvt_dev_info *rdi); +struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* * Flush write combining store buffers (if present) and perform a write @@ -1540,4 +1539,14 @@ struct qib_hwerror_msgs { void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs, size_t nhwerrmsgs, char *msg, size_t lmsg); + +void qib_stop_send_queue(struct rvt_qp *qp); +void qib_quiesce_qp(struct rvt_qp *qp); +void qib_flush_qp_waiters(struct rvt_qp *qp); +int qib_mtu_to_path_mtu(u32 mtu); +u32 qib_mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +void qib_notify_error_qp(struct rvt_qp *qp); +int qib_get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + #endif /* _QIB_KERNEL_H */ diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h index 4fb78abd8ba1..1d6e63eb1146 100644 --- a/drivers/infiniband/hw/qib/qib_common.h +++ b/drivers/infiniband/hw/qib/qib_common.h @@ -742,14 +742,11 @@ struct qib_tid_session_member { #define SIZE_OF_CRC 1 #define QIB_DEFAULT_P_KEY 0xFFFF -#define QIB_PERMISSIVE_LID 0xFFFF #define QIB_AETH_CREDIT_SHIFT 24 #define QIB_AETH_CREDIT_MASK 0x1F #define QIB_AETH_CREDIT_INVAL 0x1F #define QIB_PSN_MASK 0xFFFFFF #define QIB_MSN_MASK 0xFFFFFF -#define QIB_QPN_MASK 0xFFFFFF -#define QIB_MULTICAST_LID_BASE 0xC000 #define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK #define QIB_MULTICAST_QPN 0xFFFFFF diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c deleted file mode 100644 index 2b45d0b02300..000000000000 --- a/drivers/infiniband/hw/qib/qib_cq.c +++ /dev/null @@ -1,545 +0,0 @@ -/* - * Copyright (c) 2013 Intel Corporation. All rights reserved. - * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/kthread.h> - -#include "qib_verbs.h" -#include "qib.h" - -/** - * qib_cq_enter - add a new entry to the completion queue - * @cq: completion queue - * @entry: work completion entry to add - * @sig: true if @entry is a solicitated entry - * - * This may be called with qp->s_lock held. - */ -void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited) -{ - struct qib_cq_wc *wc; - unsigned long flags; - u32 head; - u32 next; - - spin_lock_irqsave(&cq->lock, flags); - - /* - * Note that the head pointer might be writable by user processes. - * Take care to verify it is a sane value. - */ - wc = cq->queue; - head = wc->head; - if (head >= (unsigned) cq->ibcq.cqe) { - head = cq->ibcq.cqe; - next = 0; - } else - next = head + 1; - if (unlikely(next == wc->tail)) { - spin_unlock_irqrestore(&cq->lock, flags); - if (cq->ibcq.event_handler) { - struct ib_event ev; - - ev.device = cq->ibcq.device; - ev.element.cq = &cq->ibcq; - ev.event = IB_EVENT_CQ_ERR; - cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); - } - return; - } - if (cq->ip) { - wc->uqueue[head].wr_id = entry->wr_id; - wc->uqueue[head].status = entry->status; - wc->uqueue[head].opcode = entry->opcode; - wc->uqueue[head].vendor_err = entry->vendor_err; - wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = - (__u32 __force)entry->ex.imm_data; - wc->uqueue[head].qp_num = entry->qp->qp_num; - wc->uqueue[head].src_qp = entry->src_qp; - wc->uqueue[head].wc_flags = entry->wc_flags; - wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = entry->slid; - wc->uqueue[head].sl = entry->sl; - wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; - wc->uqueue[head].port_num = entry->port_num; - /* Make sure entry is written before the head index. */ - smp_wmb(); - } else - wc->kqueue[head] = *entry; - wc->head = next; - - if (cq->notify == IB_CQ_NEXT_COMP || - (cq->notify == IB_CQ_SOLICITED && - (solicited || entry->status != IB_WC_SUCCESS))) { - struct kthread_worker *worker; - /* - * This will cause send_complete() to be called in - * another thread. - */ - smp_rmb(); - worker = cq->dd->worker; - if (likely(worker)) { - cq->notify = IB_CQ_NONE; - cq->triggered++; - queue_kthread_work(worker, &cq->comptask); - } - } - - spin_unlock_irqrestore(&cq->lock, flags); -} - -/** - * qib_poll_cq - poll for work completion entries - * @ibcq: the completion queue to poll - * @num_entries: the maximum number of entries to return - * @entry: pointer to array where work completions are placed - * - * Returns the number of completion entries polled. - * - * This may be called from interrupt context. Also called by ib_poll_cq() - * in the generic verbs code. - */ -int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) -{ - struct qib_cq *cq = to_icq(ibcq); - struct qib_cq_wc *wc; - unsigned long flags; - int npolled; - u32 tail; - - /* The kernel can only poll a kernel completion queue */ - if (cq->ip) { - npolled = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&cq->lock, flags); - - wc = cq->queue; - tail = wc->tail; - if (tail > (u32) cq->ibcq.cqe) - tail = (u32) cq->ibcq.cqe; - for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { - if (tail == wc->head) - break; - /* The kernel doesn't need a RMB since it has the lock. */ - *entry = wc->kqueue[tail]; - if (tail >= cq->ibcq.cqe) - tail = 0; - else - tail++; - } - wc->tail = tail; - - spin_unlock_irqrestore(&cq->lock, flags); - -bail: - return npolled; -} - -static void send_complete(struct kthread_work *work) -{ - struct qib_cq *cq = container_of(work, struct qib_cq, comptask); - - /* - * The completion handler will most likely rearm the notification - * and poll for all pending entries. If a new completion entry - * is added while we are in this routine, queue_work() - * won't call us again until we return so we check triggered to - * see if we need to call the handler again. - */ - for (;;) { - u8 triggered = cq->triggered; - - /* - * IPoIB connected mode assumes the callback is from a - * soft IRQ. We simulate this by blocking "bottom halves". - * See the implementation for ipoib_cm_handle_tx_wc(), - * netif_tx_lock_bh() and netif_tx_lock(). - */ - local_bh_disable(); - cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); - local_bh_enable(); - - if (cq->triggered == triggered) - return; - } -} - -/** - * qib_create_cq - create a completion queue - * @ibdev: the device this completion queue is attached to - * @attr: creation attributes - * @context: unused by the QLogic_IB driver - * @udata: user data for libibverbs.so - * - * Returns a pointer to the completion queue or negative errno values - * for failure. - * - * Called by ib_create_cq() in the generic verbs code. - */ -struct ib_cq *qib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *context, - struct ib_udata *udata) -{ - int entries = attr->cqe; - struct qib_ibdev *dev = to_idev(ibdev); - struct qib_cq *cq; - struct qib_cq_wc *wc; - struct ib_cq *ret; - u32 sz; - - if (attr->flags) - return ERR_PTR(-EINVAL); - - if (entries < 1 || entries > ib_qib_max_cqes) { - ret = ERR_PTR(-EINVAL); - goto done; - } - - /* Allocate the completion queue structure. */ - cq = kmalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - ret = ERR_PTR(-ENOMEM); - goto done; - } - - /* - * Allocate the completion queue entries and head/tail pointers. - * This is allocated separately so that it can be resized and - * also mapped into user space. - * We need to use vmalloc() in order to support mmap and large - * numbers of entries. - */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = vmalloc_user(sz); - if (!wc) { - ret = ERR_PTR(-ENOMEM); - goto bail_cq; - } - - /* - * Return the address of the WC as the offset to mmap. - * See qib_mmap() for details. - */ - if (udata && udata->outlen >= sizeof(__u64)) { - int err; - - cq->ip = qib_create_mmap_info(dev, sz, context, wc); - if (!cq->ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_wc; - } - - err = ib_copy_to_udata(udata, &cq->ip->offset, - sizeof(cq->ip->offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } else - cq->ip = NULL; - - spin_lock(&dev->n_cqs_lock); - if (dev->n_cqs_allocated == ib_qib_max_cqs) { - spin_unlock(&dev->n_cqs_lock); - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - dev->n_cqs_allocated++; - spin_unlock(&dev->n_cqs_lock); - - if (cq->ip) { - spin_lock_irq(&dev->pending_lock); - list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - - /* - * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. - * The number of entries should be >= the number requested or return - * an error. - */ - cq->dd = dd_from_dev(dev); - cq->ibcq.cqe = entries; - cq->notify = IB_CQ_NONE; - cq->triggered = 0; - spin_lock_init(&cq->lock); - init_kthread_work(&cq->comptask, send_complete); - wc->head = 0; - wc->tail = 0; - cq->queue = wc; - - ret = &cq->ibcq; - - goto done; - -bail_ip: - kfree(cq->ip); -bail_wc: - vfree(wc); -bail_cq: - kfree(cq); -done: - return ret; -} - -/** - * qib_destroy_cq - destroy a completion queue - * @ibcq: the completion queue to destroy. - * - * Returns 0 for success. - * - * Called by ib_destroy_cq() in the generic verbs code. - */ -int qib_destroy_cq(struct ib_cq *ibcq) -{ - struct qib_ibdev *dev = to_idev(ibcq->device); - struct qib_cq *cq = to_icq(ibcq); - - flush_kthread_work(&cq->comptask); - spin_lock(&dev->n_cqs_lock); - dev->n_cqs_allocated--; - spin_unlock(&dev->n_cqs_lock); - if (cq->ip) - kref_put(&cq->ip->ref, qib_release_mmap_info); - else - vfree(cq->queue); - kfree(cq); - - return 0; -} - -/** - * qib_req_notify_cq - change the notification type for a completion queue - * @ibcq: the completion queue - * @notify_flags: the type of notification to request - * - * Returns 0 for success. - * - * This may be called from interrupt context. Also called by - * ib_req_notify_cq() in the generic verbs code. - */ -int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) -{ - struct qib_cq *cq = to_icq(ibcq); - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&cq->lock, flags); - /* - * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow - * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). - */ - if (cq->notify != IB_CQ_NEXT_COMP) - cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; - - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - cq->queue->head != cq->queue->tail) - ret = 1; - - spin_unlock_irqrestore(&cq->lock, flags); - - return ret; -} - -/** - * qib_resize_cq - change the size of the CQ - * @ibcq: the completion queue - * - * Returns 0 for success. - */ -int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) -{ - struct qib_cq *cq = to_icq(ibcq); - struct qib_cq_wc *old_wc; - struct qib_cq_wc *wc; - u32 head, tail, n; - int ret; - u32 sz; - - if (cqe < 1 || cqe > ib_qib_max_cqes) { - ret = -EINVAL; - goto bail; - } - - /* - * Need to use vmalloc() if we want to support large #s of entries. - */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = vmalloc_user(sz); - if (!wc) { - ret = -ENOMEM; - goto bail; - } - - /* Check that we can write the offset to mmap. */ - if (udata && udata->outlen >= sizeof(__u64)) { - __u64 offset = 0; - - ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); - if (ret) - goto bail_free; - } - - spin_lock_irq(&cq->lock); - /* - * Make sure head and tail are sane since they - * might be user writable. - */ - old_wc = cq->queue; - head = old_wc->head; - if (head > (u32) cq->ibcq.cqe) - head = (u32) cq->ibcq.cqe; - tail = old_wc->tail; - if (tail > (u32) cq->ibcq.cqe) - tail = (u32) cq->ibcq.cqe; - if (head < tail) - n = cq->ibcq.cqe + 1 + head - tail; - else - n = head - tail; - if (unlikely((u32)cqe < n)) { - ret = -EINVAL; - goto bail_unlock; - } - for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; - else - wc->kqueue[n] = old_wc->kqueue[tail]; - if (tail == (u32) cq->ibcq.cqe) - tail = 0; - else - tail++; - } - cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; - spin_unlock_irq(&cq->lock); - - vfree(old_wc); - - if (cq->ip) { - struct qib_ibdev *dev = to_idev(ibcq->device); - struct qib_mmap_info *ip = cq->ip; - - qib_update_mmap_info(dev, ip, sz, wc); - - /* - * Return the offset to mmap. - * See qib_mmap() for details. - */ - if (udata && udata->outlen >= sizeof(__u64)) { - ret = ib_copy_to_udata(udata, &ip->offset, - sizeof(ip->offset)); - if (ret) - goto bail; - } - - spin_lock_irq(&dev->pending_lock); - if (list_empty(&ip->pending_mmaps)) - list_add(&ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - - ret = 0; - goto bail; - -bail_unlock: - spin_unlock_irq(&cq->lock); -bail_free: - vfree(wc); -bail: - return ret; -} - -int qib_cq_init(struct qib_devdata *dd) -{ - int ret = 0; - int cpu; - struct task_struct *task; - - if (dd->worker) - return 0; - dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); - if (!dd->worker) - return -ENOMEM; - init_kthread_worker(dd->worker); - task = kthread_create_on_node( - kthread_worker_fn, - dd->worker, - dd->assigned_node_id, - "qib_cq%d", dd->unit); - if (IS_ERR(task)) - goto task_fail; - cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); - kthread_bind(task, cpu); - wake_up_process(task); -out: - return ret; -task_fail: - ret = PTR_ERR(task); - kfree(dd->worker); - dd->worker = NULL; - goto out; -} - -void qib_cq_exit(struct qib_devdata *dd) -{ - struct kthread_worker *worker; - - worker = dd->worker; - if (!worker) - return; - /* blocks future queuing from send_complete() */ - dd->worker = NULL; - smp_wmb(); - flush_kthread_worker(worker); - kthread_stop(worker->task); - kfree(worker); -} diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index f58fdc3d25a2..67ee6438cf59 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -90,6 +90,22 @@ const char *qib_get_unit_name(int unit) return iname; } +const char *qib_get_card_name(struct rvt_dev_info *rdi) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + return qib_get_unit_name(dd->unit); +} + +struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi) +{ + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + return dd->pcidev; +} + /* * Return count of units with at least one port ACTIVE. */ @@ -306,7 +322,9 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr; struct qib_other_headers *ohdr = NULL; struct qib_ibport *ibp = &ppd->ibport_data; - struct qib_qp *qp = NULL; + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp = NULL; u32 tlen = qib_hdrget_length_in_bytes(rhf_addr); u16 lid = be16_to_cpu(hdr->lrh[1]); int lnh = be16_to_cpu(hdr->lrh[0]) & 3; @@ -319,7 +337,7 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, if (tlen < 24) goto drop; - if (lid < QIB_MULTICAST_LID_BASE) { + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { lid &= ~((1 << ppd->lmc) - 1); if (unlikely(lid != ppd->lid)) goto drop; @@ -346,13 +364,16 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, psn = be32_to_cpu(ohdr->bth[2]); /* Get the destination QP number. */ - qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; if (qp_num != QIB_MULTICAST_QPN) { int ruc_res; - qp = qib_lookup_qpn(ibp, qp_num); - if (!qp) + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); goto drop; + } /* * Handle only RC QPs - for other QP types drop error @@ -361,9 +382,9 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, spin_lock(&qp->r_lock); /* Check for valid receive state. */ - if (!(ib_qib_state_ops[qp->state] & - QIB_PROCESS_RECV_OK)) { - ibp->n_pkt_drops++; + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; goto unlock; } @@ -383,7 +404,7 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { diff = qib_cmp24(psn, qp->r_psn); if (!qp->r_nak_state && diff >= 0) { - ibp->n_rc_seqnak++; + ibp->rvp.n_rc_seqnak++; qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ @@ -398,7 +419,7 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, */ if (list_empty(&qp->rspwait)) { qp->r_flags |= - QIB_R_RSP_NAK; + RVT_R_RSP_NAK; atomic_inc( &qp->refcount); list_add_tail( @@ -419,12 +440,7 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd, unlock: spin_unlock(&qp->r_lock); - /* - * Notify qib_destroy_qp() if it is waiting - * for us to finish. - */ - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); + rcu_read_unlock(); } /* Unicast QP */ } /* Valid packet with TIDErr */ @@ -456,7 +472,7 @@ u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts) u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0; int last; u64 lval; - struct qib_qp *qp, *nqp; + struct rvt_qp *qp, *nqp; l = rcd->head; rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset; @@ -549,15 +565,6 @@ move_along: updegr = 0; } } - /* - * Notify qib_destroy_qp() if it is waiting - * for lookaside_qp to finish. - */ - if (rcd->lookaside_qp) { - if (atomic_dec_and_test(&rcd->lookaside_qp->refcount)) - wake_up(&rcd->lookaside_qp->wait); - rcd->lookaside_qp = NULL; - } rcd->head = l; @@ -567,17 +574,17 @@ move_along: */ list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { list_del_init(&qp->rspwait); - if (qp->r_flags & QIB_R_RSP_NAK) { - qp->r_flags &= ~QIB_R_RSP_NAK; + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; qib_send_rc_ack(qp); } - if (qp->r_flags & QIB_R_RSP_SEND) { + if (qp->r_flags & RVT_R_RSP_SEND) { unsigned long flags; - qp->r_flags &= ~QIB_R_RSP_SEND; + qp->r_flags &= ~RVT_R_RSP_SEND; spin_lock_irqsave(&qp->s_lock, flags); - if (ib_qib_state_ops[qp->state] & - QIB_PROCESS_OR_FLUSH_SEND) + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) qib_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); } diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c index 4b927809d1a1..a3733f25280f 100644 --- a/drivers/infiniband/hw/qib/qib_iba6120.c +++ b/drivers/infiniband/hw/qib/qib_iba6120.c @@ -2956,13 +2956,13 @@ static void pma_6120_timer(unsigned long data) struct qib_ibport *ibp = &ppd->ibport_data; unsigned long flags; - spin_lock_irqsave(&ibp->lock, flags); + spin_lock_irqsave(&ibp->rvp.lock, flags); if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) { cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; qib_snapshot_counters(ppd, &cs->sword, &cs->rword, &cs->spkts, &cs->rpkts, &cs->xmit_wait); mod_timer(&cs->pma_timer, - jiffies + usecs_to_jiffies(ibp->pma_sample_interval)); + jiffies + usecs_to_jiffies(ibp->rvp.pma_sample_interval)); } else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { u64 ta, tb, tc, td, te; @@ -2975,11 +2975,11 @@ static void pma_6120_timer(unsigned long data) cs->rpkts = td - cs->rpkts; cs->xmit_wait = te - cs->xmit_wait; } - spin_unlock_irqrestore(&ibp->lock, flags); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); } /* - * Note that the caller has the ibp->lock held. + * Note that the caller has the ibp->rvp.lock held. */ static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv, u32 start) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 6c8ff10101c0..82d7c4bf5970 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2910,8 +2910,6 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags); qib_qsfp_deinit(&dd->pport[i].cpspec->qsfp_data); } - if (dd->pport[i].ibport_data.smi_ah) - ib_destroy_ah(&dd->pport[i].ibport_data.smi_ah->ibah); } } @@ -5497,7 +5495,7 @@ static void try_7322_ipg(struct qib_pportdata *ppd) unsigned delay; int ret; - agent = ibp->send_agent; + agent = ibp->rvp.send_agent; if (!agent) goto retry; @@ -5515,7 +5513,7 @@ static void try_7322_ipg(struct qib_pportdata *ppd) ret = PTR_ERR(ah); else { send_buf->ah = ah; - ibp->smi_ah = to_iah(ah); + ibp->smi_ah = ibah_to_rvtah(ah); ret = 0; } } else { diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 4ff340fe904f..3f062f0dd9d8 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -42,6 +42,7 @@ #ifdef CONFIG_INFINIBAND_QIB_DCA #include <linux/dca.h> #endif +#include <rdma/rdma_vt.h> #include "qib.h" #include "qib_common.h" @@ -244,6 +245,13 @@ int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, alloc_percpu(struct qib_pma_counters); if (!ppd->ibport_data.pmastats) return -ENOMEM; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!(ppd->ibport_data.rvp.rc_acks) || + !(ppd->ibport_data.rvp.rc_qacks) || + !(ppd->ibport_data.rvp.rc_delayed_comp)) + return -ENOMEM; if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) goto bail; @@ -449,8 +457,6 @@ static int loadtime_init(struct qib_devdata *dd) init_timer(&dd->intrchk_timer); dd->intrchk_timer.function = verify_interrupt; dd->intrchk_timer.data = (unsigned long) dd; - - ret = qib_cq_init(dd); done: return ret; } @@ -631,6 +637,9 @@ wq_error: static void qib_free_pportdata(struct qib_pportdata *ppd) { free_percpu(ppd->ibport_data.pmastats); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); ppd->ibport_data.pmastats = NULL; } @@ -1081,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd) qib_dbg_ibdev_exit(&dd->verbs_dev); #endif free_percpu(dd->int_counter); - ib_dealloc_device(&dd->verbs_dev.ibdev); + ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); } u64 qib_int_counter(struct qib_devdata *dd) @@ -1120,9 +1129,12 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) { unsigned long flags; struct qib_devdata *dd; - int ret; + int ret, nports; - dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra); + /* extra is * number of ports */ + nports = extra / sizeof(struct qib_pportdata); + dd = (struct qib_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); if (!dd) return ERR_PTR(-ENOMEM); @@ -1171,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.ibdev); + ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); return ERR_PTR(ret); } @@ -1421,7 +1433,6 @@ static void cleanup_device_data(struct qib_devdata *dd) } kfree(tmp); kfree(dd->boardname); - qib_cq_exit(dd); } /* diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c index 086616d071b9..a014fd4cd076 100644 --- a/drivers/infiniband/hw/qib/qib_intr.c +++ b/drivers/infiniband/hw/qib/qib_intr.c @@ -74,7 +74,7 @@ static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev) struct ib_event event; struct qib_devdata *dd = ppd->dd; - event.device = &dd->verbs_dev.ibdev; + event.device = &dd->verbs_dev.rdi.ibdev; event.element.port_num = ppd->port; event.event = ev; ib_dispatch_event(&event); diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c index d725c565518d..2c3c93572c17 100644 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ b/drivers/infiniband/hw/qib/qib_keys.c @@ -46,20 +46,20 @@ * */ -int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) +int qib_alloc_lkey(struct rvt_mregion *mr, int dma_region) { unsigned long flags; u32 r; u32 n; int ret = 0; struct qib_ibdev *dev = to_idev(mr->pd->device); - struct qib_lkey_table *rkt = &dev->lk_table; + struct rvt_lkey_table *rkt = &dev->lk_table; spin_lock_irqsave(&rkt->lock, flags); /* special case for dma_mr lkey == 0 */ if (dma_region) { - struct qib_mregion *tmr; + struct rvt_mregion *tmr; tmr = rcu_access_pointer(dev->dma_mr); if (!tmr) { @@ -90,8 +90,8 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region) * bits are capped in qib_verbs.c to insure enough bits * for generation number */ - mr->lkey = (r << (32 - ib_qib_lkey_table_size)) | - ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen) + mr->lkey = (r << (32 - ib_rvt_lkey_table_size)) | + ((((1 << (24 - ib_rvt_lkey_table_size)) - 1) & rkt->gen) << 8); if (mr->lkey == 0) { mr->lkey |= 1 << 8; @@ -114,13 +114,13 @@ bail: * qib_free_lkey - free an lkey * @mr: mr to free from tables */ -void qib_free_lkey(struct qib_mregion *mr) +void qib_free_lkey(struct rvt_mregion *mr) { unsigned long flags; u32 lkey = mr->lkey; u32 r; struct qib_ibdev *dev = to_idev(mr->pd->device); - struct qib_lkey_table *rkt = &dev->lk_table; + struct rvt_lkey_table *rkt = &dev->lk_table; spin_lock_irqsave(&rkt->lock, flags); if (!mr->lkey_published) @@ -128,7 +128,7 @@ void qib_free_lkey(struct qib_mregion *mr) if (lkey == 0) RCU_INIT_POINTER(dev->dma_mr, NULL); else { - r = lkey >> (32 - ib_qib_lkey_table_size); + r = lkey >> (32 - ib_rvt_lkey_table_size); RCU_INIT_POINTER(rkt->table[r], NULL); } qib_put_mr(mr); @@ -138,105 +138,6 @@ out: } /** - * qib_lkey_ok - check IB SGE for validity and initialize - * @rkt: table containing lkey to check SGE against - * @pd: protection domain - * @isge: outgoing internal SGE - * @sge: SGE to check - * @acc: access flags - * - * Return 1 if valid and successful, otherwise returns 0. - * - * increments the reference count upon success - * - * Check the IB SGE for validity and initialize our internal version - * of it. - */ -int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, - struct qib_sge *isge, struct ib_sge *sge, int acc) -{ - struct qib_mregion *mr; - unsigned n, m; - size_t off; - - /* - * We use LKEY == zero for kernel virtual addresses - * (see qib_get_dma_mr and qib_dma.c). - */ - rcu_read_lock(); - if (sge->lkey == 0) { - struct qib_ibdev *dev = to_idev(pd->ibpd.device); - - if (pd->user) - goto bail; - mr = rcu_dereference(dev->dma_mr); - if (!mr) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - isge->mr = mr; - isge->vaddr = (void *) sge->addr; - isge->length = sge->length; - isge->sge_length = sge->length; - isge->m = 0; - isge->n = 0; - goto ok; - } - mr = rcu_dereference( - rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]); - if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) - goto bail; - - off = sge->addr - mr->user_base; - if (unlikely(sge->addr < mr->user_base || - off + sge->length > mr->length || - (mr->access_flags & acc) != acc)) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - off += mr->offset; - if (mr->page_shift) { - /* - page sizes are uniform power of 2 so no loop is necessary - entries_spanned_by_off is the number of times the loop below - would have executed. - */ - size_t entries_spanned_by_off; - - entries_spanned_by_off = off >> mr->page_shift; - off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off/QIB_SEGSZ; - n = entries_spanned_by_off%QIB_SEGSZ; - } else { - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= QIB_SEGSZ) { - m++; - n = 0; - } - } - } - isge->mr = mr; - isge->vaddr = mr->map[m]->segs[n].vaddr + off; - isge->length = mr->map[m]->segs[n].length - off; - isge->sge_length = sge->length; - isge->m = m; - isge->n = n; -ok: - return 1; -bail: - rcu_read_unlock(); - return 0; -} - -/** * qib_rkey_ok - check the IB virtual address, length, and RKEY * @qp: qp for validation * @sge: SGE state @@ -249,11 +150,11 @@ bail: * * increments the reference count upon success */ -int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, +int qib_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc) { - struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; - struct qib_mregion *mr; + struct rvt_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct rvt_mregion *mr; unsigned n, m; size_t off; @@ -263,7 +164,7 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, */ rcu_read_lock(); if (rkey == 0) { - struct qib_pd *pd = to_ipd(qp->ibqp.pd); + struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd); struct qib_ibdev *dev = to_idev(pd->ibpd.device); if (pd->user) @@ -285,7 +186,7 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, } mr = rcu_dereference( - rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]); + rkt->table[(rkey >> (32 - ib_rvt_lkey_table_size))]); if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; @@ -308,15 +209,15 @@ int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, entries_spanned_by_off = off >> mr->page_shift; off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off/QIB_SEGSZ; - n = entries_spanned_by_off%QIB_SEGSZ; + m = entries_spanned_by_off / RVT_SEGSZ; + n = entries_spanned_by_off % RVT_SEGSZ; } else { m = 0; n = 0; while (off >= mr->map[m]->segs[n].length) { off -= mr->map[m]->segs[n].length; n++; - if (n >= QIB_SEGSZ) { + if (n >= RVT_SEGSZ) { m++; n = 0; } @@ -335,58 +236,3 @@ bail: return 0; } -/* - * Initialize the memory region specified by the work request. - */ -int qib_reg_mr(struct qib_qp *qp, struct ib_reg_wr *wr) -{ - struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; - struct qib_pd *pd = to_ipd(qp->ibqp.pd); - struct qib_mr *mr = to_imr(wr->mr); - struct qib_mregion *mrg; - u32 key = wr->key; - unsigned i, n, m; - int ret = -EINVAL; - unsigned long flags; - u64 *page_list; - size_t ps; - - spin_lock_irqsave(&rkt->lock, flags); - if (pd->user || key == 0) - goto bail; - - mrg = rcu_dereference_protected( - rkt->table[(key >> (32 - ib_qib_lkey_table_size))], - lockdep_is_held(&rkt->lock)); - if (unlikely(mrg == NULL || qp->ibqp.pd != mrg->pd)) - goto bail; - - if (mr->npages > mrg->max_segs) - goto bail; - - ps = mr->ibmr.page_size; - if (mr->ibmr.length > ps * mr->npages) - goto bail; - - mrg->user_base = mr->ibmr.iova; - mrg->iova = mr->ibmr.iova; - mrg->lkey = key; - mrg->length = mr->ibmr.length; - mrg->access_flags = wr->access; - page_list = mr->pages; - m = 0; - n = 0; - for (i = 0; i < mr->npages; i++) { - mrg->map[m]->segs[n].vaddr = (void *) page_list[i]; - mrg->map[m]->segs[n].length = ps; - if (++n == QIB_SEGSZ) { - m++; - n = 0; - } - } - - ret = 0; -bail: - spin_unlock_irqrestore(&rkt->lock, flags); - return ret; -} diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 9625e7c438e5..0bd18375d7df 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -70,7 +70,7 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) unsigned long flags; unsigned long timeout; - agent = ibp->send_agent; + agent = ibp->rvp.send_agent; if (!agent) return; @@ -79,7 +79,8 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) return; /* o14-2 */ - if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout)) + if (ibp->rvp.trap_timeout && + time_before(jiffies, ibp->rvp.trap_timeout)) return; send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR, @@ -93,42 +94,42 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; smp->class_version = 1; smp->method = IB_MGMT_METHOD_TRAP; - ibp->tid++; - smp->tid = cpu_to_be64(ibp->tid); + ibp->rvp.tid++; + smp->tid = cpu_to_be64(ibp->rvp.tid); smp->attr_id = IB_SMP_ATTR_NOTICE; /* o14-1: smp->mkey = 0; */ memcpy(smp->data, data, len); - spin_lock_irqsave(&ibp->lock, flags); - if (!ibp->sm_ah) { - if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { struct ib_ah *ah; - ah = qib_create_qp0_ah(ibp, ibp->sm_lid); + ah = qib_create_qp0_ah(ibp, ibp->rvp.sm_lid); if (IS_ERR(ah)) ret = PTR_ERR(ah); else { send_buf->ah = ah; - ibp->sm_ah = to_iah(ah); + ibp->rvp.sm_ah = ibah_to_rvtah(ah); ret = 0; } } else ret = -EINVAL; } else { - send_buf->ah = &ibp->sm_ah->ibah; + send_buf->ah = &ibp->rvp.sm_ah->ibah; ret = 0; } - spin_unlock_irqrestore(&ibp->lock, flags); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); if (!ret) ret = ib_post_send_mad(send_buf, NULL); if (!ret) { /* 4.096 usec. */ - timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000; - ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout); + timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; + ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); } else { ib_free_send_mad(send_buf); - ibp->trap_timeout = 0; + ibp->rvp.trap_timeout = 0; } } @@ -141,10 +142,10 @@ void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, struct ib_mad_notice_attr data; if (trap_num == IB_NOTICE_TRAP_BAD_PKEY) - ibp->pkey_violations++; + ibp->rvp.pkey_violations++; else - ibp->qkey_violations++; - ibp->n_pkt_drops++; + ibp->rvp.qkey_violations++; + ibp->rvp.n_pkt_drops++; /* Send violation trap */ data.generic_type = IB_NOTICE_TYPE_SECURITY; @@ -205,8 +206,11 @@ static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp) /* * Send a Port Capability Mask Changed trap (ch. 14.3.11). */ -void qib_cap_mask_chg(struct qib_ibport *ibp) +void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) { + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_ibport *ibp = &dd->pport[port_num - 1].ibport_data; struct ib_mad_notice_attr data; data.generic_type = IB_NOTICE_TYPE_INFO; @@ -217,8 +221,8 @@ void qib_cap_mask_chg(struct qib_ibport *ibp) data.toggle_count = 0; memset(&data.details, 0, sizeof(data.details)); data.details.ntc_144.lid = data.issuer_lid; - data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags); - + data.details.ntc_144.new_cap_mask = + cpu_to_be32(ibp->rvp.port_cap_flags); qib_send_trap(ibp, &data, sizeof(data)); } @@ -409,37 +413,38 @@ static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags) int ret = 0; /* Is the mkey in the process of expiring? */ - if (ibp->mkey_lease_timeout && - time_after_eq(jiffies, ibp->mkey_lease_timeout)) { + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { /* Clear timeout and mkey protection field. */ - ibp->mkey_lease_timeout = 0; - ibp->mkeyprot = 0; + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; } - if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 || - ibp->mkey == smp->mkey) + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == smp->mkey) valid_mkey = 1; /* Unset lease timeout on any valid Get/Set/TrapRepress */ - if (valid_mkey && ibp->mkey_lease_timeout && + if (valid_mkey && ibp->rvp.mkey_lease_timeout && (smp->method == IB_MGMT_METHOD_GET || smp->method == IB_MGMT_METHOD_SET || smp->method == IB_MGMT_METHOD_TRAP_REPRESS)) - ibp->mkey_lease_timeout = 0; + ibp->rvp.mkey_lease_timeout = 0; if (!valid_mkey) { switch (smp->method) { case IB_MGMT_METHOD_GET: /* Bad mkey not a violation below level 2 */ - if (ibp->mkeyprot < 2) + if (ibp->rvp.mkeyprot < 2) break; case IB_MGMT_METHOD_SET: case IB_MGMT_METHOD_TRAP_REPRESS: - if (ibp->mkey_violations != 0xFFFF) - ++ibp->mkey_violations; - if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period) - ibp->mkey_lease_timeout = jiffies + - ibp->mkey_lease_period * HZ; + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; /* Generate a trap notice. */ qib_bad_mkey(ibp, smp); ret = 1; @@ -489,15 +494,15 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, /* Only return the mkey if the protection field allows it. */ if (!(smp->method == IB_MGMT_METHOD_GET && - ibp->mkey != smp->mkey && - ibp->mkeyprot == 1)) - pip->mkey = ibp->mkey; - pip->gid_prefix = ibp->gid_prefix; + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pip->mkey = ibp->rvp.mkey; + pip->gid_prefix = ibp->rvp.gid_prefix; pip->lid = cpu_to_be16(ppd->lid); - pip->sm_lid = cpu_to_be16(ibp->sm_lid); - pip->cap_mask = cpu_to_be32(ibp->port_cap_flags); + pip->sm_lid = cpu_to_be16(ibp->rvp.sm_lid); + pip->cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); /* pip->diag_code; */ - pip->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period); + pip->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); pip->local_port_num = port; pip->link_width_enabled = ppd->link_width_enabled; pip->link_width_supported = ppd->link_width_supported; @@ -508,7 +513,7 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, pip->portphysstate_linkdown = (dd->f_ibphys_portstate(ppd->lastibcstat) << 4) | (get_linkdowndefaultstate(ppd) ? 1 : 2); - pip->mkeyprot_resv_lmc = (ibp->mkeyprot << 6) | ppd->lmc; + pip->mkeyprot_resv_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) | ppd->link_speed_enabled; switch (ppd->ibmtu) { @@ -529,9 +534,9 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, mtu = IB_MTU_256; break; } - pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->sm_sl; + pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->rvp.sm_sl; pip->vlcap_inittype = ppd->vls_supported << 4; /* InitType = 0 */ - pip->vl_high_limit = ibp->vl_high_limit; + pip->vl_high_limit = ibp->rvp.vl_high_limit; pip->vl_arb_high_cap = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP); pip->vl_arb_low_cap = @@ -542,20 +547,20 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, /* pip->vlstallcnt_hoqlife; */ pip->operationalvl_pei_peo_fpi_fpo = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4; - pip->mkey_violations = cpu_to_be16(ibp->mkey_violations); + pip->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); /* P_KeyViolations are counted by hardware. */ - pip->pkey_violations = cpu_to_be16(ibp->pkey_violations); - pip->qkey_violations = cpu_to_be16(ibp->qkey_violations); + pip->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pip->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); /* Only the hardware GUID is supported for now */ pip->guid_cap = QIB_GUIDS_PER_PORT; - pip->clientrereg_resv_subnetto = ibp->subnet_timeout; + pip->clientrereg_resv_subnetto = ibp->rvp.subnet_timeout; /* 32.768 usec. response time (guessing) */ pip->resv_resptimevalue = 3; pip->localphyerrors_overrunerrors = (get_phyerrthreshold(ppd) << 4) | get_overrunthreshold(ppd); /* pip->max_credit_hint; */ - if (ibp->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + if (ibp->rvp.port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { u32 v; v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY); @@ -685,13 +690,13 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, event.device = ibdev; event.element.port_num = port; - ibp->mkey = pip->mkey; - ibp->gid_prefix = pip->gid_prefix; - ibp->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + ibp->rvp.mkey = pip->mkey; + ibp->rvp.gid_prefix = pip->gid_prefix; + ibp->rvp.mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); lid = be16_to_cpu(pip->lid); /* Must be a valid unicast LID address. */ - if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE) + if (lid == 0 || lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) smp->status |= IB_SMP_INVALID_FIELD; else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) { if (ppd->lid != lid) @@ -706,21 +711,21 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, smlid = be16_to_cpu(pip->sm_lid); msl = pip->neighbormtu_mastersmsl & 0xF; /* Must be a valid unicast LID address. */ - if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE) + if (smlid == 0 || smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) smp->status |= IB_SMP_INVALID_FIELD; - else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { - spin_lock_irqsave(&ibp->lock, flags); - if (ibp->sm_ah) { - if (smlid != ibp->sm_lid) - ibp->sm_ah->attr.dlid = smlid; - if (msl != ibp->sm_sl) - ibp->sm_ah->attr.sl = msl; + else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_ah->attr.dlid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_ah->attr.sl = msl; } - spin_unlock_irqrestore(&ibp->lock, flags); - if (smlid != ibp->sm_lid) - ibp->sm_lid = smlid; - if (msl != ibp->sm_sl) - ibp->sm_sl = msl; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; event.event = IB_EVENT_SM_CHANGE; ib_dispatch_event(&event); } @@ -768,10 +773,10 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, smp->status |= IB_SMP_INVALID_FIELD; } - ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6; - ibp->vl_high_limit = pip->vl_high_limit; + ibp->rvp.mkeyprot = pip->mkeyprot_resv_lmc >> 6; + ibp->rvp.vl_high_limit = pip->vl_high_limit; (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT, - ibp->vl_high_limit); + ibp->rvp.vl_high_limit); mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF); if (mtu == -1) @@ -789,13 +794,13 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, } if (pip->mkey_violations == 0) - ibp->mkey_violations = 0; + ibp->rvp.mkey_violations = 0; if (pip->pkey_violations == 0) - ibp->pkey_violations = 0; + ibp->rvp.pkey_violations = 0; if (pip->qkey_violations == 0) - ibp->qkey_violations = 0; + ibp->rvp.qkey_violations = 0; ore = pip->localphyerrors_overrunerrors; if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF)) @@ -804,7 +809,7 @@ static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, if (set_overrunthreshold(ppd, (ore & 0xF))) smp->status |= IB_SMP_INVALID_FIELD; - ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + ibp->rvp.subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; /* * Do the port state change now that the other link parameters @@ -1028,7 +1033,7 @@ static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys) (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); event.event = IB_EVENT_PKEY_CHANGE; - event.device = &dd->verbs_dev.ibdev; + event.device = &dd->verbs_dev.rdi.ibdev; event.element.port_num = port; ib_dispatch_event(&event); } @@ -1062,7 +1067,7 @@ static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, memset(smp->data, 0, sizeof(smp->data)); - if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) + if (!(ibp->rvp.port_cap_flags & IB_PORT_SL_MAP_SUP)) smp->status |= IB_SMP_UNSUP_METHOD; else for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2) @@ -1078,7 +1083,7 @@ static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev, u8 *p = (u8 *) smp->data; unsigned i; - if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) { + if (!(ibp->rvp.port_cap_flags & IB_PORT_SL_MAP_SUP)) { smp->status |= IB_SMP_UNSUP_METHOD; return reply(smp); } @@ -1195,20 +1200,20 @@ static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp, pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; goto bail; } - spin_lock_irqsave(&ibp->lock, flags); + spin_lock_irqsave(&ibp->rvp.lock, flags); p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS); p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); p->counter_width = 4; /* 32 bit counters */ p->counter_mask0_9 = COUNTER_MASK0_9; - p->sample_start = cpu_to_be32(ibp->pma_sample_start); - p->sample_interval = cpu_to_be32(ibp->pma_sample_interval); - p->tag = cpu_to_be16(ibp->pma_tag); - p->counter_select[0] = ibp->pma_counter_select[0]; - p->counter_select[1] = ibp->pma_counter_select[1]; - p->counter_select[2] = ibp->pma_counter_select[2]; - p->counter_select[3] = ibp->pma_counter_select[3]; - p->counter_select[4] = ibp->pma_counter_select[4]; - spin_unlock_irqrestore(&ibp->lock, flags); + p->sample_start = cpu_to_be32(ibp->rvp.pma_sample_start); + p->sample_interval = cpu_to_be32(ibp->rvp.pma_sample_interval); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); + p->counter_select[0] = ibp->rvp.pma_counter_select[0]; + p->counter_select[1] = ibp->rvp.pma_counter_select[1]; + p->counter_select[2] = ibp->rvp.pma_counter_select[2]; + p->counter_select[3] = ibp->rvp.pma_counter_select[3]; + p->counter_select[4] = ibp->rvp.pma_counter_select[4]; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); bail: return reply((struct ib_smp *) pmp); @@ -1233,7 +1238,7 @@ static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp, goto bail; } - spin_lock_irqsave(&ibp->lock, flags); + spin_lock_irqsave(&ibp->rvp.lock, flags); /* Port Sampling code owns the PS* HW counters */ xmit_flags = ppd->cong_stats.flags; @@ -1242,18 +1247,18 @@ static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp, if (status == IB_PMA_SAMPLE_STATUS_DONE || (status == IB_PMA_SAMPLE_STATUS_RUNNING && xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) { - ibp->pma_sample_start = be32_to_cpu(p->sample_start); - ibp->pma_sample_interval = be32_to_cpu(p->sample_interval); - ibp->pma_tag = be16_to_cpu(p->tag); - ibp->pma_counter_select[0] = p->counter_select[0]; - ibp->pma_counter_select[1] = p->counter_select[1]; - ibp->pma_counter_select[2] = p->counter_select[2]; - ibp->pma_counter_select[3] = p->counter_select[3]; - ibp->pma_counter_select[4] = p->counter_select[4]; - dd->f_set_cntr_sample(ppd, ibp->pma_sample_interval, - ibp->pma_sample_start); + ibp->rvp.pma_sample_start = be32_to_cpu(p->sample_start); + ibp->rvp.pma_sample_interval = be32_to_cpu(p->sample_interval); + ibp->rvp.pma_tag = be16_to_cpu(p->tag); + ibp->rvp.pma_counter_select[0] = p->counter_select[0]; + ibp->rvp.pma_counter_select[1] = p->counter_select[1]; + ibp->rvp.pma_counter_select[2] = p->counter_select[2]; + ibp->rvp.pma_counter_select[3] = p->counter_select[3]; + ibp->rvp.pma_counter_select[4] = p->counter_select[4]; + dd->f_set_cntr_sample(ppd, ibp->rvp.pma_sample_interval, + ibp->rvp.pma_sample_start); } - spin_unlock_irqrestore(&ibp->lock, flags); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); ret = pma_get_portsamplescontrol(pmp, ibdev, port); @@ -1357,8 +1362,8 @@ static int pma_get_portsamplesresult(struct ib_pma_mad *pmp, int i; memset(pmp->data, 0, sizeof(pmp->data)); - spin_lock_irqsave(&ibp->lock, flags); - p->tag = cpu_to_be16(ibp->pma_tag); + spin_lock_irqsave(&ibp->rvp.lock, flags); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; else { @@ -1373,11 +1378,11 @@ static int pma_get_portsamplesresult(struct ib_pma_mad *pmp, ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; } } - for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + for (i = 0; i < ARRAY_SIZE(ibp->rvp.pma_counter_select); i++) p->counter[i] = cpu_to_be32( get_cache_hw_sample_counters( - ppd, ibp->pma_counter_select[i])); - spin_unlock_irqrestore(&ibp->lock, flags); + ppd, ibp->rvp.pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); return reply((struct ib_smp *) pmp); } @@ -1397,8 +1402,8 @@ static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, /* Port Sampling code owns the PS* HW counters */ memset(pmp->data, 0, sizeof(pmp->data)); - spin_lock_irqsave(&ibp->lock, flags); - p->tag = cpu_to_be16(ibp->pma_tag); + spin_lock_irqsave(&ibp->rvp.lock, flags); + p->tag = cpu_to_be16(ibp->rvp.pma_tag); if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER) p->sample_status = IB_PMA_SAMPLE_STATUS_DONE; else { @@ -1415,11 +1420,11 @@ static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER; } } - for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++) + for (i = 0; i < ARRAY_SIZE(ibp->rvp.pma_counter_select); i++) p->counter[i] = cpu_to_be64( get_cache_hw_sample_counters( - ppd, ibp->pma_counter_select[i])); - spin_unlock_irqrestore(&ibp->lock, flags); + ppd, ibp->rvp.pma_counter_select[i])); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); return reply((struct ib_smp *) pmp); } @@ -1453,7 +1458,7 @@ static int pma_get_portcounters(struct ib_pma_mad *pmp, cntrs.excessive_buffer_overrun_errors -= ibp->z_excessive_buffer_overrun_errors; cntrs.vl15_dropped -= ibp->z_vl15_dropped; - cntrs.vl15_dropped += ibp->n_vl15_dropped; + cntrs.vl15_dropped += ibp->rvp.n_vl15_dropped; memset(pmp->data, 0, sizeof(pmp->data)); @@ -1546,9 +1551,9 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; qib_get_counters(ppd, &cntrs); - spin_lock_irqsave(&ppd->ibport_data.lock, flags); + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); xmit_wait_counter = xmit_wait_get_value_delta(ppd); - spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); /* Adjust counters for any resets done. */ cntrs.symbol_error_counter -= ibp->z_symbol_error_counter; @@ -1564,7 +1569,7 @@ static int pma_get_portcounters_cong(struct ib_pma_mad *pmp, cntrs.excessive_buffer_overrun_errors -= ibp->z_excessive_buffer_overrun_errors; cntrs.vl15_dropped -= ibp->z_vl15_dropped; - cntrs.vl15_dropped += ibp->n_vl15_dropped; + cntrs.vl15_dropped += ibp->rvp.n_vl15_dropped; cntrs.port_xmit_data -= ibp->z_port_xmit_data; cntrs.port_rcv_data -= ibp->z_port_rcv_data; cntrs.port_xmit_packets -= ibp->z_port_xmit_packets; @@ -1743,7 +1748,7 @@ static int pma_set_portcounters(struct ib_pma_mad *pmp, cntrs.excessive_buffer_overrun_errors; if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { - ibp->n_vl15_dropped = 0; + ibp->rvp.n_vl15_dropped = 0; ibp->z_vl15_dropped = cntrs.vl15_dropped; } @@ -1778,11 +1783,11 @@ static int pma_set_portcounters_cong(struct ib_pma_mad *pmp, ret = pma_get_portcounters_cong(pmp, ibdev, port); if (counter_select & IB_PMA_SEL_CONG_XMIT) { - spin_lock_irqsave(&ppd->ibport_data.lock, flags); + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); ppd->cong_stats.counter = 0; dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0); - spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); } if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) { ibp->z_port_xmit_data = cntrs.port_xmit_data; @@ -1806,7 +1811,7 @@ static int pma_set_portcounters_cong(struct ib_pma_mad *pmp, cntrs.local_link_integrity_errors; ibp->z_excessive_buffer_overrun_errors = cntrs.excessive_buffer_overrun_errors; - ibp->n_vl15_dropped = 0; + ibp->rvp.n_vl15_dropped = 0; ibp->z_vl15_dropped = cntrs.vl15_dropped; } @@ -1916,12 +1921,12 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, ret = subn_get_vl_arb(smp, ibdev, port); goto bail; case IB_SMP_ATTR_SM_INFO: - if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) { ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; goto bail; } - if (ibp->port_cap_flags & IB_PORT_SM) { + if (ibp->rvp.port_cap_flags & IB_PORT_SM) { ret = IB_MAD_RESULT_SUCCESS; goto bail; } @@ -1950,12 +1955,12 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, ret = subn_set_vl_arb(smp, ibdev, port); goto bail; case IB_SMP_ATTR_SM_INFO: - if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) { + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) { ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; goto bail; } - if (ibp->port_cap_flags & IB_PORT_SM) { + if (ibp->rvp.port_cap_flags & IB_PORT_SM) { ret = IB_MAD_RESULT_SUCCESS; goto bail; } @@ -2443,12 +2448,6 @@ bail: return ret; } -static void send_handler(struct ib_mad_agent *agent, - struct ib_mad_send_wc *mad_send_wc) -{ - ib_free_send_mad(mad_send_wc->send_buf); -} - static void xmit_wait_timer_func(unsigned long opaque) { struct qib_pportdata *ppd = (struct qib_pportdata *)opaque; @@ -2456,7 +2455,7 @@ static void xmit_wait_timer_func(unsigned long opaque) unsigned long flags; u8 status; - spin_lock_irqsave(&ppd->ibport_data.lock, flags); + spin_lock_irqsave(&ppd->ibport_data.rvp.lock, flags); if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) { status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT); if (status == IB_PMA_SAMPLE_STATUS_DONE) { @@ -2469,74 +2468,35 @@ static void xmit_wait_timer_func(unsigned long opaque) ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd); dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0); done: - spin_unlock_irqrestore(&ppd->ibport_data.lock, flags); + spin_unlock_irqrestore(&ppd->ibport_data.rvp.lock, flags); mod_timer(&ppd->cong_stats.timer, jiffies + HZ); } -int qib_create_agents(struct qib_ibdev *dev) +void qib_notify_create_mad_agent(struct rvt_dev_info *rdi, int port_idx) { - struct qib_devdata *dd = dd_from_dev(dev); - struct ib_mad_agent *agent; - struct qib_ibport *ibp; - int p; - int ret; + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, - NULL, 0, send_handler, - NULL, NULL, 0); - if (IS_ERR(agent)) { - ret = PTR_ERR(agent); - goto err; - } - - /* Initialize xmit_wait structure */ - dd->pport[p].cong_stats.counter = 0; - init_timer(&dd->pport[p].cong_stats.timer); - dd->pport[p].cong_stats.timer.function = xmit_wait_timer_func; - dd->pport[p].cong_stats.timer.data = - (unsigned long)(&dd->pport[p]); - dd->pport[p].cong_stats.timer.expires = 0; - add_timer(&dd->pport[p].cong_stats.timer); - - ibp->send_agent = agent; - } - - return 0; - -err: - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - if (ibp->send_agent) { - agent = ibp->send_agent; - ibp->send_agent = NULL; - ib_unregister_mad_agent(agent); - } - } - - return ret; + /* Initialize xmit_wait structure */ + dd->pport[port_idx].cong_stats.counter = 0; + init_timer(&dd->pport[port_idx].cong_stats.timer); + dd->pport[port_idx].cong_stats.timer.function = xmit_wait_timer_func; + dd->pport[port_idx].cong_stats.timer.data = + (unsigned long)(&dd->pport[port_idx]); + dd->pport[port_idx].cong_stats.timer.expires = 0; + add_timer(&dd->pport[port_idx].cong_stats.timer); } -void qib_free_agents(struct qib_ibdev *dev) +void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx) { - struct qib_devdata *dd = dd_from_dev(dev); - struct ib_mad_agent *agent; - struct qib_ibport *ibp; - int p; - - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - if (ibp->send_agent) { - agent = ibp->send_agent; - ibp->send_agent = NULL; - ib_unregister_mad_agent(agent); - } - if (ibp->sm_ah) { - ib_destroy_ah(&ibp->sm_ah->ibah); - ibp->sm_ah = NULL; - } - if (dd->pport[p].cong_stats.timer.data) - del_timer_sync(&dd->pport[p].cong_stats.timer); - } + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(ibdev, + struct qib_devdata, verbs_dev); + + if (dd->pport[port_idx].cong_stats.timer.data) + del_timer_sync(&dd->pport[port_idx].cong_stats.timer); + + if (dd->pport[port_idx].ibport_data.smi_ah) + ib_destroy_ah(&dd->pport[port_idx].ibport_data.smi_ah->ibah); } diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c deleted file mode 100644 index 34927b700b0e..000000000000 --- a/drivers/infiniband/hw/qib/qib_mmap.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> -#include <linux/errno.h> -#include <asm/pgtable.h> - -#include "qib_verbs.h" - -/** - * qib_release_mmap_info - free mmap info structure - * @ref: a pointer to the kref within struct qib_mmap_info - */ -void qib_release_mmap_info(struct kref *ref) -{ - struct qib_mmap_info *ip = - container_of(ref, struct qib_mmap_info, ref); - struct qib_ibdev *dev = to_idev(ip->context->device); - - spin_lock_irq(&dev->pending_lock); - list_del(&ip->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - - vfree(ip->obj); - kfree(ip); -} - -/* - * open and close keep track of how many times the CQ is mapped, - * to avoid releasing it. - */ -static void qib_vma_open(struct vm_area_struct *vma) -{ - struct qib_mmap_info *ip = vma->vm_private_data; - - kref_get(&ip->ref); -} - -static void qib_vma_close(struct vm_area_struct *vma) -{ - struct qib_mmap_info *ip = vma->vm_private_data; - - kref_put(&ip->ref, qib_release_mmap_info); -} - -static const struct vm_operations_struct qib_vm_ops = { - .open = qib_vma_open, - .close = qib_vma_close, -}; - -/** - * qib_mmap - create a new mmap region - * @context: the IB user context of the process making the mmap() call - * @vma: the VMA to be initialized - * Return zero if the mmap is OK. Otherwise, return an errno. - */ -int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) -{ - struct qib_ibdev *dev = to_idev(context->device); - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long size = vma->vm_end - vma->vm_start; - struct qib_mmap_info *ip, *pp; - int ret = -EINVAL; - - /* - * Search the device's list of objects waiting for a mmap call. - * Normally, this list is very short since a call to create a - * CQ, QP, or SRQ is soon followed by a call to mmap(). - */ - spin_lock_irq(&dev->pending_lock); - list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, - pending_mmaps) { - /* Only the creator is allowed to mmap the object */ - if (context != ip->context || (__u64) offset != ip->offset) - continue; - /* Don't allow a mmap larger than the object. */ - if (size > ip->size) - break; - - list_del_init(&ip->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - - ret = remap_vmalloc_range(vma, ip->obj, 0); - if (ret) - goto done; - vma->vm_ops = &qib_vm_ops; - vma->vm_private_data = ip; - qib_vma_open(vma); - goto done; - } - spin_unlock_irq(&dev->pending_lock); -done: - return ret; -} - -/* - * Allocate information for qib_mmap - */ -struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, - u32 size, - struct ib_ucontext *context, - void *obj) { - struct qib_mmap_info *ip; - - ip = kmalloc(sizeof(*ip), GFP_KERNEL); - if (!ip) - goto bail; - - size = PAGE_ALIGN(size); - - spin_lock_irq(&dev->mmap_offset_lock); - if (dev->mmap_offset == 0) - dev->mmap_offset = PAGE_SIZE; - ip->offset = dev->mmap_offset; - dev->mmap_offset += size; - spin_unlock_irq(&dev->mmap_offset_lock); - - INIT_LIST_HEAD(&ip->pending_mmaps); - ip->size = size; - ip->context = context; - ip->obj = obj; - kref_init(&ip->ref); - -bail: - return ip; -} - -void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, - u32 size, void *obj) -{ - size = PAGE_ALIGN(size); - - spin_lock_irq(&dev->mmap_offset_lock); - if (dev->mmap_offset == 0) - dev->mmap_offset = PAGE_SIZE; - ip->offset = dev->mmap_offset; - dev->mmap_offset += size; - spin_unlock_irq(&dev->mmap_offset_lock); - - ip->size = size; - ip->obj = obj; -} diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c deleted file mode 100644 index 5f53304e8a9b..000000000000 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <rdma/ib_umem.h> -#include <rdma/ib_smi.h> - -#include "qib.h" - -/* Fast memory region */ -struct qib_fmr { - struct ib_fmr ibfmr; - struct qib_mregion mr; /* must be last */ -}; - -static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct qib_fmr, ibfmr); -} - -static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd, - int count) -{ - int m, i = 0; - int rval = 0; - - m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; - for (; i < m; i++) { - mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); - if (!mr->map[i]) - goto bail; - } - mr->mapsz = m; - init_completion(&mr->comp); - /* count returning the ptr to user */ - atomic_set(&mr->refcount, 1); - mr->pd = pd; - mr->max_segs = count; -out: - return rval; -bail: - while (i) - kfree(mr->map[--i]); - rval = -ENOMEM; - goto out; -} - -static void deinit_qib_mregion(struct qib_mregion *mr) -{ - int i = mr->mapsz; - - mr->mapsz = 0; - while (i) - kfree(mr->map[--i]); -} - - -/** - * qib_get_dma_mr - get a DMA memory region - * @pd: protection domain for this memory region - * @acc: access flags - * - * Returns the memory region on success, otherwise returns an errno. - * Note that all DMA addresses should be created via the - * struct ib_dma_mapping_ops functions (see qib_dma.c). - */ -struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct qib_mr *mr = NULL; - struct ib_mr *ret; - int rval; - - if (to_ipd(pd)->user) { - ret = ERR_PTR(-EPERM); - goto bail; - } - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - rval = init_qib_mregion(&mr->mr, pd, 0); - if (rval) { - ret = ERR_PTR(rval); - goto bail; - } - - - rval = qib_alloc_lkey(&mr->mr, 1); - if (rval) { - ret = ERR_PTR(rval); - goto bail_mregion; - } - - mr->mr.access_flags = acc; - ret = &mr->ibmr; -done: - return ret; - -bail_mregion: - deinit_qib_mregion(&mr->mr); -bail: - kfree(mr); - goto done; -} - -static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) -{ - struct qib_mr *mr; - int rval = -ENOMEM; - int m; - - /* Allocate struct plus pointers to first level page tables. */ - m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ; - mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); - if (!mr) - goto bail; - - rval = init_qib_mregion(&mr->mr, pd, count); - if (rval) - goto bail; - - rval = qib_alloc_lkey(&mr->mr, 0); - if (rval) - goto bail_mregion; - mr->ibmr.lkey = mr->mr.lkey; - mr->ibmr.rkey = mr->mr.lkey; -done: - return mr; - -bail_mregion: - deinit_qib_mregion(&mr->mr); -bail: - kfree(mr); - mr = ERR_PTR(rval); - goto done; -} - -/** - * qib_reg_user_mr - register a userspace memory region - * @pd: protection domain for this memory region - * @start: starting userspace address - * @length: length of region to register - * @mr_access_flags: access flags for this memory region - * @udata: unused by the QLogic_IB driver - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt_addr, int mr_access_flags, - struct ib_udata *udata) -{ - struct qib_mr *mr; - struct ib_umem *umem; - struct scatterlist *sg; - int n, m, entry; - struct ib_mr *ret; - - if (length == 0) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - - umem = ib_umem_get(pd->uobject->context, start, length, - mr_access_flags, 0); - if (IS_ERR(umem)) - return (void *) umem; - - n = umem->nmap; - - mr = alloc_mr(n, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - ib_umem_release(umem); - goto bail; - } - - mr->mr.user_base = start; - mr->mr.iova = virt_addr; - mr->mr.length = length; - mr->mr.offset = ib_umem_offset(umem); - mr->mr.access_flags = mr_access_flags; - mr->umem = umem; - - if (is_power_of_2(umem->page_size)) - mr->mr.page_shift = ilog2(umem->page_size); - m = 0; - n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - void *vaddr; - - vaddr = page_address(sg_page(sg)); - if (!vaddr) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - mr->mr.map[m]->segs[n].vaddr = vaddr; - mr->mr.map[m]->segs[n].length = umem->page_size; - n++; - if (n == QIB_SEGSZ) { - m++; - n = 0; - } - } - ret = &mr->ibmr; - -bail: - return ret; -} - -/** - * qib_dereg_mr - unregister and free a memory region - * @ibmr: the memory region to free - * - * Returns 0 on success. - * - * Note that this is called to free MRs created by qib_get_dma_mr() - * or qib_reg_user_mr(). - */ -int qib_dereg_mr(struct ib_mr *ibmr) -{ - struct qib_mr *mr = to_imr(ibmr); - int ret = 0; - unsigned long timeout; - - kfree(mr->pages); - qib_free_lkey(&mr->mr); - - qib_put_mr(&mr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&mr->mr.comp, - 5 * HZ); - if (!timeout) { - qib_get_mr(&mr->mr); - ret = -EBUSY; - goto out; - } - deinit_qib_mregion(&mr->mr); - if (mr->umem) - ib_umem_release(mr->umem); - kfree(mr); -out: - return ret; -} - -/* - * Allocate a memory region usable with the - * IB_WR_REG_MR send work request. - * - * Return the memory region on success, otherwise return an errno. - */ -struct ib_mr *qib_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, - u32 max_num_sg) -{ - struct qib_mr *mr; - - if (mr_type != IB_MR_TYPE_MEM_REG) - return ERR_PTR(-EINVAL); - - mr = alloc_mr(max_num_sg, pd); - if (IS_ERR(mr)) - return (struct ib_mr *)mr; - - mr->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL); - if (!mr->pages) - goto err; - - return &mr->ibmr; - -err: - qib_dereg_mr(&mr->ibmr); - return ERR_PTR(-ENOMEM); -} - -static int qib_set_page(struct ib_mr *ibmr, u64 addr) -{ - struct qib_mr *mr = to_imr(ibmr); - - if (unlikely(mr->npages == mr->mr.max_segs)) - return -ENOMEM; - - mr->pages[mr->npages++] = addr; - - return 0; -} - -int qib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) -{ - struct qib_mr *mr = to_imr(ibmr); - - mr->npages = 0; - - return ib_sg_to_pages(ibmr, sg, sg_nents, qib_set_page); -} - -/** - * qib_alloc_fmr - allocate a fast memory region - * @pd: the protection domain for this memory region - * @mr_access_flags: access flags for this memory region - * @fmr_attr: fast memory region attributes - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct qib_fmr *fmr; - int m; - struct ib_fmr *ret; - int rval = -ENOMEM; - - /* Allocate struct plus pointers to first level page tables. */ - m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ; - fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); - if (!fmr) - goto bail; - - rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages); - if (rval) - goto bail; - - /* - * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & - * rkey. - */ - rval = qib_alloc_lkey(&fmr->mr, 0); - if (rval) - goto bail_mregion; - fmr->ibfmr.rkey = fmr->mr.lkey; - fmr->ibfmr.lkey = fmr->mr.lkey; - /* - * Resources are allocated but no valid mapping (RKEY can't be - * used). - */ - fmr->mr.access_flags = mr_access_flags; - fmr->mr.max_segs = fmr_attr->max_pages; - fmr->mr.page_shift = fmr_attr->page_shift; - - ret = &fmr->ibfmr; -done: - return ret; - -bail_mregion: - deinit_qib_mregion(&fmr->mr); -bail: - kfree(fmr); - ret = ERR_PTR(rval); - goto done; -} - -/** - * qib_map_phys_fmr - set up a fast memory region - * @ibmfr: the fast memory region to set up - * @page_list: the list of pages to associate with the fast memory region - * @list_len: the number of pages to associate with the fast memory region - * @iova: the virtual address of the start of the fast memory region - * - * This may be called from interrupt context. - */ - -int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) -{ - struct qib_fmr *fmr = to_ifmr(ibfmr); - struct qib_lkey_table *rkt; - unsigned long flags; - int m, n, i; - u32 ps; - int ret; - - i = atomic_read(&fmr->mr.refcount); - if (i > 2) - return -EBUSY; - - if (list_len > fmr->mr.max_segs) { - ret = -EINVAL; - goto bail; - } - rkt = &to_idev(ibfmr->device)->lk_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = iova; - fmr->mr.iova = iova; - ps = 1 << fmr->mr.page_shift; - fmr->mr.length = list_len * ps; - m = 0; - n = 0; - for (i = 0; i < list_len; i++) { - fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; - fmr->mr.map[m]->segs[n].length = ps; - if (++n == QIB_SEGSZ) { - m++; - n = 0; - } - } - spin_unlock_irqrestore(&rkt->lock, flags); - ret = 0; - -bail: - return ret; -} - -/** - * qib_unmap_fmr - unmap fast memory regions - * @fmr_list: the list of fast memory regions to unmap - * - * Returns 0 on success. - */ -int qib_unmap_fmr(struct list_head *fmr_list) -{ - struct qib_fmr *fmr; - struct qib_lkey_table *rkt; - unsigned long flags; - - list_for_each_entry(fmr, fmr_list, ibfmr.list) { - rkt = &to_idev(fmr->ibfmr.device)->lk_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = 0; - fmr->mr.iova = 0; - fmr->mr.length = 0; - spin_unlock_irqrestore(&rkt->lock, flags); - } - return 0; -} - -/** - * qib_dealloc_fmr - deallocate a fast memory region - * @ibfmr: the fast memory region to deallocate - * - * Returns 0 on success. - */ -int qib_dealloc_fmr(struct ib_fmr *ibfmr) -{ - struct qib_fmr *fmr = to_ifmr(ibfmr); - int ret = 0; - unsigned long timeout; - - qib_free_lkey(&fmr->mr); - qib_put_mr(&fmr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&fmr->mr.comp, - 5 * HZ); - if (!timeout) { - qib_get_mr(&fmr->mr); - ret = -EBUSY; - goto out; - } - deinit_qib_mregion(&fmr->mr); - kfree(fmr); -out: - return ret; -} - -void mr_rcu_callback(struct rcu_head *list) -{ - struct qib_mregion *mr = container_of(list, struct qib_mregion, list); - - complete(&mr->comp); -} diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 3eff35c2d453..575b737d9ef3 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -34,32 +34,38 @@ #include <linux/err.h> #include <linux/vmalloc.h> -#include <linux/jhash.h> +#include <rdma/rdma_vt.h> #ifdef CONFIG_DEBUG_FS #include <linux/seq_file.h> #endif #include "qib.h" -#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +/* + * mask field which was present in now deleted qib_qpn_table + * is not present in rvt_qpn_table. Defining the same field + * as qpt_mask here instead of adding the mask field to + * rvt_qpn_table. + */ +u16 qpt_mask; -static inline unsigned mk_qpn(struct qib_qpn_table *qpt, - struct qpn_map *map, unsigned off) +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) { - return (map - qpt->map) * BITS_PER_PAGE + off; + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; } -static inline unsigned find_next_offset(struct qib_qpn_table *qpt, - struct qpn_map *map, unsigned off, +static inline unsigned find_next_offset(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off, unsigned n) { - if (qpt->mask) { + if (qpt_mask) { off++; - if (((off & qpt->mask) >> 1) >= n) - off = (off | qpt->mask) + 2; - } else - off = find_next_zero_bit(map->page, BITS_PER_PAGE, off); + if (((off & qpt_mask) >> 1) >= n) + off = (off | qpt_mask) + 2; + } else { + off = find_next_zero_bit(map->page, RVT_BITS_PER_PAGE, off); + } return off; } @@ -100,7 +106,7 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; -static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map, +static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map, gfp_t gfp) { unsigned long page = get_zeroed_page(gfp); @@ -121,12 +127,15 @@ static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map, * Allocate the next available QPN or * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. */ -static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, - enum ib_qp_type type, u8 port, gfp_t gfp) +int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port, gfp_t gfp) { u32 i, offset, max_scan, qpn; - struct qpn_map *map; + struct rvt_qpn_map *map; u32 ret; + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); if (type == IB_QPT_SMI || type == IB_QPT_GSI) { unsigned n; @@ -143,12 +152,12 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, } qpn = qpt->last + 2; - if (qpn >= QPN_MAX) + if (qpn >= RVT_QPN_MAX) qpn = 2; - if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues) - qpn = (qpn | qpt->mask) + 2; - offset = qpn & BITS_PER_PAGE_MASK; - map = &qpt->map[qpn / BITS_PER_PAGE]; + if (qpt_mask && ((qpn & qpt_mask) >> 1) >= dd->n_krcv_queues) + qpn = (qpn | qpt_mask) + 2; + offset = qpn & RVT_BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / RVT_BITS_PER_PAGE]; max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { @@ -173,14 +182,14 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, * We just need to be sure we don't loop * forever. */ - } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX); /* * In order to keep the number of pages allocated to a * minimum, we scan the all existing pages before increasing * the size of the bitmap table. */ if (++i > max_scan) { - if (qpt->nmaps == QPNMAP_ENTRIES) + if (qpt->nmaps == RVT_QPNMAP_ENTRIES) break; map = &qpt->map[qpt->nmaps++]; offset = 0; @@ -200,706 +209,113 @@ bail: return ret; } -static void free_qpn(struct qib_qpn_table *qpt, u32 qpn) -{ - struct qpn_map *map; - - map = qpt->map + qpn / BITS_PER_PAGE; - if (map->page) - clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); -} - -static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn) -{ - return jhash_1word(qpn, dev->qp_rnd) & - (dev->qp_table_size - 1); -} - - -/* - * Put the QP into the hash table. - * The hash table holds a reference to the QP. - */ -static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp) -{ - struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - unsigned long flags; - unsigned n = qpn_hash(dev, qp->ibqp.qp_num); - - atomic_inc(&qp->refcount); - spin_lock_irqsave(&dev->qpt_lock, flags); - - if (qp->ibqp.qp_num == 0) - rcu_assign_pointer(ibp->qp0, qp); - else if (qp->ibqp.qp_num == 1) - rcu_assign_pointer(ibp->qp1, qp); - else { - qp->next = dev->qp_table[n]; - rcu_assign_pointer(dev->qp_table[n], qp); - } - - spin_unlock_irqrestore(&dev->qpt_lock, flags); -} - -/* - * Remove the QP from the table so it can't be found asynchronously by - * the receive interrupt routine. - */ -static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp) -{ - struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - unsigned n = qpn_hash(dev, qp->ibqp.qp_num); - unsigned long flags; - int removed = 1; - - spin_lock_irqsave(&dev->qpt_lock, flags); - - if (rcu_dereference_protected(ibp->qp0, - lockdep_is_held(&dev->qpt_lock)) == qp) { - RCU_INIT_POINTER(ibp->qp0, NULL); - } else if (rcu_dereference_protected(ibp->qp1, - lockdep_is_held(&dev->qpt_lock)) == qp) { - RCU_INIT_POINTER(ibp->qp1, NULL); - } else { - struct qib_qp *q; - struct qib_qp __rcu **qpp; - - removed = 0; - qpp = &dev->qp_table[n]; - for (; (q = rcu_dereference_protected(*qpp, - lockdep_is_held(&dev->qpt_lock))) != NULL; - qpp = &q->next) - if (q == qp) { - RCU_INIT_POINTER(*qpp, - rcu_dereference_protected(qp->next, - lockdep_is_held(&dev->qpt_lock))); - removed = 1; - break; - } - } - - spin_unlock_irqrestore(&dev->qpt_lock, flags); - if (removed) { - synchronize_rcu(); - atomic_dec(&qp->refcount); - } -} - /** * qib_free_all_qps - check for QPs still in use - * @qpt: the QP table to empty - * - * There should not be any QPs still in use. - * Free memory for table. */ -unsigned qib_free_all_qps(struct qib_devdata *dd) +unsigned qib_free_all_qps(struct rvt_dev_info *rdi) { - struct qib_ibdev *dev = &dd->verbs_dev; - unsigned long flags; - struct qib_qp *qp; + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); unsigned n, qp_inuse = 0; for (n = 0; n < dd->num_pports; n++) { struct qib_ibport *ibp = &dd->pport[n].ibport_data; - if (!qib_mcast_tree_empty(ibp)) - qp_inuse++; rcu_read_lock(); - if (rcu_dereference(ibp->qp0)) + if (rcu_dereference(ibp->rvp.qp[0])) qp_inuse++; - if (rcu_dereference(ibp->qp1)) + if (rcu_dereference(ibp->rvp.qp[1])) qp_inuse++; rcu_read_unlock(); } - - spin_lock_irqsave(&dev->qpt_lock, flags); - for (n = 0; n < dev->qp_table_size; n++) { - qp = rcu_dereference_protected(dev->qp_table[n], - lockdep_is_held(&dev->qpt_lock)); - RCU_INIT_POINTER(dev->qp_table[n], NULL); - - for (; qp; qp = rcu_dereference_protected(qp->next, - lockdep_is_held(&dev->qpt_lock))) - qp_inuse++; - } - spin_unlock_irqrestore(&dev->qpt_lock, flags); - synchronize_rcu(); - return qp_inuse; } -/** - * qib_lookup_qpn - return the QP with the given QPN - * @qpt: the QP table - * @qpn: the QP number to look up - * - * The caller is responsible for decrementing the QP reference count - * when done. - */ -struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn) +void qib_notify_qp_reset(struct rvt_qp *qp) { - struct qib_qp *qp = NULL; - - rcu_read_lock(); - if (unlikely(qpn <= 1)) { - if (qpn == 0) - qp = rcu_dereference(ibp->qp0); - else - qp = rcu_dereference(ibp->qp1); - if (qp) - atomic_inc(&qp->refcount); - } else { - struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; - unsigned n = qpn_hash(dev, qpn); - - for (qp = rcu_dereference(dev->qp_table[n]); qp; - qp = rcu_dereference(qp->next)) - if (qp->ibqp.qp_num == qpn) { - atomic_inc(&qp->refcount); - break; - } - } - rcu_read_unlock(); - return qp; -} - -/** - * qib_reset_qp - initialize the QP state to the reset state - * @qp: the QP to reset - * @type: the QP type - */ -static void qib_reset_qp(struct qib_qp *qp, enum ib_qp_type type) -{ - qp->remote_qpn = 0; - qp->qkey = 0; - qp->qp_access_flags = 0; - atomic_set(&qp->s_dma_busy, 0); - qp->s_flags &= QIB_S_SIGNAL_REQ_WR; - qp->s_hdrwords = 0; - qp->s_wqe = NULL; - qp->s_draining = 0; - qp->s_next_psn = 0; - qp->s_last_psn = 0; - qp->s_sending_psn = 0; - qp->s_sending_hpsn = 0; - qp->s_psn = 0; - qp->r_psn = 0; - qp->r_msn = 0; - if (type == IB_QPT_RC) { - qp->s_state = IB_OPCODE_RC_SEND_LAST; - qp->r_state = IB_OPCODE_RC_SEND_LAST; - } else { - qp->s_state = IB_OPCODE_UC_SEND_LAST; - qp->r_state = IB_OPCODE_UC_SEND_LAST; - } - qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - qp->r_nak_state = 0; - qp->r_aflags = 0; - qp->r_flags = 0; - qp->s_head = 0; - qp->s_tail = 0; - qp->s_cur = 0; - qp->s_acked = 0; - qp->s_last = 0; - qp->s_ssn = 1; - qp->s_lsn = 0; - qp->s_mig_state = IB_MIG_MIGRATED; - memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); - qp->r_head_ack_queue = 0; - qp->s_tail_ack_queue = 0; - qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } - qp->r_sge.num_sge = 0; -} - -static void clear_mr_refs(struct qib_qp *qp, int clr_sends) -{ - unsigned n; - - if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) - qib_put_ss(&qp->s_rdma_read_sge); - - qib_put_ss(&qp->r_sge); - - if (clr_sends) { - while (qp->s_last != qp->s_head) { - struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - unsigned i; - - for (i = 0; i < wqe->wr.num_sge; i++) { - struct qib_sge *sge = &wqe->sg_list[i]; - - qib_put_mr(sge->mr); - } - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&to_iah(wqe->ud_wr.ah)->refcount); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - } - if (qp->s_rdma_mr) { - qib_put_mr(qp->s_rdma_mr); - qp->s_rdma_mr = NULL; - } - } - - if (qp->ibqp.qp_type != IB_QPT_RC) - return; + struct qib_qp_priv *priv = qp->priv; - for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { - struct qib_ack_entry *e = &qp->s_ack_queue[n]; - - if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && - e->rdma_sge.mr) { - qib_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } - } + atomic_set(&priv->s_dma_busy, 0); } -/** - * qib_error_qp - put a QP into the error state - * @qp: the QP to put into the error state - * @err: the receive completion error to signal if a RWQE is active - * - * Flushes both send and receive work queues. - * Returns true if last WQE event should be generated. - * The QP r_lock and s_lock should be held and interrupts disabled. - * If we are already in error state, just return. - */ -int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err) +void qib_notify_error_qp(struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); - struct ib_wc wc; - int ret = 0; - - if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) - goto bail; - - qp->state = IB_QPS_ERR; - - if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { - qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); - del_timer(&qp->s_timer); - } - - if (qp->s_flags & QIB_S_ANY_WAIT_SEND) - qp->s_flags &= ~QIB_S_ANY_WAIT_SEND; - spin_lock(&dev->pending_lock); - if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) { - qp->s_flags &= ~QIB_S_ANY_WAIT_IO; - list_del_init(&qp->iowait); + spin_lock(&dev->rdi.pending_lock); + if (!list_empty(&priv->iowait) && !(qp->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + list_del_init(&priv->iowait); } - spin_unlock(&dev->pending_lock); + spin_unlock(&dev->rdi.pending_lock); - if (!(qp->s_flags & QIB_S_BUSY)) { + if (!(qp->s_flags & RVT_S_BUSY)) { qp->s_hdrwords = 0; if (qp->s_rdma_mr) { - qib_put_mr(qp->s_rdma_mr); + rvt_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; } - if (qp->s_tx) { - qib_put_txreq(qp->s_tx); - qp->s_tx = NULL; + if (priv->s_tx) { + qib_put_txreq(priv->s_tx); + priv->s_tx = NULL; } } - - /* Schedule the sending tasklet to drain the send work queue. */ - if (qp->s_last != qp->s_head) - qib_schedule_send(qp); - - clear_mr_refs(qp, 0); - - memset(&wc, 0, sizeof(wc)); - wc.qp = &qp->ibqp; - wc.opcode = IB_WC_RECV; - - if (test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) { - wc.wr_id = qp->r_wr_id; - wc.status = err; - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - } - wc.status = IB_WC_WR_FLUSH_ERR; - - if (qp->r_rq.wq) { - struct qib_rwq *wq; - u32 head; - u32 tail; - - spin_lock(&qp->r_rq.lock); - - /* sanity check pointers before trusting them */ - wq = qp->r_rq.wq; - head = wq->head; - if (head >= qp->r_rq.size) - head = 0; - tail = wq->tail; - if (tail >= qp->r_rq.size) - tail = 0; - while (tail != head) { - wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; - if (++tail >= qp->r_rq.size) - tail = 0; - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - } - wq->tail = tail; - - spin_unlock(&qp->r_rq.lock); - } else if (qp->ibqp.event_handler) - ret = 1; - -bail: - return ret; } -/** - * qib_modify_qp - modify the attributes of a queue pair - * @ibqp: the queue pair who's attributes we're modifying - * @attr: the new attributes - * @attr_mask: the mask of attributes to modify - * @udata: user data for libibverbs.so - * - * Returns 0 on success, otherwise returns an errno. - */ -int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +static int mtu_to_enum(u32 mtu) { - struct qib_ibdev *dev = to_idev(ibqp->device); - struct qib_qp *qp = to_iqp(ibqp); - enum ib_qp_state cur_state, new_state; - struct ib_event ev; - int lastwqe = 0; - int mig = 0; - int ret; - u32 pmtu = 0; /* for gcc warning only */ - - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - - cur_state = attr_mask & IB_QP_CUR_STATE ? - attr->cur_qp_state : qp->state; - new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_UNSPECIFIED)) - goto inval; - - if (attr_mask & IB_QP_AV) { - if (attr->ah_attr.dlid >= QIB_MULTICAST_LID_BASE) - goto inval; - if (qib_check_ah(qp->ibqp.device, &attr->ah_attr)) - goto inval; - } - - if (attr_mask & IB_QP_ALT_PATH) { - if (attr->alt_ah_attr.dlid >= QIB_MULTICAST_LID_BASE) - goto inval; - if (qib_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) - goto inval; - if (attr->alt_pkey_index >= qib_get_npkeys(dd_from_dev(dev))) - goto inval; - } - - if (attr_mask & IB_QP_PKEY_INDEX) - if (attr->pkey_index >= qib_get_npkeys(dd_from_dev(dev))) - goto inval; - - if (attr_mask & IB_QP_MIN_RNR_TIMER) - if (attr->min_rnr_timer > 31) - goto inval; - - if (attr_mask & IB_QP_PORT) - if (qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI || - attr->port_num == 0 || - attr->port_num > ibqp->device->phys_port_cnt) - goto inval; - - if (attr_mask & IB_QP_DEST_QPN) - if (attr->dest_qp_num > QIB_QPN_MASK) - goto inval; - - if (attr_mask & IB_QP_RETRY_CNT) - if (attr->retry_cnt > 7) - goto inval; - - if (attr_mask & IB_QP_RNR_RETRY) - if (attr->rnr_retry > 7) - goto inval; - - /* - * Don't allow invalid path_mtu values. OK to set greater - * than the active mtu (or even the max_cap, if we have tuned - * that to a small mtu. We'll set qp->path_mtu - * to the lesser of requested attribute mtu and active, - * for packetizing messages. - * Note that the QP port has to be set in INIT and MTU in RTR. - */ - if (attr_mask & IB_QP_PATH_MTU) { - struct qib_devdata *dd = dd_from_dev(dev); - int mtu, pidx = qp->port_num - 1; - - mtu = ib_mtu_enum_to_int(attr->path_mtu); - if (mtu == -1) - goto inval; - if (mtu > dd->pport[pidx].ibmtu) { - switch (dd->pport[pidx].ibmtu) { - case 4096: - pmtu = IB_MTU_4096; - break; - case 2048: - pmtu = IB_MTU_2048; - break; - case 1024: - pmtu = IB_MTU_1024; - break; - case 512: - pmtu = IB_MTU_512; - break; - case 256: - pmtu = IB_MTU_256; - break; - default: - pmtu = IB_MTU_2048; - } - } else - pmtu = attr->path_mtu; - } - - if (attr_mask & IB_QP_PATH_MIG_STATE) { - if (attr->path_mig_state == IB_MIG_REARM) { - if (qp->s_mig_state == IB_MIG_ARMED) - goto inval; - if (new_state != IB_QPS_RTS) - goto inval; - } else if (attr->path_mig_state == IB_MIG_MIGRATED) { - if (qp->s_mig_state == IB_MIG_REARM) - goto inval; - if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) - goto inval; - if (qp->s_mig_state == IB_MIG_ARMED) - mig = 1; - } else - goto inval; - } - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - if (attr->max_dest_rd_atomic > QIB_MAX_RDMA_ATOMIC) - goto inval; + int enum_mtu; - switch (new_state) { - case IB_QPS_RESET: - if (qp->state != IB_QPS_RESET) { - qp->state = IB_QPS_RESET; - spin_lock(&dev->pending_lock); - if (!list_empty(&qp->iowait)) - list_del_init(&qp->iowait); - spin_unlock(&dev->pending_lock); - qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - /* Stop the sending work queue and retry timer */ - cancel_work_sync(&qp->s_work); - del_timer_sync(&qp->s_timer); - wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); - if (qp->s_tx) { - qib_put_txreq(qp->s_tx); - qp->s_tx = NULL; - } - remove_qp(dev, qp); - wait_event(qp->wait, !atomic_read(&qp->refcount)); - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - clear_mr_refs(qp, 1); - qib_reset_qp(qp, ibqp->qp_type); - } + switch (mtu) { + case 4096: + enum_mtu = IB_MTU_4096; break; - - case IB_QPS_RTR: - /* Allow event to retrigger if QP set to RTR more than once */ - qp->r_flags &= ~QIB_R_COMM_EST; - qp->state = new_state; + case 2048: + enum_mtu = IB_MTU_2048; break; - - case IB_QPS_SQD: - qp->s_draining = qp->s_last != qp->s_cur; - qp->state = new_state; + case 1024: + enum_mtu = IB_MTU_1024; break; - - case IB_QPS_SQE: - if (qp->ibqp.qp_type == IB_QPT_RC) - goto inval; - qp->state = new_state; + case 512: + enum_mtu = IB_MTU_512; break; - - case IB_QPS_ERR: - lastwqe = qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + case 256: + enum_mtu = IB_MTU_256; break; - default: - qp->state = new_state; - break; - } - - if (attr_mask & IB_QP_PKEY_INDEX) - qp->s_pkey_index = attr->pkey_index; - - if (attr_mask & IB_QP_PORT) - qp->port_num = attr->port_num; - - if (attr_mask & IB_QP_DEST_QPN) - qp->remote_qpn = attr->dest_qp_num; - - if (attr_mask & IB_QP_SQ_PSN) { - qp->s_next_psn = attr->sq_psn & QIB_PSN_MASK; - qp->s_psn = qp->s_next_psn; - qp->s_sending_psn = qp->s_next_psn; - qp->s_last_psn = qp->s_next_psn - 1; - qp->s_sending_hpsn = qp->s_last_psn; - } - - if (attr_mask & IB_QP_RQ_PSN) - qp->r_psn = attr->rq_psn & QIB_PSN_MASK; - - if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->qp_access_flags = attr->qp_access_flags; - - if (attr_mask & IB_QP_AV) { - qp->remote_ah_attr = attr->ah_attr; - qp->s_srate = attr->ah_attr.static_rate; - } - - if (attr_mask & IB_QP_ALT_PATH) { - qp->alt_ah_attr = attr->alt_ah_attr; - qp->s_alt_pkey_index = attr->alt_pkey_index; - } - - if (attr_mask & IB_QP_PATH_MIG_STATE) { - qp->s_mig_state = attr->path_mig_state; - if (mig) { - qp->remote_ah_attr = qp->alt_ah_attr; - qp->port_num = qp->alt_ah_attr.port_num; - qp->s_pkey_index = qp->s_alt_pkey_index; - } - } - - if (attr_mask & IB_QP_PATH_MTU) { - qp->path_mtu = pmtu; - qp->pmtu = ib_mtu_enum_to_int(pmtu); - } - - if (attr_mask & IB_QP_RETRY_CNT) { - qp->s_retry_cnt = attr->retry_cnt; - qp->s_retry = attr->retry_cnt; - } - - if (attr_mask & IB_QP_RNR_RETRY) { - qp->s_rnr_retry_cnt = attr->rnr_retry; - qp->s_rnr_retry = attr->rnr_retry; + enum_mtu = IB_MTU_2048; } - - if (attr_mask & IB_QP_MIN_RNR_TIMER) - qp->r_min_rnr_timer = attr->min_rnr_timer; - - if (attr_mask & IB_QP_TIMEOUT) { - qp->timeout = attr->timeout; - qp->timeout_jiffies = - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL); - } - - if (attr_mask & IB_QP_QKEY) - qp->qkey = attr->qkey; - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->r_max_rd_atomic = attr->max_dest_rd_atomic; - - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) - qp->s_max_rd_atomic = attr->max_rd_atomic; - - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - insert_qp(dev, qp); - - if (lastwqe) { - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); - } - if (mig) { - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = IB_EVENT_PATH_MIG; - qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); - } - ret = 0; - goto bail; - -inval: - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - ret = -EINVAL; - -bail: - return ret; + return enum_mtu; } -int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_qp_init_attr *init_attr) +int qib_get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) { - struct qib_qp *qp = to_iqp(ibqp); + int mtu, pmtu, pidx = qp->port_num - 1; + struct qib_ibdev *verbs_dev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = container_of(verbs_dev, struct qib_devdata, + verbs_dev); + mtu = ib_mtu_enum_to_int(attr->path_mtu); + if (mtu == -1) + return -EINVAL; + + if (mtu > dd->pport[pidx].ibmtu) + pmtu = mtu_to_enum(dd->pport[pidx].ibmtu); + else + pmtu = attr->path_mtu; + return pmtu; +} - attr->qp_state = qp->state; - attr->cur_qp_state = attr->qp_state; - attr->path_mtu = qp->path_mtu; - attr->path_mig_state = qp->s_mig_state; - attr->qkey = qp->qkey; - attr->rq_psn = qp->r_psn & QIB_PSN_MASK; - attr->sq_psn = qp->s_next_psn & QIB_PSN_MASK; - attr->dest_qp_num = qp->remote_qpn; - attr->qp_access_flags = qp->qp_access_flags; - attr->cap.max_send_wr = qp->s_size - 1; - attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; - attr->cap.max_send_sge = qp->s_max_sge; - attr->cap.max_recv_sge = qp->r_rq.max_sge; - attr->cap.max_inline_data = 0; - attr->ah_attr = qp->remote_ah_attr; - attr->alt_ah_attr = qp->alt_ah_attr; - attr->pkey_index = qp->s_pkey_index; - attr->alt_pkey_index = qp->s_alt_pkey_index; - attr->en_sqd_async_notify = 0; - attr->sq_draining = qp->s_draining; - attr->max_rd_atomic = qp->s_max_rd_atomic; - attr->max_dest_rd_atomic = qp->r_max_rd_atomic; - attr->min_rnr_timer = qp->r_min_rnr_timer; - attr->port_num = qp->port_num; - attr->timeout = qp->timeout; - attr->retry_cnt = qp->s_retry_cnt; - attr->rnr_retry = qp->s_rnr_retry_cnt; - attr->alt_port_num = qp->alt_ah_attr.port_num; - attr->alt_timeout = qp->alt_timeout; +int qib_mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu); +} - init_attr->event_handler = qp->ibqp.event_handler; - init_attr->qp_context = qp->ibqp.qp_context; - init_attr->send_cq = qp->ibqp.send_cq; - init_attr->recv_cq = qp->ibqp.recv_cq; - init_attr->srq = qp->ibqp.srq; - init_attr->cap = attr->cap; - if (qp->s_flags & QIB_S_SIGNAL_REQ_WR) - init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; - else - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; - init_attr->qp_type = qp->ibqp.qp_type; - init_attr->port_num = qp->port_num; - return 0; +u32 qib_mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + return ib_mtu_enum_to_int(pmtu); } /** @@ -908,7 +324,7 @@ int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * * Returns the AETH. */ -__be32 qib_compute_aeth(struct qib_qp *qp) +__be32 qib_compute_aeth(struct rvt_qp *qp) { u32 aeth = qp->r_msn & QIB_MSN_MASK; @@ -921,7 +337,7 @@ __be32 qib_compute_aeth(struct qib_qp *qp) } else { u32 min, max, x; u32 credits; - struct qib_rwq *wq = qp->r_rq.wq; + struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; @@ -962,315 +378,63 @@ __be32 qib_compute_aeth(struct qib_qp *qp) return cpu_to_be32(aeth); } -/** - * qib_create_qp - create a queue pair for a device - * @ibpd: the protection domain who's device we create the queue pair for - * @init_attr: the attributes of the queue pair - * @udata: user data for libibverbs.so - * - * Returns the queue pair on success, otherwise returns an errno. - * - * Called by the ib_create_qp() core verbs function. - */ -struct ib_qp *qib_create_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, gfp_t gfp) { - struct qib_qp *qp; - int err; - struct qib_swqe *swq = NULL; - struct qib_ibdev *dev; - struct qib_devdata *dd; - size_t sz; - size_t sg_list_sz; - struct ib_qp *ret; - gfp_t gfp; + struct qib_qp_priv *priv; + priv = kzalloc(sizeof(*priv), gfp); + if (!priv) + return ERR_PTR(-ENOMEM); + priv->owner = qp; - if (init_attr->cap.max_send_sge > ib_qib_max_sges || - init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || - init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO)) - return ERR_PTR(-EINVAL); - - /* GFP_NOIO is applicable in RC QPs only */ - if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO && - init_attr->qp_type != IB_QPT_RC) - return ERR_PTR(-EINVAL); - - gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ? - GFP_NOIO : GFP_KERNEL; - - /* Check receive queue parameters if no SRQ is specified. */ - if (!init_attr->srq) { - if (init_attr->cap.max_recv_sge > ib_qib_max_sges || - init_attr->cap.max_recv_wr > ib_qib_max_qp_wrs) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - if (init_attr->cap.max_send_sge + - init_attr->cap.max_send_wr + - init_attr->cap.max_recv_sge + - init_attr->cap.max_recv_wr == 0) { - ret = ERR_PTR(-EINVAL); - goto bail; - } + priv->s_hdr = kzalloc(sizeof(*priv->s_hdr), gfp); + if (!priv->s_hdr) { + kfree(priv); + return ERR_PTR(-ENOMEM); } + init_waitqueue_head(&priv->wait_dma); + INIT_WORK(&priv->s_work, _qib_do_send); + INIT_LIST_HEAD(&priv->iowait); - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - if (init_attr->port_num == 0 || - init_attr->port_num > ibpd->device->phys_port_cnt) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - case IB_QPT_UC: - case IB_QPT_RC: - case IB_QPT_UD: - sz = sizeof(struct qib_sge) * - init_attr->cap.max_send_sge + - sizeof(struct qib_swqe); - swq = __vmalloc((init_attr->cap.max_send_wr + 1) * sz, - gfp, PAGE_KERNEL); - if (swq == NULL) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - sz = sizeof(*qp); - sg_list_sz = 0; - if (init_attr->srq) { - struct qib_srq *srq = to_isrq(init_attr->srq); - - if (srq->rq.max_sge > 1) - sg_list_sz = sizeof(*qp->r_sg_list) * - (srq->rq.max_sge - 1); - } else if (init_attr->cap.max_recv_sge > 1) - sg_list_sz = sizeof(*qp->r_sg_list) * - (init_attr->cap.max_recv_sge - 1); - qp = kzalloc(sz + sg_list_sz, gfp); - if (!qp) { - ret = ERR_PTR(-ENOMEM); - goto bail_swq; - } - RCU_INIT_POINTER(qp->next, NULL); - qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), gfp); - if (!qp->s_hdr) { - ret = ERR_PTR(-ENOMEM); - goto bail_qp; - } - qp->timeout_jiffies = - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL); - if (init_attr->srq) - sz = 0; - else { - qp->r_rq.size = init_attr->cap.max_recv_wr + 1; - qp->r_rq.max_sge = init_attr->cap.max_recv_sge; - sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + - sizeof(struct qib_rwqe); - if (gfp != GFP_NOIO) - qp->r_rq.wq = vmalloc_user( - sizeof(struct qib_rwq) + - qp->r_rq.size * sz); - else - qp->r_rq.wq = __vmalloc( - sizeof(struct qib_rwq) + - qp->r_rq.size * sz, - gfp, PAGE_KERNEL); - - if (!qp->r_rq.wq) { - ret = ERR_PTR(-ENOMEM); - goto bail_qp; - } - } - - /* - * ib_create_qp() will initialize qp->ibqp - * except for qp->ibqp.qp_num. - */ - spin_lock_init(&qp->r_lock); - spin_lock_init(&qp->s_lock); - spin_lock_init(&qp->r_rq.lock); - atomic_set(&qp->refcount, 0); - init_waitqueue_head(&qp->wait); - init_waitqueue_head(&qp->wait_dma); - init_timer(&qp->s_timer); - qp->s_timer.data = (unsigned long)qp; - INIT_WORK(&qp->s_work, qib_do_send); - INIT_LIST_HEAD(&qp->iowait); - INIT_LIST_HEAD(&qp->rspwait); - qp->state = IB_QPS_RESET; - qp->s_wq = swq; - qp->s_size = init_attr->cap.max_send_wr + 1; - qp->s_max_sge = init_attr->cap.max_send_sge; - if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) - qp->s_flags = QIB_S_SIGNAL_REQ_WR; - dev = to_idev(ibpd->device); - dd = dd_from_dev(dev); - err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type, - init_attr->port_num, gfp); - if (err < 0) { - ret = ERR_PTR(err); - vfree(qp->r_rq.wq); - goto bail_qp; - } - qp->ibqp.qp_num = err; - qp->port_num = init_attr->port_num; - qib_reset_qp(qp, init_attr->qp_type); - break; - - default: - /* Don't support raw QPs */ - ret = ERR_PTR(-ENOSYS); - goto bail; - } - - init_attr->cap.max_inline_data = 0; - - /* - * Return the address of the RWQ as the offset to mmap. - * See qib_mmap() for details. - */ - if (udata && udata->outlen >= sizeof(__u64)) { - if (!qp->r_rq.wq) { - __u64 offset = 0; - - err = ib_copy_to_udata(udata, &offset, - sizeof(offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } else { - u32 s = sizeof(struct qib_rwq) + qp->r_rq.size * sz; - - qp->ip = qib_create_mmap_info(dev, s, - ibpd->uobject->context, - qp->r_rq.wq); - if (!qp->ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - err = ib_copy_to_udata(udata, &(qp->ip->offset), - sizeof(qp->ip->offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } - } - - spin_lock(&dev->n_qps_lock); - if (dev->n_qps_allocated == ib_qib_max_qps) { - spin_unlock(&dev->n_qps_lock); - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - dev->n_qps_allocated++; - spin_unlock(&dev->n_qps_lock); - - if (qp->ip) { - spin_lock_irq(&dev->pending_lock); - list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - - ret = &qp->ibqp; - goto bail; - -bail_ip: - if (qp->ip) - kref_put(&qp->ip->ref, qib_release_mmap_info); - else - vfree(qp->r_rq.wq); - free_qpn(&dev->qpn_table, qp->ibqp.qp_num); -bail_qp: - kfree(qp->s_hdr); - kfree(qp); -bail_swq: - vfree(swq); -bail: - return ret; + return priv; } -/** - * qib_destroy_qp - destroy a queue pair - * @ibqp: the queue pair to destroy - * - * Returns 0 on success. - * - * Note that this can be called while the QP is actively sending or - * receiving! - */ -int qib_destroy_qp(struct ib_qp *ibqp) +void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { - struct qib_qp *qp = to_iqp(ibqp); - struct qib_ibdev *dev = to_idev(ibqp->device); + struct qib_qp_priv *priv = qp->priv; - /* Make sure HW and driver activity is stopped. */ - spin_lock_irq(&qp->s_lock); - if (qp->state != IB_QPS_RESET) { - qp->state = IB_QPS_RESET; - spin_lock(&dev->pending_lock); - if (!list_empty(&qp->iowait)) - list_del_init(&qp->iowait); - spin_unlock(&dev->pending_lock); - qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT); - spin_unlock_irq(&qp->s_lock); - cancel_work_sync(&qp->s_work); - del_timer_sync(&qp->s_timer); - wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); - if (qp->s_tx) { - qib_put_txreq(qp->s_tx); - qp->s_tx = NULL; - } - remove_qp(dev, qp); - wait_event(qp->wait, !atomic_read(&qp->refcount)); - clear_mr_refs(qp, 1); - } else - spin_unlock_irq(&qp->s_lock); + kfree(priv->s_hdr); + kfree(priv); +} - /* all user's cleaned up, mark it available */ - free_qpn(&dev->qpn_table, qp->ibqp.qp_num); - spin_lock(&dev->n_qps_lock); - dev->n_qps_allocated--; - spin_unlock(&dev->n_qps_lock); +void qib_stop_send_queue(struct rvt_qp *qp) +{ + struct qib_qp_priv *priv = qp->priv; - if (qp->ip) - kref_put(&qp->ip->ref, qib_release_mmap_info); - else - vfree(qp->r_rq.wq); - vfree(qp->s_wq); - kfree(qp->s_hdr); - kfree(qp); - return 0; + cancel_work_sync(&priv->s_work); + del_timer_sync(&qp->s_timer); } -/** - * qib_init_qpn_table - initialize the QP number table for a device - * @qpt: the QPN table - */ -void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt) +void qib_quiesce_qp(struct rvt_qp *qp) { - spin_lock_init(&qpt->lock); - qpt->last = 1; /* start with QPN 2 */ - qpt->nmaps = 1; - qpt->mask = dd->qpn_mask; + struct qib_qp_priv *priv = qp->priv; + + wait_event(priv->wait_dma, !atomic_read(&priv->s_dma_busy)); + if (priv->s_tx) { + qib_put_txreq(priv->s_tx); + priv->s_tx = NULL; + } } -/** - * qib_free_qpn_table - free the QP number table for a device - * @qpt: the QPN table - */ -void qib_free_qpn_table(struct qib_qpn_table *qpt) +void qib_flush_qp_waiters(struct rvt_qp *qp) { - int i; + struct qib_qp_priv *priv = qp->priv; + struct qib_ibdev *dev = to_idev(qp->ibqp.device); - for (i = 0; i < ARRAY_SIZE(qpt->map); i++) - if (qpt->map[i].page) - free_page((unsigned long) qpt->map[i].page); + spin_lock(&dev->rdi.pending_lock); + if (!list_empty(&priv->iowait)) + list_del_init(&priv->iowait); + spin_unlock(&dev->rdi.pending_lock); } /** @@ -1280,7 +444,7 @@ void qib_free_qpn_table(struct qib_qpn_table *qpt) * * The QP s_lock should be held. */ -void qib_get_credit(struct qib_qp *qp, u32 aeth) +void qib_get_credit(struct rvt_qp *qp, u32 aeth) { u32 credit = (aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK; @@ -1290,31 +454,70 @@ void qib_get_credit(struct qib_qp *qp, u32 aeth) * honor the credit field. */ if (credit == QIB_AETH_CREDIT_INVAL) { - if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { - qp->s_flags |= QIB_S_UNLIMITED_CREDIT; - if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { - qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + qp->s_flags |= RVT_S_UNLIMITED_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; qib_schedule_send(qp); } } - } else if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) { + } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { /* Compute new LSN (i.e., MSN + credit) */ credit = (aeth + credit_table[credit]) & QIB_MSN_MASK; if (qib_cmp24(credit, qp->s_lsn) > 0) { qp->s_lsn = credit; - if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) { - qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; qib_schedule_send(qp); } } } } +/** + * qib_check_send_wqe - validate wr/wqe + * @qp - The qp + * @wqe - The built wqe + * + * validate wr/wqe. This is called + * prior to inserting the wqe into + * the ring but after the wqe has been + * setup. + * + * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure + */ +int qib_check_send_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + struct rvt_ah *ah; + int ret = 0; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + /* progress hint */ + ret = 1; + break; + default: + break; + } + return ret; +} + #ifdef CONFIG_DEBUG_FS struct qib_qp_iter { struct qib_ibdev *dev; - struct qib_qp *qp; + struct rvt_qp *qp; int n; }; @@ -1340,14 +543,14 @@ int qib_qp_iter_next(struct qib_qp_iter *iter) struct qib_ibdev *dev = iter->dev; int n = iter->n; int ret = 1; - struct qib_qp *pqp = iter->qp; - struct qib_qp *qp; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; - for (; n < dev->qp_table_size; n++) { + for (; n < dev->rdi.qp_dev->qp_table_size; n++) { if (pqp) qp = rcu_dereference(pqp->next); else - qp = rcu_dereference(dev->qp_table[n]); + qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); pqp = qp; if (qp) { iter->qp = qp; @@ -1364,10 +567,11 @@ static const char * const qp_type_str[] = { void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) { - struct qib_swqe *wqe; - struct qib_qp *qp = iter->qp; + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct qib_qp_priv *priv = qp->priv; - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); seq_printf(s, "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n", iter->n, @@ -1377,8 +581,8 @@ void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter) wqe->wr.opcode, qp->s_hdrwords, qp->s_flags, - atomic_read(&qp->s_dma_busy), - !list_empty(&qp->iowait), + atomic_read(&priv->s_dma_busy), + !list_empty(&priv->iowait), qp->timeout, wqe->ssn, qp->s_lsn, diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index e6b7556d5221..9088e26d3ac8 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -40,7 +40,7 @@ static void rc_timeout(unsigned long arg); -static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe, +static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 psn, u32 pmtu) { u32 len; @@ -54,9 +54,9 @@ static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe, return wqe->length - len; } -static void start_timer(struct qib_qp *qp) +static void start_timer(struct rvt_qp *qp) { - qp->s_flags |= QIB_S_TIMER; + qp->s_flags |= RVT_S_TIMER; qp->s_timer.function = rc_timeout; /* 4.096 usec. * (1 << qp->timeout) */ qp->s_timer.expires = jiffies + qp->timeout_jiffies; @@ -74,17 +74,17 @@ static void start_timer(struct qib_qp *qp) * Note that we are in the responder's side of the QP context. * Note the QP s_lock must be held. */ -static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, +static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp, struct qib_other_headers *ohdr, u32 pmtu) { - struct qib_ack_entry *e; + struct rvt_ack_entry *e; u32 hwords; u32 len; u32 bth0; u32 bth2; /* Don't send an ACK if we aren't supposed to. */ - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ @@ -95,7 +95,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, case OP(RDMA_READ_RESPONSE_ONLY): e = &qp->s_ack_queue[qp->s_tail_ack_queue]; if (e->rdma_sge.mr) { - qib_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } /* FALLTHROUGH */ @@ -112,7 +112,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, case OP(ACKNOWLEDGE): /* Check for no next entry in the queue. */ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { - if (qp->s_flags & QIB_S_ACK_PENDING) + if (qp->s_flags & RVT_S_ACK_PENDING) goto normal; goto bail; } @@ -133,7 +133,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, /* Copy SGE state in case we need to resend */ qp->s_rdma_mr = e->rdma_sge.mr; if (qp->s_rdma_mr) - qib_get_mr(qp->s_rdma_mr); + rvt_get_mr(qp->s_rdma_mr); qp->s_ack_rdma_sge.sge = e->rdma_sge; qp->s_ack_rdma_sge.num_sge = 1; qp->s_cur_sge = &qp->s_ack_rdma_sge; @@ -172,7 +172,7 @@ static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp, qp->s_cur_sge = &qp->s_ack_rdma_sge; qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; if (qp->s_rdma_mr) - qib_get_mr(qp->s_rdma_mr); + rvt_get_mr(qp->s_rdma_mr); len = qp->s_ack_rdma_sge.sge.sge_length; if (len > pmtu) len = pmtu; @@ -196,7 +196,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~QIB_S_ACK_PENDING; + qp->s_flags &= ~RVT_S_ACK_PENDING; qp->s_cur_sge = NULL; if (qp->s_nak_state) ohdr->u.aeth = @@ -218,7 +218,7 @@ normal: bail: qp->s_ack_state = OP(ACKNOWLEDGE); - qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING); + qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING); return 0; } @@ -226,63 +226,60 @@ bail: * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) * @qp: a pointer to the QP * + * Assumes the s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_rc_req(struct qib_qp *qp) +int qib_make_rc_req(struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); struct qib_other_headers *ohdr; - struct qib_sge_state *ss; - struct qib_swqe *wqe; + struct rvt_sge_state *ss; + struct rvt_swqe *wqe; u32 hwords; u32 len; u32 bth0; u32 bth2; u32 pmtu = qp->pmtu; char newreq; - unsigned long flags; int ret = 0; int delta; - ohdr = &qp->s_hdr->u.oth; + ohdr = &priv->s_hdr->u.oth; if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr->u.l.oth; - - /* - * The lock is needed to synchronize between the sending tasklet, - * the receive interrupt handler, and timeout resends. - */ - spin_lock_irqsave(&qp->s_lock, flags); + ohdr = &priv->s_hdr->u.l.oth; /* Sending responses has higher priority over sending requests. */ - if ((qp->s_flags & QIB_S_RESP_PENDING) && + if ((qp->s_flags & RVT_S_RESP_PENDING) && qib_make_rc_ack(dev, qp, ohdr, pmtu)) goto done; - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { - if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_dma_busy)) { - qp->s_flags |= QIB_S_WAIT_DMA; + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done; } - if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) goto bail; if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) { if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { - qp->s_flags |= QIB_S_WAIT_PSN; + qp->s_flags |= RVT_S_WAIT_PSN; goto bail; } qp->s_sending_psn = qp->s_psn; @@ -294,10 +291,10 @@ int qib_make_rc_req(struct qib_qp *qp) bth0 = 0; /* Send a request. */ - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); switch (qp->s_state) { default: - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* * Resend an old request or start a new one. @@ -317,11 +314,11 @@ int qib_make_rc_req(struct qib_qp *qp) */ if ((wqe->wr.send_flags & IB_SEND_FENCE) && qp->s_num_rd_atomic) { - qp->s_flags |= QIB_S_WAIT_FENCE; + qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } - wqe->psn = qp->s_next_psn; newreq = 1; + qp->s_psn = wqe->psn; } /* * Note that we have to be careful not to modify the @@ -335,14 +332,12 @@ int qib_make_rc_req(struct qib_qp *qp) case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: /* If no credit, return. */ - if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { - qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } - wqe->lpsn = wqe->psn; if (len > pmtu) { - wqe->lpsn += (len - 1) / pmtu; qp->s_state = OP(SEND_FIRST); len = pmtu; break; @@ -363,14 +358,14 @@ int qib_make_rc_req(struct qib_qp *qp) break; case IB_WR_RDMA_WRITE: - if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; /* FALLTHROUGH */ case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ - if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) && + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { - qp->s_flags |= QIB_S_WAIT_SSN_CREDIT; + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } @@ -380,9 +375,7 @@ int qib_make_rc_req(struct qib_qp *qp) cpu_to_be32(wqe->rdma_wr.rkey); ohdr->u.rc.reth.length = cpu_to_be32(len); hwords += sizeof(struct ib_reth) / sizeof(u32); - wqe->lpsn = wqe->psn; if (len > pmtu) { - wqe->lpsn += (len - 1) / pmtu; qp->s_state = OP(RDMA_WRITE_FIRST); len = pmtu; break; @@ -411,19 +404,12 @@ int qib_make_rc_req(struct qib_qp *qp) if (newreq) { if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { - qp->s_flags |= QIB_S_WAIT_RDMAR; + qp->s_flags |= RVT_S_WAIT_RDMAR; goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* - * Adjust s_next_psn to count the - * expected number of responses. - */ - if (len > pmtu) - qp->s_next_psn += (len - 1) / pmtu; - wqe->lpsn = qp->s_next_psn++; } ohdr->u.rc.reth.vaddr = @@ -449,13 +435,12 @@ int qib_make_rc_req(struct qib_qp *qp) if (newreq) { if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { - qp->s_flags |= QIB_S_WAIT_RDMAR; + qp->s_flags |= RVT_S_WAIT_RDMAR; goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - wqe->lpsn = wqe->psn; } if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { qp->s_state = OP(COMPARE_SWAP); @@ -498,11 +483,8 @@ int qib_make_rc_req(struct qib_qp *qp) } if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_psn = wqe->lpsn + 1; - else { + else qp->s_psn++; - if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; - } break; case OP(RDMA_READ_RESPONSE_FIRST): @@ -522,8 +504,6 @@ int qib_make_rc_req(struct qib_qp *qp) /* FALLTHROUGH */ case OP(SEND_MIDDLE): bth2 = qp->s_psn++ & QIB_PSN_MASK; - if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { @@ -563,8 +543,6 @@ int qib_make_rc_req(struct qib_qp *qp) /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): bth2 = qp->s_psn++ & QIB_PSN_MASK; - if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { @@ -618,9 +596,9 @@ int qib_make_rc_req(struct qib_qp *qp) delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8; if (delta && delta % QIB_PSN_CREDIT == 0) bth2 |= IB_BTH_REQ_ACK; - if (qp->s_flags & QIB_S_SEND_ONE) { - qp->s_flags &= ~QIB_S_SEND_ONE; - qp->s_flags |= QIB_S_WAIT_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; bth2 |= IB_BTH_REQ_ACK; } qp->s_len -= len; @@ -629,13 +607,9 @@ int qib_make_rc_req(struct qib_qp *qp) qp->s_cur_size = len; qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2); done: - ret = 1; - goto unlock; - + return 1; bail: - qp->s_flags &= ~QIB_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); + qp->s_flags &= ~RVT_S_BUSY; return ret; } @@ -647,7 +621,7 @@ unlock: * Note that RDMA reads and atomics are handled in the * send side QP state and tasklet. */ -void qib_send_rc_ack(struct qib_qp *qp) +void qib_send_rc_ack(struct rvt_qp *qp) { struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); @@ -665,11 +639,11 @@ void qib_send_rc_ack(struct qib_qp *qp) spin_lock_irqsave(&qp->s_lock, flags); - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto unlock; /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ - if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt) + if ((qp->s_flags & RVT_S_RESP_PENDING) || qp->s_rdma_ack_cnt) goto queue_ack; /* Construct the header with s_lock held so APM doesn't change it. */ @@ -758,9 +732,9 @@ void qib_send_rc_ack(struct qib_qp *qp) goto done; queue_ack: - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { - ibp->n_rc_qacks++; - qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING; + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + this_cpu_inc(*ibp->rvp.rc_qacks); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; qp->s_nak_state = qp->r_nak_state; qp->s_ack_psn = qp->r_ack_psn; @@ -782,10 +756,10 @@ done: * for the given QP. * Called at interrupt level with the QP s_lock held. */ -static void reset_psn(struct qib_qp *qp, u32 psn) +static void reset_psn(struct rvt_qp *qp, u32 psn) { u32 n = qp->s_acked; - struct qib_swqe *wqe = get_swqe_ptr(qp, n); + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); u32 opcode; qp->s_cur = n; @@ -808,7 +782,7 @@ static void reset_psn(struct qib_qp *qp, u32 psn) n = 0; if (n == qp->s_tail) break; - wqe = get_swqe_ptr(qp, n); + wqe = rvt_get_swqe_ptr(qp, n); diff = qib_cmp24(psn, wqe->psn); if (diff < 0) break; @@ -854,22 +828,22 @@ static void reset_psn(struct qib_qp *qp, u32 psn) done: qp->s_psn = psn; /* - * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer + * Set RVT_S_WAIT_PSN as qib_rc_complete() may start the timer * asynchronously before the send tasklet can get scheduled. * Doing it in qib_make_rc_req() is too late. */ if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) && (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) - qp->s_flags |= QIB_S_WAIT_PSN; + qp->s_flags |= RVT_S_WAIT_PSN; } /* * Back up requester to resend the last un-ACKed request. * The QP r_lock and s_lock should be held and interrupts disabled. */ -static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) +static void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait) { - struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked); + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct qib_ibport *ibp; if (qp->s_retry == 0) { @@ -878,7 +852,7 @@ static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else /* XXX need to handle delayed completion */ return; @@ -887,15 +861,15 @@ static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) ibp = to_iport(qp->ibqp.device, qp->port_num); if (wqe->wr.opcode == IB_WR_RDMA_READ) - ibp->n_rc_resends++; + ibp->rvp.n_rc_resends++; else - ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; - qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | - QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN | - QIB_S_WAIT_ACK); + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK); if (wait) - qp->s_flags |= QIB_S_SEND_ONE; + qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); } @@ -904,16 +878,16 @@ static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait) */ static void rc_timeout(unsigned long arg) { - struct qib_qp *qp = (struct qib_qp *)arg; + struct rvt_qp *qp = (struct rvt_qp *)arg; struct qib_ibport *ibp; unsigned long flags; spin_lock_irqsave(&qp->r_lock, flags); spin_lock(&qp->s_lock); - if (qp->s_flags & QIB_S_TIMER) { + if (qp->s_flags & RVT_S_TIMER) { ibp = to_iport(qp->ibqp.device, qp->port_num); - ibp->n_rc_timeouts++; - qp->s_flags &= ~QIB_S_TIMER; + ibp->rvp.n_rc_timeouts++; + qp->s_flags &= ~RVT_S_TIMER; del_timer(&qp->s_timer); qib_restart_rc(qp, qp->s_last_psn + 1, 1); qib_schedule_send(qp); @@ -927,12 +901,12 @@ static void rc_timeout(unsigned long arg) */ void qib_rc_rnr_retry(unsigned long arg) { - struct qib_qp *qp = (struct qib_qp *)arg; + struct rvt_qp *qp = (struct rvt_qp *)arg; unsigned long flags; spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_flags & QIB_S_WAIT_RNR) { - qp->s_flags &= ~QIB_S_WAIT_RNR; + if (qp->s_flags & RVT_S_WAIT_RNR) { + qp->s_flags &= ~RVT_S_WAIT_RNR; del_timer(&qp->s_timer); qib_schedule_send(qp); } @@ -943,14 +917,14 @@ void qib_rc_rnr_retry(unsigned long arg) * Set qp->s_sending_psn to the next PSN after the given one. * This would be psn+1 except when RDMA reads are present. */ -static void reset_sending_psn(struct qib_qp *qp, u32 psn) +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { - struct qib_swqe *wqe; + struct rvt_swqe *wqe; u32 n = qp->s_last; /* Find the work request corresponding to the given PSN. */ for (;;) { - wqe = get_swqe_ptr(qp, n); + wqe = rvt_get_swqe_ptr(qp, n); if (qib_cmp24(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_sending_psn = wqe->lpsn + 1; @@ -968,16 +942,16 @@ static void reset_sending_psn(struct qib_qp *qp, u32 psn) /* * This should be called with the QP s_lock held and interrupts disabled. */ -void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) +void qib_rc_send_complete(struct rvt_qp *qp, struct qib_ib_header *hdr) { struct qib_other_headers *ohdr; - struct qib_swqe *wqe; + struct rvt_swqe *wqe; struct ib_wc wc; unsigned i; u32 opcode; u32 psn; - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; /* Find out where the BTH is */ @@ -1002,22 +976,30 @@ void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) * there are still requests that haven't been acked. */ if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && - !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) && - (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) start_timer(qp); while (qp->s_last != qp->s_acked) { - wqe = get_swqe_ptr(qp, qp->s_last); + u32 s_last; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); for (i = 0; i < wqe->wr.num_sge; i++) { - struct qib_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); } /* Post a send completion queue entry if requested. */ - if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; @@ -1025,25 +1007,23 @@ void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr) wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; wc.byte_len = wqe->length; wc.qp = &qp->ibqp; - qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; } /* * If we were waiting for sends to complete before resending, * and they are now complete, restart sending. */ - if (qp->s_flags & QIB_S_WAIT_PSN && + if (qp->s_flags & RVT_S_WAIT_PSN && qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - qp->s_flags &= ~QIB_S_WAIT_PSN; + qp->s_flags &= ~RVT_S_WAIT_PSN; qp->s_sending_psn = qp->s_psn; qp->s_sending_hpsn = qp->s_psn - 1; qib_schedule_send(qp); } } -static inline void update_last_psn(struct qib_qp *qp, u32 psn) +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) { qp->s_last_psn = psn; } @@ -1053,8 +1033,8 @@ static inline void update_last_psn(struct qib_qp *qp, u32 psn) * This is similar to qib_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct qib_swqe *do_rc_completion(struct qib_qp *qp, - struct qib_swqe *wqe, +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, struct qib_ibport *ibp) { struct ib_wc wc; @@ -1067,13 +1047,21 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, */ if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + u32 s_last; + for (i = 0; i < wqe->wr.num_sge; i++) { - struct qib_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); } + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); /* Post a send completion queue entry if requested. */ - if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; @@ -1081,12 +1069,10 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode]; wc.byte_len = wqe->length; wc.qp = &qp->ibqp; - qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; } else - ibp->n_rc_delayed_comp++; + this_cpu_inc(*ibp->rvp.rc_delayed_comp); qp->s_retry = qp->s_retry_cnt; update_last_psn(qp, wqe->lpsn); @@ -1100,7 +1086,7 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, if (++qp->s_cur >= qp->s_size) qp->s_cur = 0; qp->s_acked = qp->s_cur; - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); if (qp->s_acked != qp->s_tail) { qp->s_state = OP(SEND_LAST); qp->s_psn = wqe->psn; @@ -1110,7 +1096,7 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, qp->s_acked = 0; if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) qp->s_draining = 0; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } return wqe; } @@ -1126,19 +1112,19 @@ static struct qib_swqe *do_rc_completion(struct qib_qp *qp, * Called at interrupt level with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, +static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, struct qib_ctxtdata *rcd) { struct qib_ibport *ibp; enum ib_wc_status status; - struct qib_swqe *wqe; + struct rvt_swqe *wqe; int ret = 0; u32 ack_psn; int diff; /* Remove QP from retry timer */ - if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { - qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); del_timer(&qp->s_timer); } @@ -1151,7 +1137,7 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, ack_psn = psn; if (aeth >> 29) ack_psn--; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); ibp = to_iport(qp->ibqp.device, qp->port_num); /* @@ -1186,11 +1172,11 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { /* Retry this request. */ - if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) { - qp->r_flags |= QIB_R_RDMAR_SEQ; + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; qib_restart_rc(qp, qp->s_last_psn + 1, 0); if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_SEND; + qp->r_flags |= RVT_R_RSP_SEND; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); @@ -1213,14 +1199,14 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { qp->s_num_rd_atomic--; /* Restart sending task if fence is complete */ - if ((qp->s_flags & QIB_S_WAIT_FENCE) && + if ((qp->s_flags & RVT_S_WAIT_FENCE) && !qp->s_num_rd_atomic) { - qp->s_flags &= ~(QIB_S_WAIT_FENCE | - QIB_S_WAIT_ACK); + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); qib_schedule_send(qp); - } else if (qp->s_flags & QIB_S_WAIT_RDMAR) { - qp->s_flags &= ~(QIB_S_WAIT_RDMAR | - QIB_S_WAIT_ACK); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); qib_schedule_send(qp); } } @@ -1231,7 +1217,7 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, switch (aeth >> 29) { case 0: /* ACK */ - ibp->n_rc_acks++; + this_cpu_inc(*ibp->rvp.rc_acks); if (qp->s_acked != qp->s_tail) { /* * We are expecting more ACKs so @@ -1248,8 +1234,8 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, qp->s_state = OP(SEND_LAST); qp->s_psn = psn + 1; } - if (qp->s_flags & QIB_S_WAIT_ACK) { - qp->s_flags &= ~QIB_S_WAIT_ACK; + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; qib_schedule_send(qp); } qib_get_credit(qp, aeth); @@ -1260,10 +1246,10 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, goto bail; case 1: /* RNR NAK */ - ibp->n_rnr_naks++; + ibp->rvp.n_rnr_naks++; if (qp->s_acked == qp->s_tail) goto bail; - if (qp->s_flags & QIB_S_WAIT_RNR) + if (qp->s_flags & RVT_S_WAIT_RNR) goto bail; if (qp->s_rnr_retry == 0) { status = IB_WC_RNR_RETRY_EXC_ERR; @@ -1275,12 +1261,12 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, /* The last valid PSN is the previous PSN. */ update_last_psn(qp, psn - 1); - ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; + ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK; reset_psn(qp, psn); - qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK); - qp->s_flags |= QIB_S_WAIT_RNR; + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + qp->s_flags |= RVT_S_WAIT_RNR; qp->s_timer.function = qib_rc_rnr_retry; qp->s_timer.expires = jiffies + usecs_to_jiffies( ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) & @@ -1296,7 +1282,7 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, switch ((aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ - ibp->n_seq_naks++; + ibp->rvp.n_seq_naks++; /* * Back up to the responder's expected PSN. * Note that we might get a NAK in the middle of an @@ -1309,21 +1295,21 @@ static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode, case 1: /* Invalid Request */ status = IB_WC_REM_INV_REQ_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; goto class_b; case 2: /* Remote Access Error */ status = IB_WC_REM_ACCESS_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; goto class_b; case 3: /* Remote Operation Error */ status = IB_WC_REM_OP_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { qib_send_complete(qp, wqe, status); - qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1349,18 +1335,18 @@ bail: * We have seen an out of sequence RDMA read middle or last packet. * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. */ -static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn, +static void rdma_seq_err(struct rvt_qp *qp, struct qib_ibport *ibp, u32 psn, struct qib_ctxtdata *rcd) { - struct qib_swqe *wqe; + struct rvt_swqe *wqe; /* Remove QP from retry timer */ - if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) { - qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR); + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); del_timer(&qp->s_timer); } - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); while (qib_cmp24(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || @@ -1370,11 +1356,11 @@ static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn, wqe = do_rc_completion(qp, wqe, ibp); } - ibp->n_rdma_seq++; - qp->r_flags |= QIB_R_RDMAR_SEQ; + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; qib_restart_rc(qp, qp->s_last_psn + 1, 0); if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_SEND; + qp->r_flags |= RVT_R_RSP_SEND; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } @@ -1399,12 +1385,12 @@ static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn, static void qib_rc_rcv_resp(struct qib_ibport *ibp, struct qib_other_headers *ohdr, void *data, u32 tlen, - struct qib_qp *qp, + struct rvt_qp *qp, u32 opcode, u32 psn, u32 hdrsize, u32 pmtu, struct qib_ctxtdata *rcd) { - struct qib_swqe *wqe; + struct rvt_swqe *wqe; struct qib_pportdata *ppd = ppd_from_ibp(ibp); enum ib_wc_status status; unsigned long flags; @@ -1425,7 +1411,7 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, * If send tasklet not running attempt to progress * SDMA queue. */ - if (!(qp->s_flags & QIB_S_BUSY)) { + if (!(qp->s_flags & RVT_S_BUSY)) { /* Acquire SDMA Lock */ spin_lock_irqsave(&ppd->sdma_lock, flags); /* Invoke sdma make progress */ @@ -1437,11 +1423,12 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, } spin_lock_irqsave(&qp->s_lock, flags); - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto ack_done; /* Ignore invalid responses. */ - if (qib_cmp24(psn, qp->s_next_psn) >= 0) + smp_read_barrier_depends(); /* see post_one_send */ + if (qib_cmp24(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0) goto ack_done; /* Ignore duplicate responses. */ @@ -1460,15 +1447,15 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, * Skip everything other than the PSN we expect, if we are waiting * for a reply to a restarted RDMA read or atomic op. */ - if (qp->r_flags & QIB_R_RDMAR_SEQ) { + if (qp->r_flags & RVT_R_RDMAR_SEQ) { if (qib_cmp24(psn, qp->s_last_psn + 1) != 0) goto ack_done; - qp->r_flags &= ~QIB_R_RDMAR_SEQ; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; } if (unlikely(qp->s_acked == qp->s_tail)) goto ack_done; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); status = IB_WC_SUCCESS; switch (opcode) { @@ -1487,7 +1474,7 @@ static void qib_rc_rcv_resp(struct qib_ibport *ibp, opcode != OP(RDMA_READ_RESPONSE_FIRST)) goto ack_done; hdrsize += 4; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) goto ack_op_err; /* @@ -1515,10 +1502,10 @@ read_middle: * We got a response so update the timeout. * 4.096 usec. * (1 << qp->timeout) */ - qp->s_flags |= QIB_S_TIMER; + qp->s_flags |= RVT_S_TIMER; mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); - if (qp->s_flags & QIB_S_WAIT_ACK) { - qp->s_flags &= ~QIB_S_WAIT_ACK; + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; qib_schedule_send(qp); } @@ -1553,7 +1540,7 @@ read_middle: * have to be careful to copy the data to the right * location. */ - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, wqe, psn, pmtu); goto read_last; @@ -1598,7 +1585,7 @@ ack_len_err: ack_err: if (qp->s_last == qp->s_acked) { qib_send_complete(qp, wqe, status); - qib_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1623,14 +1610,14 @@ bail: */ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, void *data, - struct qib_qp *qp, + struct rvt_qp *qp, u32 opcode, u32 psn, int diff, struct qib_ctxtdata *rcd) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct qib_ack_entry *e; + struct rvt_ack_entry *e; unsigned long flags; u8 i, prev; int old_req; @@ -1642,7 +1629,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, * Don't queue the NAK if we already sent one. */ if (!qp->r_nak_state) { - ibp->n_rc_seqnak++; + ibp->rvp.n_rc_seqnak++; qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ qp->r_ack_psn = qp->r_psn; @@ -1652,7 +1639,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, * Otherwise, we end up propagating congestion. */ if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_NAK; + qp->r_flags |= RVT_R_RSP_NAK; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } @@ -1678,7 +1665,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, */ e = NULL; old_req = 1; - ibp->n_rc_dupreq++; + ibp->rvp.n_rc_dupreq++; spin_lock_irqsave(&qp->s_lock, flags); @@ -1732,7 +1719,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, if (unlikely(offset + len != e->rdma_sge.sge_length)) goto unlock_done; if (e->rdma_sge.mr) { - qib_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } if (len != 0) { @@ -1740,7 +1727,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, u64 vaddr = be64_to_cpu(reth->vaddr); int ok; - ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) goto unlock_done; @@ -1791,7 +1778,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, * which doesn't accept a RDMA read response or atomic * response as an ACK for earlier SENDs or RDMA writes. */ - if (!(qp->s_flags & QIB_S_RESP_PENDING)) { + if (!(qp->s_flags & RVT_S_RESP_PENDING)) { spin_unlock_irqrestore(&qp->s_lock, flags); qp->r_nak_state = 0; qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; @@ -1805,7 +1792,7 @@ static int qib_rc_rcv_error(struct qib_other_headers *ohdr, break; } qp->s_ack_state = OP(ACKNOWLEDGE); - qp->s_flags |= QIB_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; qp->r_nak_state = 0; qib_schedule_send(qp); @@ -1818,13 +1805,13 @@ send_ack: return 0; } -void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err) +void qib_rc_error(struct rvt_qp *qp, enum ib_wc_status err) { unsigned long flags; int lastwqe; spin_lock_irqsave(&qp->s_lock, flags); - lastwqe = qib_error_qp(qp, err); + lastwqe = rvt_error_qp(qp, err); spin_unlock_irqrestore(&qp->s_lock, flags); if (lastwqe) { @@ -1837,7 +1824,7 @@ void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err) } } -static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n) +static inline void qib_update_ack_queue(struct rvt_qp *qp, unsigned n) { unsigned next; @@ -1862,7 +1849,7 @@ static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n) * Called at interrupt level. */ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp) + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) { struct qib_ibport *ibp = &rcd->ppd->ibport_data; struct qib_other_headers *ohdr; @@ -1948,8 +1935,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, break; } - if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { - qp->r_flags |= QIB_R_COMM_EST; + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) { + qp->r_flags |= RVT_R_COMM_EST; if (qp->ibqp.event_handler) { struct ib_event ev; @@ -2026,9 +2013,9 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; qib_copy_sge(&qp->r_sge, data, tlen, 1); - qib_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); qp->r_msn++; - if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) break; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -2047,7 +2034,7 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED)) != 0); break; @@ -2069,7 +2056,7 @@ send_last: int ok; /* Check rkey & NAK */ - ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) goto nack_acc; @@ -2096,7 +2083,7 @@ send_last: goto send_last; case OP(RDMA_READ_REQUEST): { - struct qib_ack_entry *e; + struct rvt_ack_entry *e; u32 len; u8 next; @@ -2114,7 +2101,7 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - qib_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } reth = &ohdr->u.rc.reth; @@ -2125,7 +2112,7 @@ send_last: int ok; /* Check rkey & NAK */ - ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) goto nack_acc_unlck; @@ -2157,7 +2144,7 @@ send_last: qp->r_head_ack_queue = next; /* Schedule the send tasklet. */ - qp->s_flags |= QIB_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; qib_schedule_send(qp); goto sunlock; @@ -2166,7 +2153,7 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { struct ib_atomic_eth *ateth; - struct qib_ack_entry *e; + struct rvt_ack_entry *e; u64 vaddr; atomic64_t *maddr; u64 sdata; @@ -2186,7 +2173,7 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - qib_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } ateth = &ohdr->u.atomic_eth; @@ -2196,7 +2183,7 @@ send_last: goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); /* Check rkey & NAK */ - if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), vaddr, rkey, IB_ACCESS_REMOTE_ATOMIC))) goto nack_acc_unlck; @@ -2208,7 +2195,7 @@ send_last: (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, be64_to_cpu(ateth->compare_data), sdata); - qib_put_mr(qp->r_sge.sge.mr); + rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; e->opcode = opcode; e->sent = 0; @@ -2221,7 +2208,7 @@ send_last: qp->r_head_ack_queue = next; /* Schedule the send tasklet. */ - qp->s_flags |= QIB_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; qib_schedule_send(qp); goto sunlock; @@ -2245,7 +2232,7 @@ rnr_nak: qp->r_ack_psn = qp->r_psn; /* Queue RNR NAK for later */ if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_NAK; + qp->r_flags |= RVT_R_RSP_NAK; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } @@ -2257,7 +2244,7 @@ nack_op_err: qp->r_ack_psn = qp->r_psn; /* Queue NAK for later */ if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_NAK; + qp->r_flags |= RVT_R_RSP_NAK; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } @@ -2271,7 +2258,7 @@ nack_inv: qp->r_ack_psn = qp->r_psn; /* Queue NAK for later */ if (list_empty(&qp->rspwait)) { - qp->r_flags |= QIB_R_RSP_NAK; + qp->r_flags |= RVT_R_RSP_NAK; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index b1aa21bdd484..a5f07a64b228 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -79,16 +79,16 @@ const u32 ib_qib_rnr_table[32] = { * Validate a RWQE and fill in the SGE state. * Return 1 if OK. */ -static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe) +static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) { int i, j, ret; struct ib_wc wc; - struct qib_lkey_table *rkt; - struct qib_pd *pd; - struct qib_sge_state *ss; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; - rkt = &to_idev(qp->ibqp.device)->lk_table; - pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); ss = &qp->r_sge; ss->sg_list = qp->r_sg_list; qp->r_len = 0; @@ -96,7 +96,7 @@ static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe) if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if (!qib_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; @@ -109,9 +109,9 @@ static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe) bad_lkey: while (j) { - struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); } ss->num_sge = 0; memset(&wc, 0, sizeof(wc)); @@ -120,7 +120,7 @@ bad_lkey: wc.opcode = IB_WC_RECV; wc.qp = &qp->ibqp; /* Signal solicited completion event. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); ret = 0; bail: return ret; @@ -136,19 +136,19 @@ bail: * * Can be called from interrupt level. */ -int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) +int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only) { unsigned long flags; - struct qib_rq *rq; - struct qib_rwq *wq; - struct qib_srq *srq; - struct qib_rwqe *wqe; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; int ret; if (qp->ibqp.srq) { - srq = to_isrq(qp->ibqp.srq); + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = &srq->rq; } else { @@ -158,7 +158,7 @@ int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) } spin_lock_irqsave(&rq->lock, flags); - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { ret = 0; goto unlock; } @@ -174,7 +174,7 @@ int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) } /* Make sure entry is read after head index is read. */ smp_rmb(); - wqe = get_rwqe_ptr(rq, tail); + wqe = rvt_get_rwqe_ptr(rq, tail); /* * Even though we update the tail index in memory, the verbs * consumer is not supposed to post more entries until a @@ -190,7 +190,7 @@ int qib_get_rwqe(struct qib_qp *qp, int wr_id_only) qp->r_wr_id = wqe->wr_id; ret = 1; - set_bit(QIB_R_WRID_VALID, &qp->r_aflags); + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); if (handler) { u32 n; @@ -227,7 +227,7 @@ bail: * Switch to alternate path. * The QP s_lock should be held and interrupts disabled. */ -void qib_migrate_qp(struct qib_qp *qp) +void qib_migrate_qp(struct rvt_qp *qp) { struct ib_event ev; @@ -266,7 +266,7 @@ static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) * The s_lock will be acquired around the qib_migrate_qp() call. */ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, struct qib_qp *qp, u32 bth0) + int has_grh, struct rvt_qp *qp, u32 bth0) { __be64 guid; unsigned long flags; @@ -279,7 +279,8 @@ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) goto err; guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + if (!gid_ok(&hdr->u.l.grh.dgid, + ibp->rvp.gid_prefix, guid)) goto err; if (!gid_ok(&hdr->u.l.grh.sgid, qp->alt_ah_attr.grh.dgid.global.subnet_prefix, @@ -311,7 +312,8 @@ int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, goto err; guid = get_sguid(ibp, qp->remote_ah_attr.grh.sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + if (!gid_ok(&hdr->u.l.grh.dgid, + ibp->rvp.gid_prefix, guid)) goto err; if (!gid_ok(&hdr->u.l.grh.sgid, qp->remote_ah_attr.grh.dgid.global.subnet_prefix, @@ -353,12 +355,15 @@ err: * receive interrupts since this is a connected protocol and all packets * will pass through here. */ -static void qib_ruc_loopback(struct qib_qp *sqp) +static void qib_ruc_loopback(struct rvt_qp *sqp) { struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct qib_qp *qp; - struct qib_swqe *wqe; - struct qib_sge *sge; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; unsigned long flags; struct ib_wc wc; u64 sdata; @@ -367,29 +372,33 @@ static void qib_ruc_loopback(struct qib_qp *sqp) int release; int ret; + rcu_read_lock(); /* * Note that we check the responder QP state after * checking the requester's state. */ - qp = qib_lookup_qpn(ibp, sqp->remote_qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn); + if (!qp) + goto done; spin_lock_irqsave(&sqp->s_lock, flags); /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT)) || - !(ib_qib_state_ops[sqp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) goto unlock; - sqp->s_flags |= QIB_S_BUSY; + sqp->s_flags |= RVT_S_BUSY; again: - if (sqp->s_last == sqp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) goto clr_busy; - wqe = get_swqe_ptr(sqp, sqp->s_last); + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); /* Return if it is not OK to start a new work reqeust. */ - if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_NEXT_SEND_OK)) { - if (!(ib_qib_state_ops[sqp->state] & QIB_FLUSH_SEND)) + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) goto clr_busy; /* We are in the error state, flush the work request. */ send_status = IB_WC_WR_FLUSH_ERR; @@ -407,9 +416,9 @@ again: } spin_unlock_irqrestore(&sqp->s_lock, flags); - if (!qp || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) || + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; /* * For RC, the requester would timeout and retry so * shortcut the timeouts and just signal too many retries. @@ -458,7 +467,7 @@ again: goto inv_err; if (wqe->length == 0) break; - if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, wqe->rdma_wr.remote_addr, wqe->rdma_wr.rkey, IB_ACCESS_REMOTE_WRITE))) @@ -471,7 +480,7 @@ again: case IB_WR_RDMA_READ: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto inv_err; - if (unlikely(!qib_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, wqe->rdma_wr.remote_addr, wqe->rdma_wr.rkey, IB_ACCESS_REMOTE_READ))) @@ -489,7 +498,7 @@ again: case IB_WR_ATOMIC_FETCH_AND_ADD: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto inv_err; - if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), wqe->atomic_wr.remote_addr, wqe->atomic_wr.rkey, IB_ACCESS_REMOTE_ATOMIC))) @@ -502,7 +511,7 @@ again: (u64) atomic64_add_return(sdata, maddr) - sdata : (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, sdata, wqe->atomic_wr.swap); - qib_put_mr(qp->r_sge.sge.mr); + rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; goto send_comp; @@ -526,11 +535,11 @@ again: sge->sge_length -= len; if (sge->sge_length == 0) { if (!release) - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--sqp->s_sge.num_sge) *sge = *sqp->s_sge.sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -543,9 +552,9 @@ again: sqp->s_len -= len; } if (release) - qib_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); - if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) goto send_comp; if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) @@ -561,12 +570,12 @@ again: wc.sl = qp->remote_ah_attr.sl; wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: spin_lock_irqsave(&sqp->s_lock, flags); - ibp->n_loop_pkts++; + ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; qib_send_complete(sqp, wqe, send_status); @@ -576,7 +585,7 @@ rnr_nak: /* Handle RNR NAK */ if (qp->ibqp.qp_type == IB_QPT_UC) goto send_comp; - ibp->n_rnr_naks++; + ibp->rvp.n_rnr_naks++; /* * Note: we don't need the s_lock held since the BUSY flag * makes this single threaded. @@ -588,9 +597,9 @@ rnr_nak: if (sqp->s_rnr_retry_cnt < 7) sqp->s_rnr_retry--; spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) goto clr_busy; - sqp->s_flags |= QIB_S_WAIT_RNR; + sqp->s_flags |= RVT_S_WAIT_RNR; sqp->s_timer.function = qib_rc_rnr_retry; sqp->s_timer.expires = jiffies + usecs_to_jiffies(ib_qib_rnr_table[qp->r_min_rnr_timer]); @@ -618,9 +627,9 @@ serr: spin_lock_irqsave(&sqp->s_lock, flags); qib_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = qib_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - sqp->s_flags &= ~QIB_S_BUSY; + sqp->s_flags &= ~RVT_S_BUSY; spin_unlock_irqrestore(&sqp->s_lock, flags); if (lastwqe) { struct ib_event ev; @@ -633,12 +642,11 @@ serr: goto done; } clr_busy: - sqp->s_flags &= ~QIB_S_BUSY; + sqp->s_flags &= ~RVT_S_BUSY; unlock: spin_unlock_irqrestore(&sqp->s_lock, flags); done: - if (qp && atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); + rcu_read_unlock(); } /** @@ -663,7 +671,7 @@ u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, hdr->next_hdr = IB_GRH_NEXT_HDR; hdr->hop_limit = grh->hop_limit; /* The SGID is 32-bit aligned. */ - hdr->sgid.global.subnet_prefix = ibp->gid_prefix; + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; hdr->sgid.global.interface_id = grh->sgid_index ? ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid; hdr->dgid = grh->dgid; @@ -672,9 +680,10 @@ u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } -void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, +void qib_make_ruc_header(struct rvt_qp *qp, struct qib_other_headers *ohdr, u32 bth0, u32 bth2) { + struct qib_qp_priv *priv = qp->priv; struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); u16 lrh0; u32 nwords; @@ -685,17 +694,18 @@ void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, nwords = (qp->s_cur_size + extra_bytes) >> 2; lrh0 = QIB_LRH_BTH; if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + qp->s_hdrwords += qib_make_grh(ibp, &priv->s_hdr->u.l.grh, &qp->remote_ah_attr.grh, qp->s_hdrwords, nwords); lrh0 = QIB_LRH_GRH; } lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 | qp->remote_ah_attr.sl << 4; - qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - qp->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + priv->s_hdr->lrh[0] = cpu_to_be16(lrh0); + priv->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + priv->s_hdr->lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + priv->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | qp->remote_ah_attr.src_path_bits); bth0 |= qib_get_pkey(ibp, qp->s_pkey_index); bth0 |= extra_bytes << 20; @@ -707,20 +717,29 @@ void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, this_cpu_inc(ibp->pmastats->n_unicast_xmit); } +void _qib_do_send(struct work_struct *work) +{ + struct qib_qp_priv *priv = container_of(work, struct qib_qp_priv, + s_work); + struct rvt_qp *qp = priv->owner; + + qib_do_send(qp); +} + /** * qib_do_send - perform a send on a QP - * @work: contains a pointer to the QP + * @qp: pointer to the QP * * Process entries in the send work queue until credit or queue is * exhausted. Only allow one CPU to send a packet per QP (tasklet). * Otherwise, two threads could send packets out of order. */ -void qib_do_send(struct work_struct *work) +void qib_do_send(struct rvt_qp *qp) { - struct qib_qp *qp = container_of(work, struct qib_qp, s_work); + struct qib_qp_priv *priv = qp->priv; struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); - int (*make_req)(struct qib_qp *qp); + int (*make_req)(struct rvt_qp *qp); unsigned long flags; if ((qp->ibqp.qp_type == IB_QPT_RC || @@ -745,50 +764,59 @@ void qib_do_send(struct work_struct *work) return; } - qp->s_flags |= QIB_S_BUSY; - - spin_unlock_irqrestore(&qp->s_lock, flags); + qp->s_flags |= RVT_S_BUSY; do { /* Check for a constructed packet to be sent. */ if (qp->s_hdrwords != 0) { + spin_unlock_irqrestore(&qp->s_lock, flags); /* * If the packet cannot be sent now, return and * the send tasklet will be woken up later. */ - if (qib_verbs_send(qp, qp->s_hdr, qp->s_hdrwords, + if (qib_verbs_send(qp, priv->s_hdr, qp->s_hdrwords, qp->s_cur_sge, qp->s_cur_size)) - break; + return; /* Record that s_hdr is empty. */ qp->s_hdrwords = 0; + spin_lock_irqsave(&qp->s_lock, flags); } } while (make_req(qp)); + + spin_unlock_irqrestore(&qp->s_lock, flags); } /* * This should be called with s_lock held. */ -void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, +void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) { u32 old_last, last; unsigned i; - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + /* See post_send() */ + barrier(); for (i = 0; i < wqe->wr.num_sge; i++) { - struct qib_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); } if (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&to_iah(wqe->ud_wr.ah)->refcount); + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); /* See ch. 11.2.4.1 and 10.7.3.1 */ - if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED) || status != IB_WC_SUCCESS) { struct ib_wc wc; @@ -800,15 +828,10 @@ void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, wc.qp = &qp->ibqp; if (status == IB_WC_SUCCESS) wc.byte_len = wqe->length; - qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, status != IB_WC_SUCCESS); } - last = qp->s_last; - old_last = last; - if (++last >= qp->s_size) - last = 0; - qp->s_last = last; if (qp->s_acked == old_last) qp->s_acked = last; if (qp->s_cur == old_last) diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index c6d6a54d2e19..891873b38a1e 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -513,7 +513,9 @@ int qib_sdma_running(struct qib_pportdata *ppd) static void complete_sdma_err_req(struct qib_pportdata *ppd, struct qib_verbs_txreq *tx) { - atomic_inc(&tx->qp->s_dma_busy); + struct qib_qp_priv *priv = tx->qp->priv; + + atomic_inc(&priv->s_dma_busy); /* no sdma descriptors, so no unmap_desc */ tx->txreq.start_idx = 0; tx->txreq.next_descq_idx = 0; @@ -531,18 +533,19 @@ static void complete_sdma_err_req(struct qib_pportdata *ppd, * 3) The SGE addresses are suitable for passing to dma_map_single(). */ int qib_sdma_verbs_send(struct qib_pportdata *ppd, - struct qib_sge_state *ss, u32 dwords, + struct rvt_sge_state *ss, u32 dwords, struct qib_verbs_txreq *tx) { unsigned long flags; - struct qib_sge *sge; - struct qib_qp *qp; + struct rvt_sge *sge; + struct rvt_qp *qp; int ret = 0; u16 tail; __le64 *descqp; u64 sdmadesc[2]; u32 dwoffset; dma_addr_t addr; + struct qib_qp_priv *priv; spin_lock_irqsave(&ppd->sdma_lock, flags); @@ -621,7 +624,7 @@ retry: if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -644,8 +647,8 @@ retry: descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD); if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ) descqp[0] |= cpu_to_le64(SDMA_DESC_INTR); - - atomic_inc(&tx->qp->s_dma_busy); + priv = tx->qp->priv; + atomic_inc(&priv->s_dma_busy); tx->txreq.next_descq_idx = tail; ppd->dd->f_sdma_update_tail(ppd, tail); ppd->sdma_descq_added += tx->txreq.sg_count; @@ -663,13 +666,14 @@ unmap: unmap_desc(ppd, tail); } qp = tx->qp; + priv = qp->priv; qib_put_txreq(tx); spin_lock(&qp->r_lock); spin_lock(&qp->s_lock); if (qp->ibqp.qp_type == IB_QPT_RC) { /* XXX what about error sending RDMA read responses? */ - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) - qib_error_qp(qp, IB_WC_GENERAL_ERR); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) + rvt_error_qp(qp, IB_WC_GENERAL_ERR); } else if (qp->s_wqe) qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock(&qp->s_lock); @@ -679,8 +683,9 @@ unmap: busy: qp = tx->qp; + priv = qp->priv; spin_lock(&qp->s_lock); - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { struct qib_ibdev *dev; /* @@ -690,19 +695,19 @@ busy: */ tx->ss = ss; tx->dwords = dwords; - qp->s_tx = tx; + priv->s_tx = tx; dev = &ppd->dd->verbs_dev; - spin_lock(&dev->pending_lock); - if (list_empty(&qp->iowait)) { + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { struct qib_ibport *ibp; ibp = &ppd->ibport_data; - ibp->n_dmawait++; - qp->s_flags |= QIB_S_WAIT_DMA_DESC; - list_add_tail(&qp->iowait, &dev->dmawait); + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + list_add_tail(&priv->iowait, &dev->dmawait); } - spin_unlock(&dev->pending_lock); - qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; spin_unlock(&qp->s_lock); ret = -EBUSY; } else { diff --git a/drivers/infiniband/hw/qib/qib_srq.c b/drivers/infiniband/hw/qib/qib_srq.c deleted file mode 100644 index d6235931a1ba..000000000000 --- a/drivers/infiniband/hw/qib/qib_srq.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#include "qib_verbs.h" - -/** - * qib_post_srq_receive - post a receive on a shared receive queue - * @ibsrq: the SRQ to post the receive on - * @wr: the list of work requests to post - * @bad_wr: A pointer to the first WR to cause a problem is put here - * - * This may be called from interrupt context. - */ -int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) -{ - struct qib_srq *srq = to_isrq(ibsrq); - struct qib_rwq *wq; - unsigned long flags; - int ret; - - for (; wr; wr = wr->next) { - struct qib_rwqe *wqe; - u32 next; - int i; - - if ((unsigned) wr->num_sge > srq->rq.max_sge) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; - next = wq->head + 1; - if (next >= srq->rq.size) - next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&srq->rq.lock, flags); - *bad_wr = wr; - ret = -ENOMEM; - goto bail; - } - - wqe = get_rwqe_ptr(&srq->rq, wq->head); - wqe->wr_id = wr->wr_id; - wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; - /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; - spin_unlock_irqrestore(&srq->rq.lock, flags); - } - ret = 0; - -bail: - return ret; -} - -/** - * qib_create_srq - create a shared receive queue - * @ibpd: the protection domain of the SRQ to create - * @srq_init_attr: the attributes of the SRQ - * @udata: data from libibverbs when creating a user SRQ - */ -struct ib_srq *qib_create_srq(struct ib_pd *ibpd, - struct ib_srq_init_attr *srq_init_attr, - struct ib_udata *udata) -{ - struct qib_ibdev *dev = to_idev(ibpd->device); - struct qib_srq *srq; - u32 sz; - struct ib_srq *ret; - - if (srq_init_attr->srq_type != IB_SRQT_BASIC) { - ret = ERR_PTR(-ENOSYS); - goto done; - } - - if (srq_init_attr->attr.max_sge == 0 || - srq_init_attr->attr.max_sge > ib_qib_max_srq_sges || - srq_init_attr->attr.max_wr == 0 || - srq_init_attr->attr.max_wr > ib_qib_max_srq_wrs) { - ret = ERR_PTR(-EINVAL); - goto done; - } - - srq = kmalloc(sizeof(*srq), GFP_KERNEL); - if (!srq) { - ret = ERR_PTR(-ENOMEM); - goto done; - } - - /* - * Need to use vmalloc() if we want to support large #s of entries. - */ - srq->rq.size = srq_init_attr->attr.max_wr + 1; - srq->rq.max_sge = srq_init_attr->attr.max_sge; - sz = sizeof(struct ib_sge) * srq->rq.max_sge + - sizeof(struct qib_rwqe); - srq->rq.wq = vmalloc_user(sizeof(struct qib_rwq) + srq->rq.size * sz); - if (!srq->rq.wq) { - ret = ERR_PTR(-ENOMEM); - goto bail_srq; - } - - /* - * Return the address of the RWQ as the offset to mmap. - * See qib_mmap() for details. - */ - if (udata && udata->outlen >= sizeof(__u64)) { - int err; - u32 s = sizeof(struct qib_rwq) + srq->rq.size * sz; - - srq->ip = - qib_create_mmap_info(dev, s, ibpd->uobject->context, - srq->rq.wq); - if (!srq->ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_wq; - } - - err = ib_copy_to_udata(udata, &srq->ip->offset, - sizeof(srq->ip->offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } else - srq->ip = NULL; - - /* - * ib_create_srq() will initialize srq->ibsrq. - */ - spin_lock_init(&srq->rq.lock); - srq->rq.wq->head = 0; - srq->rq.wq->tail = 0; - srq->limit = srq_init_attr->attr.srq_limit; - - spin_lock(&dev->n_srqs_lock); - if (dev->n_srqs_allocated == ib_qib_max_srqs) { - spin_unlock(&dev->n_srqs_lock); - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - dev->n_srqs_allocated++; - spin_unlock(&dev->n_srqs_lock); - - if (srq->ip) { - spin_lock_irq(&dev->pending_lock); - list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - - ret = &srq->ibsrq; - goto done; - -bail_ip: - kfree(srq->ip); -bail_wq: - vfree(srq->rq.wq); -bail_srq: - kfree(srq); -done: - return ret; -} - -/** - * qib_modify_srq - modify a shared receive queue - * @ibsrq: the SRQ to modify - * @attr: the new attributes of the SRQ - * @attr_mask: indicates which attributes to modify - * @udata: user data for libibverbs.so - */ -int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask, - struct ib_udata *udata) -{ - struct qib_srq *srq = to_isrq(ibsrq); - struct qib_rwq *wq; - int ret = 0; - - if (attr_mask & IB_SRQ_MAX_WR) { - struct qib_rwq *owq; - struct qib_rwqe *p; - u32 sz, size, n, head, tail; - - /* Check that the requested sizes are below the limits. */ - if ((attr->max_wr > ib_qib_max_srq_wrs) || - ((attr_mask & IB_SRQ_LIMIT) ? - attr->srq_limit : srq->limit) > attr->max_wr) { - ret = -EINVAL; - goto bail; - } - - sz = sizeof(struct qib_rwqe) + - srq->rq.max_sge * sizeof(struct ib_sge); - size = attr->max_wr + 1; - wq = vmalloc_user(sizeof(struct qib_rwq) + size * sz); - if (!wq) { - ret = -ENOMEM; - goto bail; - } - - /* Check that we can write the offset to mmap. */ - if (udata && udata->inlen >= sizeof(__u64)) { - __u64 offset_addr; - __u64 offset = 0; - - ret = ib_copy_from_udata(&offset_addr, udata, - sizeof(offset_addr)); - if (ret) - goto bail_free; - udata->outbuf = - (void __user *) (unsigned long) offset_addr; - ret = ib_copy_to_udata(udata, &offset, - sizeof(offset)); - if (ret) - goto bail_free; - } - - spin_lock_irq(&srq->rq.lock); - /* - * validate head and tail pointer values and compute - * the number of remaining WQEs. - */ - owq = srq->rq.wq; - head = owq->head; - tail = owq->tail; - if (head >= srq->rq.size || tail >= srq->rq.size) { - ret = -EINVAL; - goto bail_unlock; - } - n = head; - if (n < tail) - n += srq->rq.size - tail; - else - n -= tail; - if (size <= n) { - ret = -EINVAL; - goto bail_unlock; - } - n = 0; - p = wq->wq; - while (tail != head) { - struct qib_rwqe *wqe; - int i; - - wqe = get_rwqe_ptr(&srq->rq, tail); - p->wr_id = wqe->wr_id; - p->num_sge = wqe->num_sge; - for (i = 0; i < wqe->num_sge; i++) - p->sg_list[i] = wqe->sg_list[i]; - n++; - p = (struct qib_rwqe *)((char *) p + sz); - if (++tail >= srq->rq.size) - tail = 0; - } - srq->rq.wq = wq; - srq->rq.size = size; - wq->head = n; - wq->tail = 0; - if (attr_mask & IB_SRQ_LIMIT) - srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); - - vfree(owq); - - if (srq->ip) { - struct qib_mmap_info *ip = srq->ip; - struct qib_ibdev *dev = to_idev(srq->ibsrq.device); - u32 s = sizeof(struct qib_rwq) + size * sz; - - qib_update_mmap_info(dev, ip, s, wq); - - /* - * Return the offset to mmap. - * See qib_mmap() for details. - */ - if (udata && udata->inlen >= sizeof(__u64)) { - ret = ib_copy_to_udata(udata, &ip->offset, - sizeof(ip->offset)); - if (ret) - goto bail; - } - - /* - * Put user mapping info onto the pending list - * unless it already is on the list. - */ - spin_lock_irq(&dev->pending_lock); - if (list_empty(&ip->pending_mmaps)) - list_add(&ip->pending_mmaps, - &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - } else if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irq(&srq->rq.lock); - if (attr->srq_limit >= srq->rq.size) - ret = -EINVAL; - else - srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); - } - goto bail; - -bail_unlock: - spin_unlock_irq(&srq->rq.lock); -bail_free: - vfree(wq); -bail: - return ret; -} - -int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) -{ - struct qib_srq *srq = to_isrq(ibsrq); - - attr->max_wr = srq->rq.size - 1; - attr->max_sge = srq->rq.max_sge; - attr->srq_limit = srq->limit; - return 0; -} - -/** - * qib_destroy_srq - destroy a shared receive queue - * @ibsrq: the SRQ to destroy - */ -int qib_destroy_srq(struct ib_srq *ibsrq) -{ - struct qib_srq *srq = to_isrq(ibsrq); - struct qib_ibdev *dev = to_idev(ibsrq->device); - - spin_lock(&dev->n_srqs_lock); - dev->n_srqs_allocated--; - spin_unlock(&dev->n_srqs_lock); - if (srq->ip) - kref_put(&srq->ip->ref, qib_release_mmap_info); - else - vfree(srq->rq.wq); - kfree(srq); - - return 0; -} diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 81f56cdff2bc..fe4cf5e4acec 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -406,7 +406,13 @@ static struct kobj_type qib_sl2vl_ktype = { #define QIB_DIAGC_ATTR(N) \ static struct qib_diagc_attr qib_diagc_attr_##N = { \ .attr = { .name = __stringify(N), .mode = 0664 }, \ - .counter = offsetof(struct qib_ibport, n_##N) \ + .counter = offsetof(struct qib_ibport, rvp.n_##N) \ + } + +#define QIB_DIAGC_ATTR_PER_CPU(N) \ + static struct qib_diagc_attr qib_diagc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0664 }, \ + .counter = offsetof(struct qib_ibport, rvp.z_##N) \ } struct qib_diagc_attr { @@ -414,10 +420,11 @@ struct qib_diagc_attr { size_t counter; }; +QIB_DIAGC_ATTR_PER_CPU(rc_acks); +QIB_DIAGC_ATTR_PER_CPU(rc_qacks); +QIB_DIAGC_ATTR_PER_CPU(rc_delayed_comp); + QIB_DIAGC_ATTR(rc_resends); -QIB_DIAGC_ATTR(rc_acks); -QIB_DIAGC_ATTR(rc_qacks); -QIB_DIAGC_ATTR(rc_delayed_comp); QIB_DIAGC_ATTR(seq_naks); QIB_DIAGC_ATTR(rdma_seq); QIB_DIAGC_ATTR(rnr_naks); @@ -449,6 +456,35 @@ static struct attribute *diagc_default_attributes[] = { NULL }; +static u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +#define def_write_per_cpu(cntr) \ +static void write_per_cpu_##cntr(struct qib_pportdata *ppd, u32 data) \ +{ \ + struct qib_devdata *dd = ppd->dd; \ + struct qib_ibport *qibp = &ppd->ibport_data; \ + /* A write can only zero the counter */ \ + if (data == 0) \ + qibp->rvp.z_##cntr = get_all_cpu_total(qibp->rvp.cntr); \ + else \ + qib_dev_err(dd, "Per CPU cntrs can only be zeroed"); \ +} + +def_write_per_cpu(rc_acks) +def_write_per_cpu(rc_qacks) +def_write_per_cpu(rc_delayed_comp) + +#define READ_PER_CPU_CNTR(cntr) (get_all_cpu_total(qibp->rvp.cntr) - \ + qibp->rvp.z_##cntr) + static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -458,7 +494,16 @@ static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr, container_of(kobj, struct qib_pportdata, diagc_kobj); struct qib_ibport *qibp = &ppd->ibport_data; - return sprintf(buf, "%u\n", *(u32 *)((char *)qibp + dattr->counter)); + if (!strncmp(dattr->attr.name, "rc_acks", 7)) + return sprintf(buf, "%llu\n", READ_PER_CPU_CNTR(rc_acks)); + else if (!strncmp(dattr->attr.name, "rc_qacks", 8)) + return sprintf(buf, "%llu\n", READ_PER_CPU_CNTR(rc_qacks)); + else if (!strncmp(dattr->attr.name, "rc_delayed_comp", 15)) + return sprintf(buf, "%llu\n", + READ_PER_CPU_CNTR(rc_delayed_comp)); + else + return sprintf(buf, "%u\n", + *(u32 *)((char *)qibp + dattr->counter)); } static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr, @@ -475,7 +520,15 @@ static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr, ret = kstrtou32(buf, 0, &val); if (ret) return ret; - *(u32 *)((char *) qibp + dattr->counter) = val; + + if (!strncmp(dattr->attr.name, "rc_acks", 7)) + write_per_cpu_rc_acks(ppd, val); + else if (!strncmp(dattr->attr.name, "rc_qacks", 8)) + write_per_cpu_rc_qacks(ppd, val); + else if (!strncmp(dattr->attr.name, "rc_delayed_comp", 15)) + write_per_cpu_rc_delayed_comp(ppd, val); + else + *(u32 *)((char *)qibp + dattr->counter) = val; return size; } @@ -502,7 +555,7 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -511,7 +564,7 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -533,7 +586,7 @@ static ssize_t show_boardversion(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -545,7 +598,7 @@ static ssize_t show_localbus_info(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -557,7 +610,7 @@ static ssize_t show_nctxts(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of user ports (contexts) available. */ @@ -572,7 +625,7 @@ static ssize_t show_nfreectxts(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ @@ -583,7 +636,7 @@ static ssize_t show_serial(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); buf[sizeof(dd->serial)] = '\0'; @@ -597,7 +650,7 @@ static ssize_t store_chip_reset(struct device *device, size_t count) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -618,7 +671,7 @@ static ssize_t show_tempsense(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, ibdev.dev); + container_of(device, struct qib_ibdev, rdi.ibdev.dev); struct qib_devdata *dd = dd_from_dev(dev); int ret; int idx; @@ -778,7 +831,7 @@ bail: */ int qib_verbs_register_sysfs(struct qib_devdata *dd) { - struct ib_device *dev = &dd->verbs_dev.ibdev; + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; int i, ret; for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 06a564589c35..7bdbc79ceaa3 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -41,61 +41,62 @@ * qib_make_uc_req - construct a request packet (SEND, RDMA write) * @qp: a pointer to the QP * + * Assumes the s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_uc_req(struct qib_qp *qp) +int qib_make_uc_req(struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; - struct qib_swqe *wqe; - unsigned long flags; + struct rvt_swqe *wqe; u32 hwords; u32 bth0; u32 len; u32 pmtu = qp->pmtu; int ret = 0; - spin_lock_irqsave(&qp->s_lock, flags); - - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) { - if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_dma_busy)) { - qp->s_flags |= QIB_S_WAIT_DMA; + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } - ohdr = &qp->s_hdr->u.oth; + ohdr = &priv->s_hdr->u.oth; if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr->u.l.oth; + ohdr = &priv->s_hdr->u.l.oth; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; bth0 = 0; /* Get the next send request. */ - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); qp->s_wqe = NULL; switch (qp->s_state) { default: - if (!(ib_qib_state_ops[qp->state] & - QIB_PROCESS_NEXT_SEND_OK)) + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* Check if send work queue is empty. */ - if (qp->s_cur == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) goto bail; /* * Start a new request. */ - wqe->psn = qp->s_next_psn; - qp->s_psn = qp->s_next_psn; + qp->s_psn = wqe->psn; qp->s_sge.sge = wqe->sg_list[0]; qp->s_sge.sg_list = wqe->sg_list + 1; qp->s_sge.num_sge = wqe->wr.num_sge; @@ -214,15 +215,11 @@ int qib_make_uc_req(struct qib_qp *qp) qp->s_cur_sge = &qp->s_sge; qp->s_cur_size = len; qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), - qp->s_next_psn++ & QIB_PSN_MASK); + qp->s_psn++ & QIB_PSN_MASK); done: - ret = 1; - goto unlock; - + return 1; bail: - qp->s_flags &= ~QIB_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); + qp->s_flags &= ~RVT_S_BUSY; return ret; } @@ -240,7 +237,7 @@ unlock: * Called at interrupt level. */ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp) + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) { struct qib_other_headers *ohdr; u32 opcode; @@ -278,10 +275,10 @@ void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, inv: if (qp->r_state == OP(SEND_FIRST) || qp->r_state == OP(SEND_MIDDLE)) { - set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); qp->r_sge.num_sge = 0; } else - qib_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); qp->r_state = OP(SEND_LAST); switch (opcode) { case OP(SEND_FIRST): @@ -328,8 +325,8 @@ inv: goto inv; } - if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) { - qp->r_flags |= QIB_R_COMM_EST; + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) { + qp->r_flags |= RVT_R_COMM_EST; if (qp->ibqp.event_handler) { struct ib_event ev; @@ -346,7 +343,7 @@ inv: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): send_first: - if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) qp->r_sge = qp->s_rdma_read_sge; else { ret = qib_get_rwqe(qp, 0); @@ -400,7 +397,7 @@ send_last: goto rewind; wc.opcode = IB_WC_RECV; qib_copy_sge(&qp->r_sge, data, tlen, 0); - qib_put_ss(&qp->s_rdma_read_sge); + rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -414,7 +411,7 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED)) != 0); break; @@ -438,7 +435,7 @@ rdma_first: int ok; /* Check rkey */ - ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) goto drop; @@ -483,8 +480,8 @@ rdma_last_imm: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags)) - qib_put_ss(&qp->s_rdma_read_sge); + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) + rvt_put_ss(&qp->s_rdma_read_sge); else { ret = qib_get_rwqe(qp, 1); if (ret < 0) @@ -495,7 +492,7 @@ rdma_last_imm: wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; qib_copy_sge(&qp->r_sge, data, tlen, 1); - qib_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); goto last_imm; case OP(RDMA_WRITE_LAST): @@ -511,7 +508,7 @@ rdma_last: if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; qib_copy_sge(&qp->r_sge, data, tlen, 1); - qib_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); break; default: @@ -523,10 +520,10 @@ rdma_last: return; rewind: - set_bit(QIB_R_REWIND_SGE, &qp->r_aflags); + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); qp->r_sge.num_sge = 0; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; return; op_err: diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 59193f67ea78..d9502137de62 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -32,6 +32,7 @@ */ #include <rdma/ib_smi.h> +#include <rdma/ib_verbs.h> #include "qib.h" #include "qib_mad.h" @@ -46,22 +47,26 @@ * Note that the receive interrupt handler may be calling qib_ud_rcv() * while this is being called. */ -static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) +static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) { struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct qib_pportdata *ppd; - struct qib_qp *qp; + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_qp *qp; struct ib_ah_attr *ah_attr; unsigned long flags; - struct qib_sge_state ssge; - struct qib_sge *sge; + struct rvt_sge_state ssge; + struct rvt_sge *sge; struct ib_wc wc; u32 length; enum ib_qp_type sqptype, dqptype; - qp = qib_lookup_qpn(ibp, swqe->ud_wr.remote_qpn); + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn); if (!qp) { - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; + rcu_read_unlock(); return; } @@ -71,12 +76,12 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) IB_QPT_UD : qp->ibqp.qp_type; if (dqptype != sqptype || - !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { - ibp->n_pkt_drops++; + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; goto drop; } - ah_attr = &to_iah(swqe->ud_wr.ah)->attr; + ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -140,8 +145,8 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) /* * Get the next work request entry to find where to put the data. */ - if (qp->r_flags & QIB_R_REUSE_SGE) - qp->r_flags &= ~QIB_R_REUSE_SGE; + if (qp->r_flags & RVT_R_REUSE_SGE) + qp->r_flags &= ~RVT_R_REUSE_SGE; else { int ret; @@ -152,14 +157,14 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) } if (!ret) { if (qp->ibqp.qp_num == 0) - ibp->n_vl15_dropped++; + ibp->rvp.n_vl15_dropped++; goto bail_unlock; } } /* Silently drop packets which are too big. */ if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_flags |= QIB_R_REUSE_SGE; - ibp->n_pkt_drops++; + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; goto bail_unlock; } @@ -189,7 +194,7 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) if (--ssge.num_sge) *sge = *ssge.sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -201,8 +206,8 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) } length -= len; } - qib_put_ss(&qp->r_sge); - if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) goto bail_unlock; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -216,30 +221,31 @@ static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe) wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); - ibp->n_loop_pkts++; + ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); drop: - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); + rcu_read_unlock(); } /** * qib_make_ud_req - construct a UD request packet * @qp: the QP * + * Assumes the s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_ud_req(struct qib_qp *qp) +int qib_make_ud_req(struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; struct ib_ah_attr *ah_attr; struct qib_pportdata *ppd; struct qib_ibport *ibp; - struct qib_swqe *wqe; - unsigned long flags; + struct rvt_swqe *wqe; u32 nwords; u32 extra_bytes; u32 bth0; @@ -248,28 +254,29 @@ int qib_make_ud_req(struct qib_qp *qp) int ret = 0; int next_cur; - spin_lock_irqsave(&qp->s_lock, flags); - - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) { - if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_dma_busy)) { - qp->s_flags |= QIB_S_WAIT_DMA; + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } - if (qp->s_cur == qp->s_head) + /* see post_one_send() */ + smp_read_barrier_depends(); + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) goto bail; - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); next_cur = qp->s_cur + 1; if (next_cur >= qp->s_size) next_cur = 0; @@ -277,9 +284,9 @@ int qib_make_ud_req(struct qib_qp *qp) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &to_iah(wqe->ud_wr.ah)->attr; - if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) { - if (ah_attr->dlid != QIB_PERMISSIVE_LID) + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + if (ah_attr->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + if (ah_attr->dlid != be16_to_cpu(IB_LID_PERMISSIVE)) this_cpu_inc(ibp->pmastats->n_multicast_xmit); else this_cpu_inc(ibp->pmastats->n_unicast_xmit); @@ -287,6 +294,7 @@ int qib_make_ud_req(struct qib_qp *qp) this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { + unsigned long flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -294,11 +302,12 @@ int qib_make_ud_req(struct qib_qp *qp) * XXX Instead of waiting, we could queue a * zero length descriptor so we get a callback. */ - if (atomic_read(&qp->s_dma_busy)) { - qp->s_flags |= QIB_S_WAIT_DMA; + if (atomic_read(&priv->s_dma_busy)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } qp->s_cur = next_cur; + local_irq_save(flags); spin_unlock_irqrestore(&qp->s_lock, flags); qib_ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, flags); @@ -324,11 +333,11 @@ int qib_make_ud_req(struct qib_qp *qp) if (ah_attr->ah_flags & IB_AH_GRH) { /* Header size in 32-bit words. */ - qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh, + qp->s_hdrwords += qib_make_grh(ibp, &priv->s_hdr->u.l.grh, &ah_attr->grh, qp->s_hdrwords, nwords); lrh0 = QIB_LRH_GRH; - ohdr = &qp->s_hdr->u.l.oth; + ohdr = &priv->s_hdr->u.l.oth; /* * Don't worry about sending to locally attached multicast * QPs. It is unspecified by the spec. what happens. @@ -336,7 +345,7 @@ int qib_make_ud_req(struct qib_qp *qp) } else { /* Header size in 32-bit words. */ lrh0 = QIB_LRH_BTH; - ohdr = &qp->s_hdr->u.oth; + ohdr = &priv->s_hdr->u.oth; } if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_hdrwords++; @@ -349,15 +358,16 @@ int qib_make_ud_req(struct qib_qp *qp) lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ else lrh0 |= ibp->sl_to_vl[ah_attr->sl] << 12; - qp->s_hdr->lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ - qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + priv->s_hdr->lrh[0] = cpu_to_be16(lrh0); + priv->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + priv->s_hdr->lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); lid = ppd->lid; if (lid) { lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); - qp->s_hdr->lrh[3] = cpu_to_be16(lid); + priv->s_hdr->lrh[3] = cpu_to_be16(lid); } else - qp->s_hdr->lrh[3] = IB_LID_PERMISSIVE; + priv->s_hdr->lrh[3] = IB_LID_PERMISSIVE; if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; bth0 |= extra_bytes << 20; @@ -368,11 +378,11 @@ int qib_make_ud_req(struct qib_qp *qp) /* * Use the multicast QP if the destination LID is a multicast LID. */ - ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE && - ah_attr->dlid != QIB_PERMISSIVE_LID ? + ohdr->bth[1] = ah_attr->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) && + ah_attr->dlid != be16_to_cpu(IB_LID_PERMISSIVE) ? cpu_to_be32(QIB_MULTICAST_QPN) : cpu_to_be32(wqe->ud_wr.remote_qpn); - ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK); + ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). @@ -382,13 +392,9 @@ int qib_make_ud_req(struct qib_qp *qp) ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: - ret = 1; - goto unlock; - + return 1; bail: - qp->s_flags &= ~QIB_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); + qp->s_flags &= ~RVT_S_BUSY; return ret; } @@ -426,7 +432,7 @@ static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey) * Called at interrupt level. */ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp) + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) { struct qib_other_headers *ohdr; int opcode; @@ -446,7 +452,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ } qkey = be32_to_cpu(ohdr->u.ud.deth[0]); - src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK; + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; /* * Get the number of bytes the message was padded by @@ -531,8 +537,8 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, /* * Get the next work request entry to find where to put the data. */ - if (qp->r_flags & QIB_R_REUSE_SGE) - qp->r_flags &= ~QIB_R_REUSE_SGE; + if (qp->r_flags & RVT_R_REUSE_SGE) + qp->r_flags &= ~RVT_R_REUSE_SGE; else { int ret; @@ -543,13 +549,13 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, } if (!ret) { if (qp->ibqp.qp_num == 0) - ibp->n_vl15_dropped++; + ibp->rvp.n_vl15_dropped++; return; } } /* Silently drop packets which are too big. */ if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_flags |= QIB_R_REUSE_SGE; + qp->r_flags |= RVT_R_REUSE_SGE; goto drop; } if (has_grh) { @@ -559,8 +565,8 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, } else qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); - qib_put_ss(&qp->r_sge); - if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -576,15 +582,15 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= QIB_MULTICAST_LID_BASE ? 0 : + wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED)) != 0); return; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index baf1e42b6896..cbf6200e6afc 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -41,6 +41,7 @@ #include <linux/mm.h> #include <linux/random.h> #include <linux/vmalloc.h> +#include <rdma/rdma_vt.h> #include "qib.h" #include "qib_common.h" @@ -49,8 +50,8 @@ static unsigned int ib_qib_qp_table_size = 256; module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO); MODULE_PARM_DESC(qp_table_size, "QP table size"); -unsigned int ib_qib_lkey_table_size = 16; -module_param_named(lkey_table_size, ib_qib_lkey_table_size, uint, +static unsigned int qib_lkey_table_size = 16; +module_param_named(lkey_table_size, qib_lkey_table_size, uint, S_IRUGO); MODULE_PARM_DESC(lkey_table_size, "LKEY table size in bits (2^n, 1 <= n <= 23)"); @@ -113,36 +114,6 @@ module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(disable_sma, "Disable the SMA"); /* - * Note that it is OK to post send work requests in the SQE and ERR - * states; qib_do_send() will process them and generate error - * completions as per IB 1.2 C10-96. - */ -const int ib_qib_state_ops[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = 0, - [IB_QPS_INIT] = QIB_POST_RECV_OK, - [IB_QPS_RTR] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK, - [IB_QPS_RTS] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | - QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK | - QIB_PROCESS_NEXT_SEND_OK, - [IB_QPS_SQD] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | - QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK, - [IB_QPS_SQE] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK | - QIB_POST_SEND_OK | QIB_FLUSH_SEND, - [IB_QPS_ERR] = QIB_POST_RECV_OK | QIB_FLUSH_RECV | - QIB_POST_SEND_OK | QIB_FLUSH_SEND, -}; - -struct qib_ucontext { - struct ib_ucontext ibucontext; -}; - -static inline struct qib_ucontext *to_iucontext(struct ib_ucontext - *ibucontext) -{ - return container_of(ibucontext, struct qib_ucontext, ibucontext); -} - -/* * Translate ib_wr_opcode into ib_wc_opcode. */ const enum ib_wc_opcode ib_qib_wc_opcode[] = { @@ -166,9 +137,9 @@ __be64 ib_qib_sys_image_guid; * @data: the data to copy * @length: the length of the data */ -void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) +void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) { - struct qib_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; while (length) { u32 len = sge->length; @@ -184,11 +155,11 @@ void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -208,9 +179,9 @@ void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release) * @ss: the SGE state * @length: the number of bytes to skip */ -void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) +void qib_skip_sge(struct rvt_sge_state *ss, u32 length, int release) { - struct qib_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; while (length) { u32 len = sge->length; @@ -225,11 +196,11 @@ void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - qib_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -248,10 +219,10 @@ void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release) * Don't modify the qib_sge_state to get the count. * Return zero if any of the segments is not aligned. */ -static u32 qib_count_sge(struct qib_sge_state *ss, u32 length) +static u32 qib_count_sge(struct rvt_sge_state *ss, u32 length) { - struct qib_sge *sg_list = ss->sg_list; - struct qib_sge sge = ss->sge; + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; u8 num_sge = ss->num_sge; u32 ndesc = 1; /* count the header */ @@ -276,7 +247,7 @@ static u32 qib_count_sge(struct qib_sge_state *ss, u32 length) if (--num_sge) sge = *sg_list++; } else if (sge.length == 0 && sge.mr->lkey) { - if (++sge.n >= QIB_SEGSZ) { + if (++sge.n >= RVT_SEGSZ) { if (++sge.m >= sge.mr->mapsz) break; sge.n = 0; @@ -294,9 +265,9 @@ static u32 qib_count_sge(struct qib_sge_state *ss, u32 length) /* * Copy from the SGEs to the data buffer. */ -static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length) +static void qib_copy_from_sge(void *data, struct rvt_sge_state *ss, u32 length) { - struct qib_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; while (length) { u32 len = sge->length; @@ -314,7 +285,7 @@ static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length) if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -330,242 +301,6 @@ static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length) } /** - * qib_post_one_send - post one RC, UC, or UD send work request - * @qp: the QP to post on - * @wr: the work request to send - */ -static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, - int *scheduled) -{ - struct qib_swqe *wqe; - u32 next; - int i; - int j; - int acc; - int ret; - unsigned long flags; - struct qib_lkey_table *rkt; - struct qib_pd *pd; - int avoid_schedule = 0; - - spin_lock_irqsave(&qp->s_lock, flags); - - /* Check that state is OK to post send. */ - if (unlikely(!(ib_qib_state_ops[qp->state] & QIB_POST_SEND_OK))) - goto bail_inval; - - /* IB spec says that num_sge == 0 is OK. */ - if (wr->num_sge > qp->s_max_sge) - goto bail_inval; - - /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. - */ - if (wr->opcode == IB_WR_REG_MR) { - if (qib_reg_mr(qp, reg_wr(wr))) - goto bail_inval; - } else if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) - goto bail_inval; - } else if (qp->ibqp.qp_type != IB_QPT_RC) { - /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ - if (wr->opcode != IB_WR_SEND && - wr->opcode != IB_WR_SEND_WITH_IMM) - goto bail_inval; - /* Check UD destination address PD */ - if (qp->ibqp.pd != ud_wr(wr)->ah->pd) - goto bail_inval; - } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) - goto bail_inval; - else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && - (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) - goto bail_inval; - else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) - goto bail_inval; - - next = qp->s_head + 1; - if (next >= qp->s_size) - next = 0; - if (next == qp->s_last) { - ret = -ENOMEM; - goto bail; - } - - rkt = &to_idev(qp->ibqp.device)->lk_table; - pd = to_ipd(qp->ibqp.pd); - wqe = get_swqe_ptr(qp, qp->s_head); - - if (qp->ibqp.qp_type != IB_QPT_UC && - qp->ibqp.qp_type != IB_QPT_RC) - memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr)); - else if (wr->opcode == IB_WR_REG_MR) - memcpy(&wqe->reg_wr, reg_wr(wr), - sizeof(wqe->reg_wr)); - else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || - wr->opcode == IB_WR_RDMA_WRITE || - wr->opcode == IB_WR_RDMA_READ) - memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr)); - else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr)); - else - memcpy(&wqe->wr, wr, sizeof(wqe->wr)); - - wqe->length = 0; - j = 0; - if (wr->num_sge) { - acc = wr->opcode >= IB_WR_RDMA_READ ? - IB_ACCESS_LOCAL_WRITE : 0; - for (i = 0; i < wr->num_sge; i++) { - u32 length = wr->sg_list[i].length; - int ok; - - if (length == 0) - continue; - ok = qib_lkey_ok(rkt, pd, &wqe->sg_list[j], - &wr->sg_list[i], acc); - if (!ok) - goto bail_inval_free; - wqe->length += length; - j++; - } - wqe->wr.num_sge = j; - } - if (qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_RC) { - if (wqe->length > 0x80000000U) - goto bail_inval_free; - if (wqe->length <= qp->pmtu) - avoid_schedule = 1; - } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport + - qp->port_num - 1)->ibmtu) { - goto bail_inval_free; - } else { - atomic_inc(&to_iah(ud_wr(wr)->ah)->refcount); - avoid_schedule = 1; - } - wqe->ssn = qp->s_ssn++; - qp->s_head = next; - - ret = 0; - goto bail; - -bail_inval_free: - while (j) { - struct qib_sge *sge = &wqe->sg_list[--j]; - - qib_put_mr(sge->mr); - } -bail_inval: - ret = -EINVAL; -bail: - if (!ret && !wr->next && !avoid_schedule && - !qib_sdma_empty( - dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) { - qib_schedule_send(qp); - *scheduled = 1; - } - spin_unlock_irqrestore(&qp->s_lock, flags); - return ret; -} - -/** - * qib_post_send - post a send on a QP - * @ibqp: the QP to post the send on - * @wr: the list of work requests to post - * @bad_wr: the first bad WR is put here - * - * This may be called from interrupt context. - */ -static int qib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) -{ - struct qib_qp *qp = to_iqp(ibqp); - int err = 0; - int scheduled = 0; - - for (; wr; wr = wr->next) { - err = qib_post_one_send(qp, wr, &scheduled); - if (err) { - *bad_wr = wr; - goto bail; - } - } - - /* Try to do the send work in the caller's context. */ - if (!scheduled) - qib_do_send(&qp->s_work); - -bail: - return err; -} - -/** - * qib_post_receive - post a receive on a QP - * @ibqp: the QP to post the receive on - * @wr: the WR to post - * @bad_wr: the first bad WR is put here - * - * This may be called from interrupt context. - */ -static int qib_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) -{ - struct qib_qp *qp = to_iqp(ibqp); - struct qib_rwq *wq = qp->r_rq.wq; - unsigned long flags; - int ret; - - /* Check that state is OK to post receive. */ - if (!(ib_qib_state_ops[qp->state] & QIB_POST_RECV_OK) || !wq) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - for (; wr; wr = wr->next) { - struct qib_rwqe *wqe; - u32 next; - int i; - - if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&qp->r_rq.lock, flags); - next = wq->head + 1; - if (next >= qp->r_rq.size) - next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - *bad_wr = wr; - ret = -ENOMEM; - goto bail; - } - - wqe = get_rwqe_ptr(&qp->r_rq, wq->head); - wqe->wr_id = wr->wr_id; - wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; - /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - } - ret = 0; - -bail: - return ret; -} - -/** * qib_qp_rcv - processing an incoming packet on a QP * @rcd: the context pointer * @hdr: the packet header @@ -579,15 +314,15 @@ bail: * Called at interrupt level. */ static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp) + int has_grh, void *data, u32 tlen, struct rvt_qp *qp) { struct qib_ibport *ibp = &rcd->ppd->ibport_data; spin_lock(&qp->r_lock); /* Check for valid receive state. */ - if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) { - ibp->n_pkt_drops++; + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; goto unlock; } @@ -632,8 +367,10 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) struct qib_pportdata *ppd = rcd->ppd; struct qib_ibport *ibp = &ppd->ibport_data; struct qib_ib_header *hdr = rhdr; + struct qib_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; struct qib_other_headers *ohdr; - struct qib_qp *qp; + struct rvt_qp *qp; u32 qp_num; int lnh; u8 opcode; @@ -645,7 +382,7 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) /* Check for a valid destination LID (see ch. 7.11.1). */ lid = be16_to_cpu(hdr->lrh[1]); - if (lid < QIB_MULTICAST_LID_BASE) { + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { lid &= ~((1 << ppd->lmc) - 1); if (unlikely(lid != ppd->lid)) goto drop; @@ -674,50 +411,40 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen) #endif /* Get the destination QP number. */ - qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK; + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; if (qp_num == QIB_MULTICAST_QPN) { - struct qib_mcast *mcast; - struct qib_mcast_qp *p; + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; if (lnh != QIB_LRH_GRH) goto drop; - mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid); + mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid); if (mcast == NULL) goto drop; this_cpu_inc(ibp->pmastats->n_multicast_rcv); list_for_each_entry_rcu(p, &mcast->qp_list, list) qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp); /* - * Notify qib_multicast_detach() if it is waiting for us + * Notify rvt_multicast_detach() if it is waiting for us * to finish. */ if (atomic_dec_return(&mcast->refcount) <= 1) wake_up(&mcast->wait); } else { - if (rcd->lookaside_qp) { - if (rcd->lookaside_qpn != qp_num) { - if (atomic_dec_and_test( - &rcd->lookaside_qp->refcount)) - wake_up( - &rcd->lookaside_qp->wait); - rcd->lookaside_qp = NULL; - } + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; } - if (!rcd->lookaside_qp) { - qp = qib_lookup_qpn(ibp, qp_num); - if (!qp) - goto drop; - rcd->lookaside_qp = qp; - rcd->lookaside_qpn = qp_num; - } else - qp = rcd->lookaside_qp; this_cpu_inc(ibp->pmastats->n_unicast_rcv); qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp); + rcu_read_unlock(); } return; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; } /* @@ -728,23 +455,25 @@ static void mem_timer(unsigned long data) { struct qib_ibdev *dev = (struct qib_ibdev *) data; struct list_head *list = &dev->memwait; - struct qib_qp *qp = NULL; + struct rvt_qp *qp = NULL; + struct qib_qp_priv *priv = NULL; unsigned long flags; - spin_lock_irqsave(&dev->pending_lock, flags); + spin_lock_irqsave(&dev->rdi.pending_lock, flags); if (!list_empty(list)) { - qp = list_entry(list->next, struct qib_qp, iowait); - list_del_init(&qp->iowait); + priv = list_entry(list->next, struct qib_qp_priv, iowait); + qp = priv->owner; + list_del_init(&priv->iowait); atomic_inc(&qp->refcount); if (!list_empty(list)) mod_timer(&dev->mem_timer, jiffies + 1); } - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); if (qp) { spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_flags & QIB_S_WAIT_KMEM) { - qp->s_flags &= ~QIB_S_WAIT_KMEM; + if (qp->s_flags & RVT_S_WAIT_KMEM) { + qp->s_flags &= ~RVT_S_WAIT_KMEM; qib_schedule_send(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -753,9 +482,9 @@ static void mem_timer(unsigned long data) } } -static void update_sge(struct qib_sge_state *ss, u32 length) +static void update_sge(struct rvt_sge_state *ss, u32 length) { - struct qib_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; sge->vaddr += length; sge->length -= length; @@ -764,7 +493,7 @@ static void update_sge(struct qib_sge_state *ss, u32 length) if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= QIB_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) return; sge->n = 0; @@ -810,7 +539,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) } #endif -static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, +static void copy_io(u32 __iomem *piobuf, struct rvt_sge_state *ss, u32 length, unsigned flush_wc) { u32 extra = 0; @@ -947,30 +676,31 @@ static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss, } static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev, - struct qib_qp *qp) + struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_verbs_txreq *tx; unsigned long flags; spin_lock_irqsave(&qp->s_lock, flags); - spin_lock(&dev->pending_lock); + spin_lock(&dev->rdi.pending_lock); if (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; list_del(l); - spin_unlock(&dev->pending_lock); + spin_unlock(&dev->rdi.pending_lock); spin_unlock_irqrestore(&qp->s_lock, flags); tx = list_entry(l, struct qib_verbs_txreq, txreq.list); } else { - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK && - list_empty(&qp->iowait)) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK && + list_empty(&priv->iowait)) { dev->n_txwait++; - qp->s_flags |= QIB_S_WAIT_TX; - list_add_tail(&qp->iowait, &dev->txwait); + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->iowait, &dev->txwait); } - qp->s_flags &= ~QIB_S_BUSY; - spin_unlock(&dev->pending_lock); + qp->s_flags &= ~RVT_S_BUSY; + spin_unlock(&dev->rdi.pending_lock); spin_unlock_irqrestore(&qp->s_lock, flags); tx = ERR_PTR(-EBUSY); } @@ -978,22 +708,22 @@ static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev, } static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev, - struct qib_qp *qp) + struct rvt_qp *qp) { struct qib_verbs_txreq *tx; unsigned long flags; - spin_lock_irqsave(&dev->pending_lock, flags); + spin_lock_irqsave(&dev->rdi.pending_lock, flags); /* assume the list non empty */ if (likely(!list_empty(&dev->txreq_free))) { struct list_head *l = dev->txreq_free.next; list_del(l); - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); tx = list_entry(l, struct qib_verbs_txreq, txreq.list); } else { /* call slow path to get the extra lock */ - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); tx = __get_txreq(dev, qp); } return tx; @@ -1002,16 +732,15 @@ static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev, void qib_put_txreq(struct qib_verbs_txreq *tx) { struct qib_ibdev *dev; - struct qib_qp *qp; + struct rvt_qp *qp; + struct qib_qp_priv *priv; unsigned long flags; qp = tx->qp; dev = to_idev(qp->ibqp.device); - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); if (tx->mr) { - qib_put_mr(tx->mr); + rvt_put_mr(tx->mr); tx->mr = NULL; } if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) { @@ -1022,21 +751,23 @@ void qib_put_txreq(struct qib_verbs_txreq *tx) kfree(tx->align_buf); } - spin_lock_irqsave(&dev->pending_lock, flags); + spin_lock_irqsave(&dev->rdi.pending_lock, flags); /* Put struct back on free list */ list_add(&tx->txreq.list, &dev->txreq_free); if (!list_empty(&dev->txwait)) { /* Wake up first QP wanting a free struct */ - qp = list_entry(dev->txwait.next, struct qib_qp, iowait); - list_del_init(&qp->iowait); + priv = list_entry(dev->txwait.next, struct qib_qp_priv, + iowait); + qp = priv->owner; + list_del_init(&priv->iowait); atomic_inc(&qp->refcount); - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_flags & QIB_S_WAIT_TX) { - qp->s_flags &= ~QIB_S_WAIT_TX; + if (qp->s_flags & RVT_S_WAIT_TX) { + qp->s_flags &= ~RVT_S_WAIT_TX; qib_schedule_send(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1044,7 +775,7 @@ void qib_put_txreq(struct qib_verbs_txreq *tx) if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } else - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); } /* @@ -1055,36 +786,39 @@ void qib_put_txreq(struct qib_verbs_txreq *tx) */ void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail) { - struct qib_qp *qp, *nqp; - struct qib_qp *qps[20]; + struct rvt_qp *qp, *nqp; + struct qib_qp_priv *qpp, *nqpp; + struct rvt_qp *qps[20]; struct qib_ibdev *dev; unsigned i, n; n = 0; dev = &ppd->dd->verbs_dev; - spin_lock(&dev->pending_lock); + spin_lock(&dev->rdi.pending_lock); /* Search wait list for first QP wanting DMA descriptors. */ - list_for_each_entry_safe(qp, nqp, &dev->dmawait, iowait) { + list_for_each_entry_safe(qpp, nqpp, &dev->dmawait, iowait) { + qp = qpp->owner; + nqp = nqpp->owner; if (qp->port_num != ppd->port) continue; if (n == ARRAY_SIZE(qps)) break; - if (qp->s_tx->txreq.sg_count > avail) + if (qpp->s_tx->txreq.sg_count > avail) break; - avail -= qp->s_tx->txreq.sg_count; - list_del_init(&qp->iowait); + avail -= qpp->s_tx->txreq.sg_count; + list_del_init(&qpp->iowait); atomic_inc(&qp->refcount); qps[n++] = qp; } - spin_unlock(&dev->pending_lock); + spin_unlock(&dev->rdi.pending_lock); for (i = 0; i < n; i++) { qp = qps[i]; spin_lock(&qp->s_lock); - if (qp->s_flags & QIB_S_WAIT_DMA_DESC) { - qp->s_flags &= ~QIB_S_WAIT_DMA_DESC; + if (qp->s_flags & RVT_S_WAIT_DMA_DESC) { + qp->s_flags &= ~RVT_S_WAIT_DMA_DESC; qib_schedule_send(qp); } spin_unlock(&qp->s_lock); @@ -1100,7 +834,8 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) { struct qib_verbs_txreq *tx = container_of(cookie, struct qib_verbs_txreq, txreq); - struct qib_qp *qp = tx->qp; + struct rvt_qp *qp = tx->qp; + struct qib_qp_priv *priv = qp->priv; spin_lock(&qp->s_lock); if (tx->wqe) @@ -1117,11 +852,11 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) } qib_rc_send_complete(qp, hdr); } - if (atomic_dec_and_test(&qp->s_dma_busy)) { + if (atomic_dec_and_test(&priv->s_dma_busy)) { if (qp->state == IB_QPS_RESET) - wake_up(&qp->wait_dma); - else if (qp->s_flags & QIB_S_WAIT_DMA) { - qp->s_flags &= ~QIB_S_WAIT_DMA; + wake_up(&priv->wait_dma); + else if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; qib_schedule_send(qp); } } @@ -1130,22 +865,23 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) qib_put_txreq(tx); } -static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp) +static int wait_kmem(struct qib_ibdev *dev, struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { - spin_lock(&dev->pending_lock); - if (list_empty(&qp->iowait)) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= QIB_S_WAIT_KMEM; - list_add_tail(&qp->iowait, &dev->memwait); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->iowait, &dev->memwait); } - spin_unlock(&dev->pending_lock); - qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1153,10 +889,11 @@ static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp) return ret; } -static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, - u32 hdrwords, struct qib_sge_state *ss, u32 len, +static int qib_verbs_send_dma(struct rvt_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len, u32 plen, u32 dwords) { + struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); struct qib_devdata *dd = dd_from_dev(dev); struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); @@ -1167,9 +904,9 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, u32 ndesc; int ret; - tx = qp->s_tx; + tx = priv->s_tx; if (tx) { - qp->s_tx = NULL; + priv->s_tx = NULL; /* resend previously constructed packet */ ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx); goto bail; @@ -1182,7 +919,6 @@ static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr, control = dd->f_setpbc_control(ppd, plen, qp->s_srate, be16_to_cpu(hdr->lrh[0]) >> 12); tx->qp = qp; - atomic_inc(&qp->refcount); tx->wqe = qp->s_wqe; tx->mr = qp->s_rdma_mr; if (qp->s_rdma_mr) @@ -1245,7 +981,7 @@ err_tx: qib_put_txreq(tx); ret = wait_kmem(dev, qp); unaligned: - ibp->n_unaligned++; + ibp->rvp.n_unaligned++; bail: return ret; bail_tx: @@ -1257,8 +993,9 @@ bail_tx: * If we are now in the error state, return zero to flush the * send work request. */ -static int no_bufs_available(struct qib_qp *qp) +static int no_bufs_available(struct rvt_qp *qp) { + struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); struct qib_devdata *dd; unsigned long flags; @@ -1271,25 +1008,25 @@ static int no_bufs_available(struct qib_qp *qp) * enabling the PIO avail interrupt. */ spin_lock_irqsave(&qp->s_lock, flags); - if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) { - spin_lock(&dev->pending_lock); - if (list_empty(&qp->iowait)) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + spin_lock(&dev->rdi.pending_lock); + if (list_empty(&priv->iowait)) { dev->n_piowait++; - qp->s_flags |= QIB_S_WAIT_PIO; - list_add_tail(&qp->iowait, &dev->piowait); + qp->s_flags |= RVT_S_WAIT_PIO; + list_add_tail(&priv->iowait, &dev->piowait); dd = dd_from_dev(dev); dd->f_wantpiobuf_intr(dd, 1); } - spin_unlock(&dev->pending_lock); - qp->s_flags &= ~QIB_S_BUSY; + spin_unlock(&dev->rdi.pending_lock); + qp->s_flags &= ~RVT_S_BUSY; ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } -static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr, - u32 hdrwords, struct qib_sge_state *ss, u32 len, +static int qib_verbs_send_pio(struct rvt_qp *qp, struct qib_ib_header *ibhdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len, u32 plen, u32 dwords) { struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); @@ -1370,7 +1107,7 @@ done: } qib_sendbuf_done(dd, pbufn); if (qp->s_rdma_mr) { - qib_put_mr(qp->s_rdma_mr); + rvt_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; } if (qp->s_wqe) { @@ -1394,10 +1131,10 @@ done: * @len: the length of the packet in bytes * * Return zero if packet is sent or queued OK. - * Return non-zero and clear qp->s_flags QIB_S_BUSY otherwise. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. */ -int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, - u32 hdrwords, struct qib_sge_state *ss, u32 len) +int qib_verbs_send(struct rvt_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len) { struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device); u32 plen; @@ -1529,10 +1266,11 @@ void qib_ib_piobufavail(struct qib_devdata *dd) { struct qib_ibdev *dev = &dd->verbs_dev; struct list_head *list; - struct qib_qp *qps[5]; - struct qib_qp *qp; + struct rvt_qp *qps[5]; + struct rvt_qp *qp; unsigned long flags; unsigned i, n; + struct qib_qp_priv *priv; list = &dev->piowait; n = 0; @@ -1543,25 +1281,26 @@ void qib_ib_piobufavail(struct qib_devdata *dd) * could end up with QPs on the wait list with the interrupt * disabled. */ - spin_lock_irqsave(&dev->pending_lock, flags); + spin_lock_irqsave(&dev->rdi.pending_lock, flags); while (!list_empty(list)) { if (n == ARRAY_SIZE(qps)) goto full; - qp = list_entry(list->next, struct qib_qp, iowait); - list_del_init(&qp->iowait); + priv = list_entry(list->next, struct qib_qp_priv, iowait); + qp = priv->owner; + list_del_init(&priv->iowait); atomic_inc(&qp->refcount); qps[n++] = qp; } dd->f_wantpiobuf_intr(dd, 0); full: - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock_irqrestore(&dev->rdi.pending_lock, flags); for (i = 0; i < n; i++) { qp = qps[i]; spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_flags & QIB_S_WAIT_PIO) { - qp->s_flags &= ~QIB_S_WAIT_PIO; + if (qp->s_flags & RVT_S_WAIT_PIO) { + qp->s_flags &= ~RVT_S_WAIT_PIO; qib_schedule_send(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1572,82 +1311,24 @@ full: } } -static int qib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, - struct ib_udata *uhw) -{ - struct qib_devdata *dd = dd_from_ibdev(ibdev); - struct qib_ibdev *dev = to_idev(ibdev); - - if (uhw->inlen || uhw->outlen) - return -EINVAL; - memset(props, 0, sizeof(*props)); - - props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | - IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | - IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; - props->page_size_cap = PAGE_SIZE; - props->vendor_id = - QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3; - props->vendor_part_id = dd->deviceid; - props->hw_ver = dd->minrev; - props->sys_image_guid = ib_qib_sys_image_guid; - props->max_mr_size = ~0ULL; - props->max_qp = ib_qib_max_qps; - props->max_qp_wr = ib_qib_max_qp_wrs; - props->max_sge = ib_qib_max_sges; - props->max_sge_rd = ib_qib_max_sges; - props->max_cq = ib_qib_max_cqs; - props->max_ah = ib_qib_max_ahs; - props->max_cqe = ib_qib_max_cqes; - props->max_mr = dev->lk_table.max; - props->max_fmr = dev->lk_table.max; - props->max_map_per_fmr = 32767; - props->max_pd = ib_qib_max_pds; - props->max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; - props->max_qp_init_rd_atom = 255; - /* props->max_res_rd_atom */ - props->max_srq = ib_qib_max_srqs; - props->max_srq_wr = ib_qib_max_srq_wrs; - props->max_srq_sge = ib_qib_max_srq_sges; - /* props->local_ca_ack_delay */ - props->atomic_cap = IB_ATOMIC_GLOB; - props->max_pkeys = qib_get_npkeys(dd); - props->max_mcast_grp = ib_qib_max_mcast_grps; - props->max_mcast_qp_attach = ib_qib_max_mcast_qp_attached; - props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * - props->max_mcast_grp; - - return 0; -} - -static int qib_query_port(struct ib_device *ibdev, u8 port, +static int qib_query_port(struct rvt_dev_info *rdi, u8 port_num, struct ib_port_attr *props) { - struct qib_devdata *dd = dd_from_ibdev(ibdev); - struct qib_ibport *ibp = to_iport(ibdev, port); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_pportdata *ppd = &dd->pport[port_num - 1]; enum ib_mtu mtu; u16 lid = ppd->lid; - memset(props, 0, sizeof(*props)); props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); props->lmc = ppd->lmc; - props->sm_lid = ibp->sm_lid; - props->sm_sl = ibp->sm_sl; props->state = dd->f_iblink_state(ppd->lastibcstat); props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat); - props->port_cap_flags = ibp->port_cap_flags; props->gid_tbl_len = QIB_GUIDS_PER_PORT; - props->max_msg_sz = 0x80000000; - props->pkey_tbl_len = qib_get_npkeys(dd); - props->bad_pkey_cntr = ibp->pkey_violations; - props->qkey_viol_cntr = ibp->qkey_violations; props->active_width = ppd->link_width_active; /* See rate_show() */ props->active_speed = ppd->link_speed_active; props->max_vl_num = qib_num_vls(ppd->vls_supported); - props->init_type_reply = 0; props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096; switch (ppd->ibmtu) { @@ -1670,7 +1351,6 @@ static int qib_query_port(struct ib_device *ibdev, u8 port, mtu = IB_MTU_2048; } props->active_mtu = mtu; - props->subnet_timeout = ibp->subnet_timeout; return 0; } @@ -1714,185 +1394,70 @@ bail: return ret; } -static int qib_modify_port(struct ib_device *ibdev, u8 port, - int port_modify_mask, struct ib_port_modify *props) +static int qib_shut_down_port(struct rvt_dev_info *rdi, u8 port_num) { - struct qib_ibport *ibp = to_iport(ibdev, port); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); - - ibp->port_cap_flags |= props->set_port_cap_mask; - ibp->port_cap_flags &= ~props->clr_port_cap_mask; - if (props->set_port_cap_mask || props->clr_port_cap_mask) - qib_cap_mask_chg(ibp); - if (port_modify_mask & IB_PORT_SHUTDOWN) - qib_set_linkstate(ppd, QIB_IB_LINKDOWN); - if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) - ibp->qkey_violations = 0; - return 0; -} - -static int qib_query_gid(struct ib_device *ibdev, u8 port, - int index, union ib_gid *gid) -{ - struct qib_devdata *dd = dd_from_ibdev(ibdev); - int ret = 0; - - if (!port || port > dd->num_pports) - ret = -EINVAL; - else { - struct qib_ibport *ibp = to_iport(ibdev, port); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); - - gid->global.subnet_prefix = ibp->gid_prefix; - if (index == 0) - gid->global.interface_id = ppd->guid; - else if (index < QIB_GUIDS_PER_PORT) - gid->global.interface_id = ibp->guids[index - 1]; - else - ret = -EINVAL; - } - - return ret; -} + struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); + struct qib_devdata *dd = dd_from_dev(ibdev); + struct qib_pportdata *ppd = &dd->pport[port_num - 1]; -static struct ib_pd *qib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) -{ - struct qib_ibdev *dev = to_idev(ibdev); - struct qib_pd *pd; - struct ib_pd *ret; + qib_set_linkstate(ppd, QIB_IB_LINKDOWN); - /* - * This is actually totally arbitrary. Some correctness tests - * assume there's a maximum number of PDs that can be allocated. - * We don't actually have this limit, but we fail the test if - * we allow allocations of more than we report for this value. - */ - - pd = kmalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - spin_lock(&dev->n_pds_lock); - if (dev->n_pds_allocated == ib_qib_max_pds) { - spin_unlock(&dev->n_pds_lock); - kfree(pd); - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - dev->n_pds_allocated++; - spin_unlock(&dev->n_pds_lock); - - /* ib_alloc_pd() will initialize pd->ibpd. */ - pd->user = udata != NULL; - - ret = &pd->ibpd; - -bail: - return ret; + return 0; } -static int qib_dealloc_pd(struct ib_pd *ibpd) +static int qib_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) { - struct qib_pd *pd = to_ipd(ibpd); - struct qib_ibdev *dev = to_idev(ibpd->device); - - spin_lock(&dev->n_pds_lock); - dev->n_pds_allocated--; - spin_unlock(&dev->n_pds_lock); + struct qib_ibport *ibp = container_of(rvp, struct qib_ibport, rvp); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); - kfree(pd); + if (guid_index == 0) + *guid = ppd->guid; + else if (guid_index < QIB_GUIDS_PER_PORT) + *guid = ibp->guids[guid_index - 1]; + else + return -EINVAL; return 0; } int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) { - /* A multicast address requires a GRH (see ch. 8.4.1). */ - if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE && - ah_attr->dlid != QIB_PERMISSIVE_LID && - !(ah_attr->ah_flags & IB_AH_GRH)) - goto bail; - if ((ah_attr->ah_flags & IB_AH_GRH) && - ah_attr->grh.sgid_index >= QIB_GUIDS_PER_PORT) - goto bail; - if (ah_attr->dlid == 0) - goto bail; - if (ah_attr->port_num < 1 || - ah_attr->port_num > ibdev->phys_port_cnt) - goto bail; - if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && - ib_rate_to_mult(ah_attr->static_rate) < 0) - goto bail; if (ah_attr->sl > 15) - goto bail; + return -EINVAL; + return 0; -bail: - return -EINVAL; } -/** - * qib_create_ah - create an address handle - * @pd: the protection domain - * @ah_attr: the attributes of the AH - * - * This may be called from interrupt context. - */ -static struct ib_ah *qib_create_ah(struct ib_pd *pd, - struct ib_ah_attr *ah_attr) +static void qib_notify_new_ah(struct ib_device *ibdev, + struct ib_ah_attr *ah_attr, + struct rvt_ah *ah) { - struct qib_ah *ah; - struct ib_ah *ret; - struct qib_ibdev *dev = to_idev(pd->device); - unsigned long flags; + struct qib_ibport *ibp; + struct qib_pportdata *ppd; - if (qib_check_ah(pd->device, ah_attr)) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - - ah = kmalloc(sizeof(*ah), GFP_ATOMIC); - if (!ah) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - spin_lock_irqsave(&dev->n_ahs_lock, flags); - if (dev->n_ahs_allocated == ib_qib_max_ahs) { - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - kfree(ah); - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - dev->n_ahs_allocated++; - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - - /* ib_create_ah() will initialize ah->ibah. */ - ah->attr = *ah_attr; - atomic_set(&ah->refcount, 0); - - ret = &ah->ibah; + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ -bail: - return ret; + ibp = to_iport(ibdev, ah_attr->port_num); + ppd = ppd_from_ibp(ibp); + ah->vl = ibp->sl_to_vl[ah->attr.sl]; + ah->log_pmtu = ilog2(ppd->ibmtu); } struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) { struct ib_ah_attr attr; struct ib_ah *ah = ERR_PTR(-EINVAL); - struct qib_qp *qp0; + struct rvt_qp *qp0; memset(&attr, 0, sizeof(attr)); attr.dlid = dlid; attr.port_num = ppd_from_ibp(ibp)->port; rcu_read_lock(); - qp0 = rcu_dereference(ibp->qp0); + qp0 = rcu_dereference(ibp->rvp.qp[0]); if (qp0) ah = ib_create_ah(qp0->ibqp.pd, &attr); rcu_read_unlock(); @@ -1900,51 +1465,6 @@ struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) } /** - * qib_destroy_ah - destroy an address handle - * @ibah: the AH to destroy - * - * This may be called from interrupt context. - */ -static int qib_destroy_ah(struct ib_ah *ibah) -{ - struct qib_ibdev *dev = to_idev(ibah->device); - struct qib_ah *ah = to_iah(ibah); - unsigned long flags; - - if (atomic_read(&ah->refcount) != 0) - return -EBUSY; - - spin_lock_irqsave(&dev->n_ahs_lock, flags); - dev->n_ahs_allocated--; - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - - kfree(ah); - - return 0; -} - -static int qib_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) -{ - struct qib_ah *ah = to_iah(ibah); - - if (qib_check_ah(ibah->device, ah_attr)) - return -EINVAL; - - ah->attr = *ah_attr; - - return 0; -} - -static int qib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) -{ - struct qib_ah *ah = to_iah(ibah); - - *ah_attr = ah->attr; - - return 0; -} - -/** * qib_get_npkeys - return the size of the PKEY table for context 0 * @dd: the qlogic_ib device */ @@ -1973,75 +1493,27 @@ unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index) return ret; } -static int qib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) -{ - struct qib_devdata *dd = dd_from_ibdev(ibdev); - int ret; - - if (index >= qib_get_npkeys(dd)) { - ret = -EINVAL; - goto bail; - } - - *pkey = qib_get_pkey(to_iport(ibdev, port), index); - ret = 0; - -bail: - return ret; -} - -/** - * qib_alloc_ucontext - allocate a ucontest - * @ibdev: the infiniband device - * @udata: not used by the QLogic_IB driver - */ - -static struct ib_ucontext *qib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) -{ - struct qib_ucontext *context; - struct ib_ucontext *ret; - - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - ret = &context->ibucontext; - -bail: - return ret; -} - -static int qib_dealloc_ucontext(struct ib_ucontext *context) -{ - kfree(to_iucontext(context)); - return 0; -} - static void init_ibport(struct qib_pportdata *ppd) { struct qib_verbs_counters cntrs; struct qib_ibport *ibp = &ppd->ibport_data; - spin_lock_init(&ibp->lock); + spin_lock_init(&ibp->rvp.lock); /* Set the prefix to the default value (see ch. 4.1.1) */ - ibp->gid_prefix = IB_DEFAULT_GID_PREFIX; - ibp->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); - ibp->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); + ibp->rvp.port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP | IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP | IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP | IB_PORT_OTHER_LOCAL_CHANGES_SUP; if (ppd->dd->flags & QIB_HAS_LINK_LATENCY) - ibp->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; - ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; - ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; - ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; - ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; - ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + ibp->rvp.port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; /* Snapshot current HW counters to "clear" them. */ qib_get_counters(ppd, &cntrs); @@ -2061,26 +1533,55 @@ static void init_ibport(struct qib_pportdata *ppd) ibp->z_excessive_buffer_overrun_errors = cntrs.excessive_buffer_overrun_errors; ibp->z_vl15_dropped = cntrs.vl15_dropped; - RCU_INIT_POINTER(ibp->qp0, NULL); - RCU_INIT_POINTER(ibp->qp1, NULL); + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } -static int qib_port_immutable(struct ib_device *ibdev, u8 port_num, - struct ib_port_immutable *immutable) +/** + * qib_fill_device_attr - Fill in rvt dev info device attributes. + * @dd: the device data structure + */ +static void qib_fill_device_attr(struct qib_devdata *dd) { - struct ib_port_attr attr; - int err; - - err = qib_query_port(ibdev, port_num, &attr); - if (err) - return err; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; - immutable->pkey_tbl_len = attr.pkey_tbl_len; - immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; - immutable->max_mad_size = IB_MGMT_MAD_SIZE; + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); - return 0; + rdi->dparms.props.max_pd = ib_qib_max_pds; + rdi->dparms.props.max_ah = ib_qib_max_ahs; + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = + QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3; + rdi->dparms.props.vendor_part_id = dd->deviceid; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_qib_sys_image_guid; + rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_qp = ib_qib_max_qps; + rdi->dparms.props.max_qp_wr = ib_qib_max_qp_wrs; + rdi->dparms.props.max_sge = ib_qib_max_sges; + rdi->dparms.props.max_sge_rd = ib_qib_max_sges; + rdi->dparms.props.max_cq = ib_qib_max_cqs; + rdi->dparms.props.max_cqe = ib_qib_max_cqes; + rdi->dparms.props.max_ah = ib_qib_max_ahs; + rdi->dparms.props.max_mr = rdi->lkey_table.max; + rdi->dparms.props.max_fmr = rdi->lkey_table.max; + rdi->dparms.props.max_map_per_fmr = 32767; + rdi->dparms.props.max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = ib_qib_max_srqs; + rdi->dparms.props.max_srq_wr = ib_qib_max_srq_wrs; + rdi->dparms.props.max_srq_sge = ib_qib_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = qib_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = ib_qib_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = ib_qib_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; } /** @@ -2091,68 +1592,20 @@ static int qib_port_immutable(struct ib_device *ibdev, u8 port_num, int qib_register_ib_device(struct qib_devdata *dd) { struct qib_ibdev *dev = &dd->verbs_dev; - struct ib_device *ibdev = &dev->ibdev; + struct ib_device *ibdev = &dev->rdi.ibdev; struct qib_pportdata *ppd = dd->pport; - unsigned i, lk_tab_size; + unsigned i, ctxt; int ret; - dev->qp_table_size = ib_qib_qp_table_size; get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd)); - dev->qp_table = kmalloc_array( - dev->qp_table_size, - sizeof(*dev->qp_table), - GFP_KERNEL); - if (!dev->qp_table) { - ret = -ENOMEM; - goto err_qpt; - } - for (i = 0; i < dev->qp_table_size; i++) - RCU_INIT_POINTER(dev->qp_table[i], NULL); - for (i = 0; i < dd->num_pports; i++) init_ibport(ppd + i); /* Only need to initialize non-zero fields. */ - spin_lock_init(&dev->qpt_lock); - spin_lock_init(&dev->n_pds_lock); - spin_lock_init(&dev->n_ahs_lock); - spin_lock_init(&dev->n_cqs_lock); - spin_lock_init(&dev->n_qps_lock); - spin_lock_init(&dev->n_srqs_lock); - spin_lock_init(&dev->n_mcast_grps_lock); - init_timer(&dev->mem_timer); - dev->mem_timer.function = mem_timer; - dev->mem_timer.data = (unsigned long) dev; - - qib_init_qpn_table(dd, &dev->qpn_table); + setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev); + + qpt_mask = dd->qpn_mask; - /* - * The top ib_qib_lkey_table_size bits are used to index the - * table. The lower 8 bits can be owned by the user (copied from - * the LKEY). The remaining bits act as a generation number or tag. - */ - spin_lock_init(&dev->lk_table.lock); - /* insure generation is at least 4 bits see keys.c */ - if (ib_qib_lkey_table_size > MAX_LKEY_TABLE_BITS) { - qib_dev_warn(dd, "lkey bits %u too large, reduced to %u\n", - ib_qib_lkey_table_size, MAX_LKEY_TABLE_BITS); - ib_qib_lkey_table_size = MAX_LKEY_TABLE_BITS; - } - dev->lk_table.max = 1 << ib_qib_lkey_table_size; - lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); - dev->lk_table.table = (struct qib_mregion __rcu **) - vmalloc(lk_tab_size); - if (dev->lk_table.table == NULL) { - ret = -ENOMEM; - goto err_lk; - } - RCU_INIT_POINTER(dev->dma_mr, NULL); - for (i = 0; i < dev->lk_table.max; i++) - RCU_INIT_POINTER(dev->lk_table.table[i], NULL); - INIT_LIST_HEAD(&dev->pending_mmaps); - spin_lock_init(&dev->pending_lock); - dev->mmap_offset = PAGE_SIZE; - spin_lock_init(&dev->mmap_offset_lock); INIT_LIST_HEAD(&dev->piowait); INIT_LIST_HEAD(&dev->dmawait); INIT_LIST_HEAD(&dev->txwait); @@ -2194,110 +1647,91 @@ int qib_register_ib_device(struct qib_devdata *dd) strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX); ibdev->owner = THIS_MODULE; ibdev->node_guid = ppd->guid; - ibdev->uverbs_abi_ver = QIB_UVERBS_ABI_VERSION; - ibdev->uverbs_cmd_mask = - (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | - (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | - (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_CREATE_AH) | - (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | - (1ull << IB_USER_VERBS_CMD_QUERY_AH) | - (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | - (1ull << IB_USER_VERBS_CMD_REG_MR) | - (1ull << IB_USER_VERBS_CMD_DEREG_MR) | - (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | - (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | - (1ull << IB_USER_VERBS_CMD_POLL_CQ) | - (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_CMD_QUERY_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | - (1ull << IB_USER_VERBS_CMD_POST_SEND) | - (1ull << IB_USER_VERBS_CMD_POST_RECV) | - (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | - (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | - (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | - (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); - ibdev->node_type = RDMA_NODE_IB_CA; ibdev->phys_port_cnt = dd->num_pports; - ibdev->num_comp_vectors = 1; ibdev->dma_device = &dd->pcidev->dev; - ibdev->query_device = qib_query_device; ibdev->modify_device = qib_modify_device; - ibdev->query_port = qib_query_port; - ibdev->modify_port = qib_modify_port; - ibdev->query_pkey = qib_query_pkey; - ibdev->query_gid = qib_query_gid; - ibdev->alloc_ucontext = qib_alloc_ucontext; - ibdev->dealloc_ucontext = qib_dealloc_ucontext; - ibdev->alloc_pd = qib_alloc_pd; - ibdev->dealloc_pd = qib_dealloc_pd; - ibdev->create_ah = qib_create_ah; - ibdev->destroy_ah = qib_destroy_ah; - ibdev->modify_ah = qib_modify_ah; - ibdev->query_ah = qib_query_ah; - ibdev->create_srq = qib_create_srq; - ibdev->modify_srq = qib_modify_srq; - ibdev->query_srq = qib_query_srq; - ibdev->destroy_srq = qib_destroy_srq; - ibdev->create_qp = qib_create_qp; - ibdev->modify_qp = qib_modify_qp; - ibdev->query_qp = qib_query_qp; - ibdev->destroy_qp = qib_destroy_qp; - ibdev->post_send = qib_post_send; - ibdev->post_recv = qib_post_receive; - ibdev->post_srq_recv = qib_post_srq_receive; - ibdev->create_cq = qib_create_cq; - ibdev->destroy_cq = qib_destroy_cq; - ibdev->resize_cq = qib_resize_cq; - ibdev->poll_cq = qib_poll_cq; - ibdev->req_notify_cq = qib_req_notify_cq; - ibdev->get_dma_mr = qib_get_dma_mr; - ibdev->reg_user_mr = qib_reg_user_mr; - ibdev->dereg_mr = qib_dereg_mr; - ibdev->alloc_mr = qib_alloc_mr; - ibdev->map_mr_sg = qib_map_mr_sg; - ibdev->alloc_fmr = qib_alloc_fmr; - ibdev->map_phys_fmr = qib_map_phys_fmr; - ibdev->unmap_fmr = qib_unmap_fmr; - ibdev->dealloc_fmr = qib_dealloc_fmr; - ibdev->attach_mcast = qib_multicast_attach; - ibdev->detach_mcast = qib_multicast_detach; ibdev->process_mad = qib_process_mad; - ibdev->mmap = qib_mmap; - ibdev->dma_ops = &qib_dma_mapping_ops; - ibdev->get_port_immutable = qib_port_immutable; snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), "Intel Infiniband HCA %s", init_utsname()->nodename); - ret = ib_register_device(ibdev, qib_create_port_files); - if (ret) - goto err_reg; + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; + dd->verbs_dev.rdi.driver_f.get_card_name = qib_get_card_name; + dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; + dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; + dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qib_qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = qib_free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = qib_notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = qib_do_send; + dd->verbs_dev.rdi.driver_f.schedule_send = qib_schedule_send; + dd->verbs_dev.rdi.driver_f.quiesce_qp = qib_quiesce_qp; + dd->verbs_dev.rdi.driver_f.stop_send_queue = qib_stop_send_queue; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = qib_flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.notify_error_qp = qib_notify_error_qp; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = qib_mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = qib_mtu_from_qp; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = qib_get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _qib_schedule_send; + dd->verbs_dev.rdi.driver_f.query_port_state = qib_query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = qib_shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = qib_cap_mask_chg; + dd->verbs_dev.rdi.driver_f.notify_create_mad_agent = + qib_notify_create_mad_agent; + dd->verbs_dev.rdi.driver_f.notify_free_mad_agent = + qib_notify_free_mad_agent; + + dd->verbs_dev.rdi.dparms.max_rdma_atomic = QIB_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.driver_f.get_guid_be = qib_get_guid_be; + dd->verbs_dev.rdi.dparms.lkey_table_size = qib_lkey_table_size; + dd->verbs_dev.rdi.dparms.qp_table_size = ib_qib_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 1; + dd->verbs_dev.rdi.dparms.qpn_res_start = QIB_KD_QP; + dd->verbs_dev.rdi.dparms.qpn_res_end = QIB_KD_QP; /* Reserve one QP */ + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = 1; + dd->verbs_dev.rdi.dparms.psn_mask = QIB_PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = QIB_PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = QIB_PSN_MASK; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = qib_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; + dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; + + snprintf(dd->verbs_dev.rdi.dparms.cq_name, + sizeof(dd->verbs_dev.rdi.dparms.cq_name), + "qib_cq%d", dd->unit); + + qib_fill_device_attr(dd); + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + ctxt = ppd->hw_pidx; + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + dd->rcd[ctxt]->pkeys); + } - ret = qib_create_agents(dev); + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) - goto err_agents; + goto err_tx; ret = qib_verbs_register_sysfs(dd); if (ret) goto err_class; - goto bail; + return ret; err_class: - qib_free_agents(dev); -err_agents: - ib_unregister_device(ibdev); -err_reg: + rvt_unregister_device(&dd->verbs_dev.rdi); err_tx: while (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; @@ -2313,27 +1747,17 @@ err_tx: sizeof(struct qib_pio_header), dev->pio_hdrs, dev->pio_hdrs_phys); err_hdrs: - vfree(dev->lk_table.table); -err_lk: - kfree(dev->qp_table); -err_qpt: qib_dev_err(dd, "cannot register verbs: %d!\n", -ret); -bail: return ret; } void qib_unregister_ib_device(struct qib_devdata *dd) { struct qib_ibdev *dev = &dd->verbs_dev; - struct ib_device *ibdev = &dev->ibdev; - u32 qps_inuse; - unsigned lk_tab_size; qib_verbs_unregister_sysfs(dd); - qib_free_agents(dev); - - ib_unregister_device(ibdev); + rvt_unregister_device(&dd->verbs_dev.rdi); if (!list_empty(&dev->piowait)) qib_dev_err(dd, "piowait list not empty!\n"); @@ -2343,16 +1767,8 @@ void qib_unregister_ib_device(struct qib_devdata *dd) qib_dev_err(dd, "txwait list not empty!\n"); if (!list_empty(&dev->memwait)) qib_dev_err(dd, "memwait list not empty!\n"); - if (dev->dma_mr) - qib_dev_err(dd, "DMA MR not NULL!\n"); - - qps_inuse = qib_free_all_qps(dd); - if (qps_inuse) - qib_dev_err(dd, "QP memory leak! %u still in use\n", - qps_inuse); del_timer_sync(&dev->mem_timer); - qib_free_qpn_table(&dev->qpn_table); while (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; struct qib_verbs_txreq *tx; @@ -2366,21 +1782,36 @@ void qib_unregister_ib_device(struct qib_devdata *dd) dd->pport->sdma_descq_cnt * sizeof(struct qib_pio_header), dev->pio_hdrs, dev->pio_hdrs_phys); - lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); - vfree(dev->lk_table.table); - kfree(dev->qp_table); } -/* - * This must be called with s_lock held. +/** + * _qib_schedule_send - schedule progress + * @qp - the qp + * + * This schedules progress w/o regard to the s_flags. + * + * It is only used in post send, which doesn't hold + * the s_lock. */ -void qib_schedule_send(struct qib_qp *qp) +void _qib_schedule_send(struct rvt_qp *qp) { - if (qib_send_ok(qp)) { - struct qib_ibport *ibp = - to_iport(qp->ibqp.device, qp->port_num); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct qib_pportdata *ppd = ppd_from_ibp(ibp); + struct qib_qp_priv *priv = qp->priv; - queue_work(ppd->qib_wq, &qp->s_work); - } + queue_work(ppd->qib_wq, &priv->s_work); +} + +/** + * qib_schedule_send - schedule progress + * @qp - the qp + * + * This schedules qp progress. The s_lock + * should be held. + */ +void qib_schedule_send(struct rvt_qp *qp) +{ + if (qib_send_ok(qp)) + _qib_schedule_send(qp); } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 6c5e77753d85..4b76a8d59337 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -45,6 +45,8 @@ #include <linux/completion.h> #include <rdma/ib_pack.h> #include <rdma/ib_user_verbs.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_cq.h> struct qib_ctxtdata; struct qib_pportdata; @@ -53,9 +55,7 @@ struct qib_verbs_txreq; #define QIB_MAX_RDMA_ATOMIC 16 #define QIB_GUIDS_PER_PORT 5 - -#define QPN_MAX (1 << 24) -#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +#define QIB_PSN_SHIFT 8 /* * Increment this value if any changes that break userspace ABI @@ -63,12 +63,6 @@ struct qib_verbs_txreq; */ #define QIB_UVERBS_ABI_VERSION 2 -/* - * Define an ib_cq_notify value that is not valid so we know when CQ - * notifications are armed. - */ -#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) - #define IB_SEQ_NAK (3 << 29) /* AETH NAK opcode values */ @@ -79,17 +73,6 @@ struct qib_verbs_txreq; #define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 #define IB_NAK_INVALID_RD_REQUEST 0x64 -/* Flags for checking QP state (see ib_qib_state_ops[]) */ -#define QIB_POST_SEND_OK 0x01 -#define QIB_POST_RECV_OK 0x02 -#define QIB_PROCESS_RECV_OK 0x04 -#define QIB_PROCESS_SEND_OK 0x08 -#define QIB_PROCESS_NEXT_SEND_OK 0x10 -#define QIB_FLUSH_SEND 0x20 -#define QIB_FLUSH_RECV 0x40 -#define QIB_PROCESS_OR_FLUSH_SEND \ - (QIB_PROCESS_SEND_OK | QIB_FLUSH_SEND) - /* IB Performance Manager status values */ #define IB_PMA_SAMPLE_STATUS_DONE 0x00 #define IB_PMA_SAMPLE_STATUS_STARTED 0x01 @@ -203,468 +186,21 @@ struct qib_pio_header { } __packed; /* - * There is one struct qib_mcast for each multicast GID. - * All attached QPs are then stored as a list of - * struct qib_mcast_qp. + * qib specific data structure that will be hidden from rvt after the queue pair + * is made common. */ -struct qib_mcast_qp { - struct list_head list; - struct qib_qp *qp; -}; - -struct qib_mcast { - struct rb_node rb_node; - union ib_gid mgid; - struct list_head qp_list; - wait_queue_head_t wait; - atomic_t refcount; - int n_attached; -}; - -/* Protection domain */ -struct qib_pd { - struct ib_pd ibpd; - int user; /* non-zero if created from user space */ -}; - -/* Address Handle */ -struct qib_ah { - struct ib_ah ibah; - struct ib_ah_attr attr; - atomic_t refcount; -}; - -/* - * This structure is used by qib_mmap() to validate an offset - * when an mmap() request is made. The vm_area_struct then uses - * this as its vm_private_data. - */ -struct qib_mmap_info { - struct list_head pending_mmaps; - struct ib_ucontext *context; - void *obj; - __u64 offset; - struct kref ref; - unsigned size; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and completion queue entries as a single memory allocation so - * it can be mmap'ed into user space. - */ -struct qib_cq_wc { - u32 head; /* index of next entry to fill */ - u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; -}; - -/* - * The completion queue structure. - */ -struct qib_cq { - struct ib_cq ibcq; - struct kthread_work comptask; - struct qib_devdata *dd; - spinlock_t lock; /* protect changes in this struct */ - u8 notify; - u8 triggered; - struct qib_cq_wc *queue; - struct qib_mmap_info *ip; -}; - -/* - * A segment is a linear region of low physical memory. - * XXX Maybe we should use phys addr here and kmap()/kunmap(). - * Used by the verbs layer. - */ -struct qib_seg { - void *vaddr; - size_t length; -}; - -/* The number of qib_segs that fit in a page. */ -#define QIB_SEGSZ (PAGE_SIZE / sizeof(struct qib_seg)) - -struct qib_segarray { - struct qib_seg segs[QIB_SEGSZ]; -}; - -struct qib_mregion { - struct ib_pd *pd; /* shares refcnt of ibmr.pd */ - u64 user_base; /* User's address for this region */ - u64 iova; /* IB start address of this region */ - size_t length; - u32 lkey; - u32 offset; /* offset (bytes) to start of region */ - int access_flags; - u32 max_segs; /* number of qib_segs in all the arrays */ - u32 mapsz; /* size of the map array */ - u8 page_shift; /* 0 - non unform/non powerof2 sizes */ - u8 lkey_published; /* in global table */ - struct completion comp; /* complete when refcount goes to zero */ - struct rcu_head list; - atomic_t refcount; - struct qib_segarray *map[0]; /* the segments */ -}; - -/* - * These keep track of the copy progress within a memory region. - * Used by the verbs layer. - */ -struct qib_sge { - struct qib_mregion *mr; - void *vaddr; /* kernel virtual address of segment */ - u32 sge_length; /* length of the SGE */ - u32 length; /* remaining length of the segment */ - u16 m; /* current index: mr->map[m] */ - u16 n; /* current index: mr->map[m]->segs[n] */ -}; - -/* Memory region */ -struct qib_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - u64 *pages; - u32 npages; - struct qib_mregion mr; /* must be last */ -}; - -/* - * Send work request queue entry. - * The size of the sg_list is determined when the QP is created and stored - * in qp->s_max_sge. - */ -struct qib_swqe { - union { - struct ib_send_wr wr; /* don't use wr.sg_list */ - struct ib_ud_wr ud_wr; - struct ib_reg_wr reg_wr; - struct ib_rdma_wr rdma_wr; - struct ib_atomic_wr atomic_wr; - }; - u32 psn; /* first packet sequence number */ - u32 lpsn; /* last packet sequence number */ - u32 ssn; /* send sequence number */ - u32 length; /* total length of data in sg_list */ - struct qib_sge sg_list[0]; -}; - -/* - * Receive work request queue entry. - * The size of the sg_list is determined when the QP (or SRQ) is created - * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). - */ -struct qib_rwqe { - u64 wr_id; - u8 num_sge; - struct ib_sge sg_list[0]; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and receive work queue entries as a single memory allocation so - * it can be mmap'ed into user space. - * Note that the wq array elements are variable size so you can't - * just index into the array to get the N'th element; - * use get_rwqe_ptr() instead. - */ -struct qib_rwq { - u32 head; /* new work requests posted to the head */ - u32 tail; /* receives pull requests from here. */ - struct qib_rwqe wq[0]; -}; - -struct qib_rq { - struct qib_rwq *wq; - u32 size; /* size of RWQE array */ - u8 max_sge; - spinlock_t lock /* protect changes in this struct */ - ____cacheline_aligned_in_smp; -}; - -struct qib_srq { - struct ib_srq ibsrq; - struct qib_rq rq; - struct qib_mmap_info *ip; - /* send signal when number of RWQEs < limit */ - u32 limit; -}; - -struct qib_sge_state { - struct qib_sge *sg_list; /* next SGE to be used if any */ - struct qib_sge sge; /* progress state for the current SGE */ - u32 total_len; - u8 num_sge; -}; - -/* - * This structure holds the information that the send tasklet needs - * to send a RDMA read response or atomic operation. - */ -struct qib_ack_entry { - u8 opcode; - u8 sent; - u32 psn; - u32 lpsn; - union { - struct qib_sge rdma_sge; - u64 atomic_data; - }; -}; - -/* - * Variables prefixed with s_ are for the requester (sender). - * Variables prefixed with r_ are for the responder (receiver). - * Variables prefixed with ack_ are for responder replies. - * - * Common variables are protected by both r_rq.lock and s_lock in that order - * which only happens in modify_qp() or changing the QP 'state'. - */ -struct qib_qp { - struct ib_qp ibqp; - /* read mostly fields above and below */ - struct ib_ah_attr remote_ah_attr; - struct ib_ah_attr alt_ah_attr; - struct qib_qp __rcu *next; /* link list for QPN hash table */ - struct qib_swqe *s_wq; /* send work queue */ - struct qib_mmap_info *ip; - struct qib_ib_header *s_hdr; /* next packet header to send */ - unsigned long timeout_jiffies; /* computed from timeout */ - - enum ib_mtu path_mtu; - u32 remote_qpn; - u32 pmtu; /* decoded from path_mtu */ - u32 qkey; /* QKEY for this QP (for UD or RD) */ - u32 s_size; /* send work queue size */ - u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ - - u8 state; /* QP state */ - u8 qp_access_flags; - u8 alt_timeout; /* Alternate path timeout for this QP */ - u8 timeout; /* Timeout for this QP */ - u8 s_srate; - u8 s_mig_state; - u8 port_num; - u8 s_pkey_index; /* PKEY index to use */ - u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ - u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ - u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ - u8 s_retry_cnt; /* number of times to retry */ - u8 s_rnr_retry_cnt; - u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ - u8 s_max_sge; /* size of s_wq->sg_list */ - u8 s_draining; - - /* start of read/write fields */ - - atomic_t refcount ____cacheline_aligned_in_smp; - wait_queue_head_t wait; - - - struct qib_ack_entry s_ack_queue[QIB_MAX_RDMA_ATOMIC + 1] - ____cacheline_aligned_in_smp; - struct qib_sge_state s_rdma_read_sge; - - spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ - unsigned long r_aflags; - u64 r_wr_id; /* ID for current receive WQE */ - u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ - u32 r_len; /* total length of r_sge */ - u32 r_rcv_len; /* receive data len processed */ - u32 r_psn; /* expected rcv packet sequence number */ - u32 r_msn; /* message sequence number */ - - u8 r_state; /* opcode of last packet received */ - u8 r_flags; - u8 r_head_ack_queue; /* index into s_ack_queue[] */ - - struct list_head rspwait; /* link for waititing to respond */ - - struct qib_sge_state r_sge; /* current receive data */ - struct qib_rq r_rq; /* receive work queue */ - - spinlock_t s_lock ____cacheline_aligned_in_smp; - struct qib_sge_state *s_cur_sge; - u32 s_flags; - struct qib_verbs_txreq *s_tx; - struct qib_swqe *s_wqe; - struct qib_sge_state s_sge; /* current send request data */ - struct qib_mregion *s_rdma_mr; - atomic_t s_dma_busy; - u32 s_cur_size; /* size of send packet in bytes */ - u32 s_len; /* total length of s_sge */ - u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ - u32 s_next_psn; /* PSN for next request */ - u32 s_last_psn; /* last response PSN processed */ - u32 s_sending_psn; /* lowest PSN that is being sent */ - u32 s_sending_hpsn; /* highest PSN that is being sent */ - u32 s_psn; /* current packet sequence number */ - u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ - u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ - u32 s_head; /* new entries added here */ - u32 s_tail; /* next entry to process */ - u32 s_cur; /* current work queue entry */ - u32 s_acked; /* last un-ACK'ed entry */ - u32 s_last; /* last completed entry */ - u32 s_ssn; /* SSN of tail entry */ - u32 s_lsn; /* limit sequence number (credit) */ - u16 s_hdrwords; /* size of s_hdr in 32 bit words */ - u16 s_rdma_ack_cnt; - u8 s_state; /* opcode of last packet sent */ - u8 s_ack_state; /* opcode of packet to ACK */ - u8 s_nak_state; /* non-zero if NAK is pending */ - u8 r_nak_state; /* non-zero if NAK is pending */ - u8 s_retry; /* requester retry counter */ - u8 s_rnr_retry; /* requester RNR retry counter */ - u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ - u8 s_tail_ack_queue; /* index into s_ack_queue[] */ - - struct qib_sge_state s_ack_rdma_sge; - struct timer_list s_timer; +struct qib_qp_priv { + struct qib_ib_header *s_hdr; /* next packet header to send */ struct list_head iowait; /* link for wait PIO buf */ - + atomic_t s_dma_busy; + struct qib_verbs_txreq *s_tx; struct work_struct s_work; - wait_queue_head_t wait_dma; - - struct qib_sge r_sg_list[0] /* verified SGEs */ - ____cacheline_aligned_in_smp; + struct rvt_qp *owner; }; -/* - * Atomic bit definitions for r_aflags. - */ -#define QIB_R_WRID_VALID 0 -#define QIB_R_REWIND_SGE 1 - -/* - * Bit definitions for r_flags. - */ -#define QIB_R_REUSE_SGE 0x01 -#define QIB_R_RDMAR_SEQ 0x02 -#define QIB_R_RSP_NAK 0x04 -#define QIB_R_RSP_SEND 0x08 -#define QIB_R_COMM_EST 0x10 - -/* - * Bit definitions for s_flags. - * - * QIB_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled - * QIB_S_BUSY - send tasklet is processing the QP - * QIB_S_TIMER - the RC retry timer is active - * QIB_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics - * QIB_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs - * before processing the next SWQE - * QIB_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete - * before processing the next SWQE - * QIB_S_WAIT_RNR - waiting for RNR timeout - * QIB_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE - * QIB_S_WAIT_DMA - waiting for send DMA queue to drain before generating - * next send completion entry not via send DMA - * QIB_S_WAIT_PIO - waiting for a send buffer to be available - * QIB_S_WAIT_TX - waiting for a struct qib_verbs_txreq to be available - * QIB_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available - * QIB_S_WAIT_KMEM - waiting for kernel memory to be available - * QIB_S_WAIT_PSN - waiting for a packet to exit the send DMA queue - * QIB_S_WAIT_ACK - waiting for an ACK packet before sending more requests - * QIB_S_SEND_ONE - send one packet, request ACK, then wait for ACK - */ -#define QIB_S_SIGNAL_REQ_WR 0x0001 -#define QIB_S_BUSY 0x0002 -#define QIB_S_TIMER 0x0004 -#define QIB_S_RESP_PENDING 0x0008 -#define QIB_S_ACK_PENDING 0x0010 -#define QIB_S_WAIT_FENCE 0x0020 -#define QIB_S_WAIT_RDMAR 0x0040 -#define QIB_S_WAIT_RNR 0x0080 -#define QIB_S_WAIT_SSN_CREDIT 0x0100 -#define QIB_S_WAIT_DMA 0x0200 -#define QIB_S_WAIT_PIO 0x0400 -#define QIB_S_WAIT_TX 0x0800 -#define QIB_S_WAIT_DMA_DESC 0x1000 -#define QIB_S_WAIT_KMEM 0x2000 -#define QIB_S_WAIT_PSN 0x4000 -#define QIB_S_WAIT_ACK 0x8000 -#define QIB_S_SEND_ONE 0x10000 -#define QIB_S_UNLIMITED_CREDIT 0x20000 - -/* - * Wait flags that would prevent any packet type from being sent. - */ -#define QIB_S_ANY_WAIT_IO (QIB_S_WAIT_PIO | QIB_S_WAIT_TX | \ - QIB_S_WAIT_DMA_DESC | QIB_S_WAIT_KMEM) - -/* - * Wait flags that would prevent send work requests from making progress. - */ -#define QIB_S_ANY_WAIT_SEND (QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | \ - QIB_S_WAIT_RNR | QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_DMA | \ - QIB_S_WAIT_PSN | QIB_S_WAIT_ACK) - -#define QIB_S_ANY_WAIT (QIB_S_ANY_WAIT_IO | QIB_S_ANY_WAIT_SEND) - #define QIB_PSN_CREDIT 16 -/* - * Since struct qib_swqe is not a fixed size, we can't simply index into - * struct qib_qp.s_wq. This function does the array index computation. - */ -static inline struct qib_swqe *get_swqe_ptr(struct qib_qp *qp, - unsigned n) -{ - return (struct qib_swqe *)((char *)qp->s_wq + - (sizeof(struct qib_swqe) + - qp->s_max_sge * - sizeof(struct qib_sge)) * n); -} - -/* - * Since struct qib_rwqe is not a fixed size, we can't simply index into - * struct qib_rwq.wq. This function does the array index computation. - */ -static inline struct qib_rwqe *get_rwqe_ptr(struct qib_rq *rq, unsigned n) -{ - return (struct qib_rwqe *) - ((char *) rq->wq->wq + - (sizeof(struct qib_rwqe) + - rq->max_sge * sizeof(struct ib_sge)) * n); -} - -/* - * QPN-map pages start out as NULL, they get allocated upon - * first use and are never deallocated. This way, - * large bitmaps are not allocated unless large numbers of QPs are used. - */ -struct qpn_map { - void *page; -}; - -struct qib_qpn_table { - spinlock_t lock; /* protect changes in this struct */ - unsigned flags; /* flags for QP0/1 allocated for each port */ - u32 last; /* last QP number allocated */ - u32 nmaps; /* size of the map table */ - u16 limit; - u16 mask; - /* bit map of free QP numbers other than 0/1 */ - struct qpn_map map[QPNMAP_ENTRIES]; -}; - -#define MAX_LKEY_TABLE_BITS 23 - -struct qib_lkey_table { - spinlock_t lock; /* protect changes in this struct */ - u32 next; /* next unused index (speeds search) */ - u32 gen; /* generation count */ - u32 max; /* size of the table */ - struct qib_mregion __rcu **table; -}; - struct qib_opcode_stats { u64 n_packets; /* number of packets */ u64 n_bytes; /* total number of bytes */ @@ -682,21 +218,9 @@ struct qib_pma_counters { }; struct qib_ibport { - struct qib_qp __rcu *qp0; - struct qib_qp __rcu *qp1; - struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ - struct qib_ah *sm_ah; - struct qib_ah *smi_ah; - struct rb_root mcast_tree; - spinlock_t lock; /* protect changes in this struct */ - - /* non-zero when timer is set */ - unsigned long mkey_lease_timeout; - unsigned long trap_timeout; - __be64 gid_prefix; /* in network order */ - __be64 mkey; + struct rvt_ibport rvp; + struct rvt_ah *smi_ah; __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */ - u64 tid; /* TID for traps */ struct qib_pma_counters __percpu *pmastats; u64 z_unicast_xmit; /* starting count for PMA */ u64 z_unicast_rcv; /* starting count for PMA */ @@ -715,82 +239,25 @@ struct qib_ibport { u32 z_local_link_integrity_errors; /* starting count for PMA */ u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ u32 z_vl15_dropped; /* starting count for PMA */ - u32 n_rc_resends; - u32 n_rc_acks; - u32 n_rc_qacks; - u32 n_rc_delayed_comp; - u32 n_seq_naks; - u32 n_rdma_seq; - u32 n_rnr_naks; - u32 n_other_naks; - u32 n_loop_pkts; - u32 n_pkt_drops; - u32 n_vl15_dropped; - u32 n_rc_timeouts; - u32 n_dmawait; - u32 n_unaligned; - u32 n_rc_dupreq; - u32 n_rc_seqnak; - u32 port_cap_flags; - u32 pma_sample_start; - u32 pma_sample_interval; - __be16 pma_counter_select[5]; - u16 pma_tag; - u16 pkey_violations; - u16 qkey_violations; - u16 mkey_violations; - u16 mkey_lease_period; - u16 sm_lid; - u16 repress_traps; - u8 sm_sl; - u8 mkeyprot; - u8 subnet_timeout; - u8 vl_high_limit; u8 sl_to_vl[16]; - }; - struct qib_ibdev { - struct ib_device ibdev; - struct list_head pending_mmaps; - spinlock_t mmap_offset_lock; /* protect mmap_offset */ - u32 mmap_offset; - struct qib_mregion __rcu *dma_mr; - - /* QP numbers are shared by all IB ports */ - struct qib_qpn_table qpn_table; - struct qib_lkey_table lk_table; + struct rvt_dev_info rdi; + struct list_head piowait; /* list for wait PIO buf */ struct list_head dmawait; /* list for wait DMA */ struct list_head txwait; /* list for wait qib_verbs_txreq */ struct list_head memwait; /* list for wait kernel memory */ struct list_head txreq_free; struct timer_list mem_timer; - struct qib_qp __rcu **qp_table; struct qib_pio_header *pio_hdrs; dma_addr_t pio_hdrs_phys; - /* list of QPs waiting for RNR timer */ - spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. */ - u32 qp_table_size; /* size of the hash table */ u32 qp_rnd; /* random bytes for hash */ - spinlock_t qpt_lock; u32 n_piowait; u32 n_txwait; - u32 n_pds_allocated; /* number of PDs allocated for device */ - spinlock_t n_pds_lock; - u32 n_ahs_allocated; /* number of AHs allocated for device */ - spinlock_t n_ahs_lock; - u32 n_cqs_allocated; /* number of CQs allocated for device */ - spinlock_t n_cqs_lock; - u32 n_qps_allocated; /* number of QPs allocated for device */ - spinlock_t n_qps_lock; - u32 n_srqs_allocated; /* number of SRQs allocated for device */ - spinlock_t n_srqs_lock; - u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ - spinlock_t n_mcast_grps_lock; #ifdef CONFIG_DEBUG_FS /* per HCA debugfs */ struct dentry *qib_ibdev_dbg; @@ -813,56 +280,27 @@ struct qib_verbs_counters { u32 vl15_dropped; }; -static inline struct qib_mr *to_imr(struct ib_mr *ibmr) -{ - return container_of(ibmr, struct qib_mr, ibmr); -} - -static inline struct qib_pd *to_ipd(struct ib_pd *ibpd) -{ - return container_of(ibpd, struct qib_pd, ibpd); -} - -static inline struct qib_ah *to_iah(struct ib_ah *ibah) -{ - return container_of(ibah, struct qib_ah, ibah); -} - -static inline struct qib_cq *to_icq(struct ib_cq *ibcq) -{ - return container_of(ibcq, struct qib_cq, ibcq); -} - -static inline struct qib_srq *to_isrq(struct ib_srq *ibsrq) -{ - return container_of(ibsrq, struct qib_srq, ibsrq); -} - -static inline struct qib_qp *to_iqp(struct ib_qp *ibqp) -{ - return container_of(ibqp, struct qib_qp, ibqp); -} - static inline struct qib_ibdev *to_idev(struct ib_device *ibdev) { - return container_of(ibdev, struct qib_ibdev, ibdev); + struct rvt_dev_info *rdi; + + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct qib_ibdev, rdi); } /* * Send if not busy or waiting for I/O and either * a RC response is pending or we can process send work requests. */ -static inline int qib_send_ok(struct qib_qp *qp) +static inline int qib_send_ok(struct rvt_qp *qp) { - return !(qp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT_IO)) && - (qp->s_hdrwords || (qp->s_flags & QIB_S_RESP_PENDING) || - !(qp->s_flags & QIB_S_ANY_WAIT_SEND)); + return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); } -/* - * This must be called with s_lock held. - */ -void qib_schedule_send(struct qib_qp *qp); +void _qib_schedule_send(struct rvt_qp *qp); +void qib_schedule_send(struct rvt_qp *qp); static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) { @@ -878,7 +316,7 @@ static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl, u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); -void qib_cap_mask_chg(struct qib_ibport *ibp); +void qib_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void qib_sys_guid_chg(struct qib_ibport *ibp); void qib_node_desc_chg(struct qib_ibport *ibp); int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -886,8 +324,8 @@ int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_mad_hdr *in, size_t in_mad_size, struct ib_mad_hdr *out, size_t *out_mad_size, u16 *out_mad_pkey_index); -int qib_create_agents(struct qib_ibdev *dev); -void qib_free_agents(struct qib_ibdev *dev); +void qib_notify_create_mad_agent(struct rvt_dev_info *rdi, int port_idx); +void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx); /* * Compare the lower 24 bits of the two values. @@ -898,8 +336,6 @@ static inline int qib_cmp24(u32 a, u32 b) return (((int) a) - ((int) b)) << 8; } -struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid); - int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, u64 *rwords, u64 *spkts, u64 *rpkts, u64 *xmit_wait); @@ -907,35 +343,17 @@ int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords, int qib_get_counters(struct qib_pportdata *ppd, struct qib_verbs_counters *cntrs); -int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); - -int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); - -int qib_mcast_tree_empty(struct qib_ibport *ibp); - -__be32 qib_compute_aeth(struct qib_qp *qp); - -struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn); - -struct ib_qp *qib_create_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata); - -int qib_destroy_qp(struct ib_qp *ibqp); - -int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err); - -int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata); - -int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_qp_init_attr *init_attr); - -unsigned qib_free_all_qps(struct qib_devdata *dd); +__be32 qib_compute_aeth(struct rvt_qp *qp); -void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt); - -void qib_free_qpn_table(struct qib_qpn_table *qpt); +/* + * Functions provided by qib driver for rdmavt to use + */ +unsigned qib_free_all_qps(struct rvt_dev_info *rdi); +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, gfp_t gfp); +void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +void qib_notify_qp_reset(struct rvt_qp *qp); +int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port, gfp_t gfp); #ifdef CONFIG_DEBUG_FS @@ -949,7 +367,7 @@ void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter); #endif -void qib_get_credit(struct qib_qp *qp, u32 aeth); +void qib_get_credit(struct rvt_qp *qp, u32 aeth); unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult); @@ -957,166 +375,66 @@ void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail); void qib_put_txreq(struct qib_verbs_txreq *tx); -int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr, - u32 hdrwords, struct qib_sge_state *ss, u32 len); +int qib_verbs_send(struct rvt_qp *qp, struct qib_ib_header *hdr, + u32 hdrwords, struct rvt_sge_state *ss, u32 len); -void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, +void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release); -void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release); +void qib_skip_sge(struct rvt_sge_state *ss, u32 length, int release); void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp); + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp); + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); +int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); void qib_rc_rnr_retry(unsigned long arg); -void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr); +void qib_rc_send_complete(struct rvt_qp *qp, struct qib_ib_header *hdr); -void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err); +void qib_rc_error(struct rvt_qp *qp, enum ib_wc_status err); -int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr); +int qib_post_ud_send(struct rvt_qp *qp, struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, void *data, u32 tlen, struct qib_qp *qp); - -int qib_alloc_lkey(struct qib_mregion *mr, int dma_region); - -void qib_free_lkey(struct qib_mregion *mr); - -int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd, - struct qib_sge *isge, struct ib_sge *sge, int acc); - -int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge, - u32 len, u64 vaddr, u32 rkey, int acc); - -int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); - -struct ib_srq *qib_create_srq(struct ib_pd *ibpd, - struct ib_srq_init_attr *srq_init_attr, - struct ib_udata *udata); - -int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask, - struct ib_udata *udata); - -int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); - -int qib_destroy_srq(struct ib_srq *ibsrq); - -int qib_cq_init(struct qib_devdata *dd); - -void qib_cq_exit(struct qib_devdata *dd); - -void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig); - -int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); - -struct ib_cq *qib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *context, - struct ib_udata *udata); - -int qib_destroy_cq(struct ib_cq *ibcq); - -int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); - -int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); - -struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc); - -struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt_addr, int mr_access_flags, - struct ib_udata *udata); - -int qib_dereg_mr(struct ib_mr *ibmr); - -struct ib_mr *qib_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, - u32 max_entries); - -int qib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); - -int qib_reg_mr(struct qib_qp *qp, struct ib_reg_wr *wr); - -struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); - -int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova); - -int qib_unmap_fmr(struct list_head *fmr_list); - -int qib_dealloc_fmr(struct ib_fmr *ibfmr); - -static inline void qib_get_mr(struct qib_mregion *mr) -{ - atomic_inc(&mr->refcount); -} + int has_grh, void *data, u32 tlen, struct rvt_qp *qp); void mr_rcu_callback(struct rcu_head *list); -static inline void qib_put_mr(struct qib_mregion *mr) -{ - if (unlikely(atomic_dec_and_test(&mr->refcount))) - call_rcu(&mr->list, mr_rcu_callback); -} - -static inline void qib_put_ss(struct qib_sge_state *ss) -{ - while (ss->num_sge) { - qib_put_mr(ss->sge.mr); - if (--ss->num_sge) - ss->sge = *ss->sg_list++; - } -} - - -void qib_release_mmap_info(struct kref *ref); +int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only); -struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 size, - struct ib_ucontext *context, - void *obj); - -void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip, - u32 size, void *obj); - -int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); - -int qib_get_rwqe(struct qib_qp *qp, int wr_id_only); - -void qib_migrate_qp(struct qib_qp *qp); +void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr, - int has_grh, struct qib_qp *qp, u32 bth0); + int has_grh, struct rvt_qp *qp, u32 bth0); u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr, struct ib_global_route *grh, u32 hwords, u32 nwords); -void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr, +void qib_make_ruc_header(struct rvt_qp *qp, struct qib_other_headers *ohdr, u32 bth0, u32 bth2); -void qib_do_send(struct work_struct *work); +void _qib_do_send(struct work_struct *work); + +void qib_do_send(struct rvt_qp *qp); -void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe, +void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status); -void qib_send_rc_ack(struct qib_qp *qp); +void qib_send_rc_ack(struct rvt_qp *qp); -int qib_make_rc_req(struct qib_qp *qp); +int qib_make_rc_req(struct rvt_qp *qp); -int qib_make_uc_req(struct qib_qp *qp); +int qib_make_uc_req(struct rvt_qp *qp); -int qib_make_ud_req(struct qib_qp *qp); +int qib_make_ud_req(struct rvt_qp *qp); int qib_register_ib_device(struct qib_devdata *); @@ -1150,11 +468,11 @@ extern const enum ib_wc_opcode ib_qib_wc_opcode[]; #define IB_PHYSPORTSTATE_CFG_ENH 0x10 #define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13 -extern const int ib_qib_state_ops[]; +extern const int ib_rvt_state_ops[]; extern __be64 ib_qib_sys_image_guid; /* in network order */ -extern unsigned int ib_qib_lkey_table_size; +extern unsigned int ib_rvt_lkey_table_size; extern unsigned int ib_qib_max_cqes; @@ -1178,6 +496,4 @@ extern unsigned int ib_qib_max_srq_wrs; extern const u32 ib_qib_rnr_table[]; -extern struct ib_dma_mapping_ops qib_dma_mapping_ops; - #endif /* QIB_VERBS_H */ diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c deleted file mode 100644 index b2fb5286dbd9..000000000000 --- a/drivers/infiniband/hw/qib/qib_verbs_mcast.c +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/rculist.h> - -#include "qib.h" - -/** - * qib_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct - * @qp: the QP to link - */ -static struct qib_mcast_qp *qib_mcast_qp_alloc(struct qib_qp *qp) -{ - struct qib_mcast_qp *mqp; - - mqp = kmalloc(sizeof(*mqp), GFP_KERNEL); - if (!mqp) - goto bail; - - mqp->qp = qp; - atomic_inc(&qp->refcount); - -bail: - return mqp; -} - -static void qib_mcast_qp_free(struct qib_mcast_qp *mqp) -{ - struct qib_qp *qp = mqp->qp; - - /* Notify qib_destroy_qp() if it is waiting. */ - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); - - kfree(mqp); -} - -/** - * qib_mcast_alloc - allocate the multicast GID structure - * @mgid: the multicast GID - * - * A list of QPs will be attached to this structure. - */ -static struct qib_mcast *qib_mcast_alloc(union ib_gid *mgid) -{ - struct qib_mcast *mcast; - - mcast = kmalloc(sizeof(*mcast), GFP_KERNEL); - if (!mcast) - goto bail; - - mcast->mgid = *mgid; - INIT_LIST_HEAD(&mcast->qp_list); - init_waitqueue_head(&mcast->wait); - atomic_set(&mcast->refcount, 0); - mcast->n_attached = 0; - -bail: - return mcast; -} - -static void qib_mcast_free(struct qib_mcast *mcast) -{ - struct qib_mcast_qp *p, *tmp; - - list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) - qib_mcast_qp_free(p); - - kfree(mcast); -} - -/** - * qib_mcast_find - search the global table for the given multicast GID - * @ibp: the IB port structure - * @mgid: the multicast GID to search for - * - * Returns NULL if not found. - * - * The caller is responsible for decrementing the reference count if found. - */ -struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid) -{ - struct rb_node *n; - unsigned long flags; - struct qib_mcast *mcast; - - spin_lock_irqsave(&ibp->lock, flags); - n = ibp->mcast_tree.rb_node; - while (n) { - int ret; - - mcast = rb_entry(n, struct qib_mcast, rb_node); - - ret = memcmp(mgid->raw, mcast->mgid.raw, - sizeof(union ib_gid)); - if (ret < 0) - n = n->rb_left; - else if (ret > 0) - n = n->rb_right; - else { - atomic_inc(&mcast->refcount); - spin_unlock_irqrestore(&ibp->lock, flags); - goto bail; - } - } - spin_unlock_irqrestore(&ibp->lock, flags); - - mcast = NULL; - -bail: - return mcast; -} - -/** - * qib_mcast_add - insert mcast GID into table and attach QP struct - * @mcast: the mcast GID table - * @mqp: the QP to attach - * - * Return zero if both were added. Return EEXIST if the GID was already in - * the table but the QP was added. Return ESRCH if the QP was already - * attached and neither structure was added. - */ -static int qib_mcast_add(struct qib_ibdev *dev, struct qib_ibport *ibp, - struct qib_mcast *mcast, struct qib_mcast_qp *mqp) -{ - struct rb_node **n = &ibp->mcast_tree.rb_node; - struct rb_node *pn = NULL; - int ret; - - spin_lock_irq(&ibp->lock); - - while (*n) { - struct qib_mcast *tmcast; - struct qib_mcast_qp *p; - - pn = *n; - tmcast = rb_entry(pn, struct qib_mcast, rb_node); - - ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, - sizeof(union ib_gid)); - if (ret < 0) { - n = &pn->rb_left; - continue; - } - if (ret > 0) { - n = &pn->rb_right; - continue; - } - - /* Search the QP list to see if this is already there. */ - list_for_each_entry_rcu(p, &tmcast->qp_list, list) { - if (p->qp == mqp->qp) { - ret = ESRCH; - goto bail; - } - } - if (tmcast->n_attached == ib_qib_max_mcast_qp_attached) { - ret = ENOMEM; - goto bail; - } - - tmcast->n_attached++; - - list_add_tail_rcu(&mqp->list, &tmcast->qp_list); - ret = EEXIST; - goto bail; - } - - spin_lock(&dev->n_mcast_grps_lock); - if (dev->n_mcast_grps_allocated == ib_qib_max_mcast_grps) { - spin_unlock(&dev->n_mcast_grps_lock); - ret = ENOMEM; - goto bail; - } - - dev->n_mcast_grps_allocated++; - spin_unlock(&dev->n_mcast_grps_lock); - - mcast->n_attached++; - - list_add_tail_rcu(&mqp->list, &mcast->qp_list); - - atomic_inc(&mcast->refcount); - rb_link_node(&mcast->rb_node, pn, n); - rb_insert_color(&mcast->rb_node, &ibp->mcast_tree); - - ret = 0; - -bail: - spin_unlock_irq(&ibp->lock); - - return ret; -} - -int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - struct qib_qp *qp = to_iqp(ibqp); - struct qib_ibdev *dev = to_idev(ibqp->device); - struct qib_ibport *ibp; - struct qib_mcast *mcast; - struct qib_mcast_qp *mqp; - int ret; - - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { - ret = -EINVAL; - goto bail; - } - - /* - * Allocate data structures since its better to do this outside of - * spin locks and it will most likely be needed. - */ - mcast = qib_mcast_alloc(gid); - if (mcast == NULL) { - ret = -ENOMEM; - goto bail; - } - mqp = qib_mcast_qp_alloc(qp); - if (mqp == NULL) { - qib_mcast_free(mcast); - ret = -ENOMEM; - goto bail; - } - ibp = to_iport(ibqp->device, qp->port_num); - switch (qib_mcast_add(dev, ibp, mcast, mqp)) { - case ESRCH: - /* Neither was used: OK to attach the same QP twice. */ - qib_mcast_qp_free(mqp); - qib_mcast_free(mcast); - break; - - case EEXIST: /* The mcast wasn't used */ - qib_mcast_free(mcast); - break; - - case ENOMEM: - /* Exceeded the maximum number of mcast groups. */ - qib_mcast_qp_free(mqp); - qib_mcast_free(mcast); - ret = -ENOMEM; - goto bail; - - default: - break; - } - - ret = 0; - -bail: - return ret; -} - -int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - struct qib_qp *qp = to_iqp(ibqp); - struct qib_ibdev *dev = to_idev(ibqp->device); - struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num); - struct qib_mcast *mcast = NULL; - struct qib_mcast_qp *p, *tmp, *delp = NULL; - struct rb_node *n; - int last = 0; - int ret; - - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) - return -EINVAL; - - spin_lock_irq(&ibp->lock); - - /* Find the GID in the mcast table. */ - n = ibp->mcast_tree.rb_node; - while (1) { - if (n == NULL) { - spin_unlock_irq(&ibp->lock); - return -EINVAL; - } - - mcast = rb_entry(n, struct qib_mcast, rb_node); - ret = memcmp(gid->raw, mcast->mgid.raw, - sizeof(union ib_gid)); - if (ret < 0) - n = n->rb_left; - else if (ret > 0) - n = n->rb_right; - else - break; - } - - /* Search the QP list. */ - list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { - if (p->qp != qp) - continue; - /* - * We found it, so remove it, but don't poison the forward - * link until we are sure there are no list walkers. - */ - list_del_rcu(&p->list); - mcast->n_attached--; - delp = p; - - /* If this was the last attached QP, remove the GID too. */ - if (list_empty(&mcast->qp_list)) { - rb_erase(&mcast->rb_node, &ibp->mcast_tree); - last = 1; - } - break; - } - - spin_unlock_irq(&ibp->lock); - /* QP not attached */ - if (!delp) - return -EINVAL; - /* - * Wait for any list walkers to finish before freeing the - * list element. - */ - wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); - qib_mcast_qp_free(delp); - - if (last) { - atomic_dec(&mcast->refcount); - wait_event(mcast->wait, !atomic_read(&mcast->refcount)); - qib_mcast_free(mcast); - spin_lock_irq(&dev->n_mcast_grps_lock); - dev->n_mcast_grps_allocated--; - spin_unlock_irq(&dev->n_mcast_grps_lock); - } - return 0; -} - -int qib_mcast_tree_empty(struct qib_ibport *ibp) -{ - return ibp->mcast_tree.rb_node == NULL; -} diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile new file mode 100644 index 000000000000..988b6a0101a4 --- /dev/null +++ b/drivers/infiniband/sw/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig new file mode 100644 index 000000000000..11aa6a34bd71 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -0,0 +1,6 @@ +config INFINIBAND_RDMAVT + tristate "RDMA verbs transport library" + depends on 64BIT + default m + ---help--- + This is a common software verbs provider for RDMA networks. diff --git a/drivers/infiniband/sw/rdmavt/Makefile b/drivers/infiniband/sw/rdmavt/Makefile new file mode 100644 index 000000000000..ccaa7992ac97 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/Makefile @@ -0,0 +1,13 @@ +# +# rdmavt driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt.o + +rdmavt-y := vt.o ah.o cq.o dma.o mad.o mcast.o mmap.o mr.o pd.o qp.o srq.o \ + trace.o + +CFLAGS_trace.o = -I$(src) diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c new file mode 100644 index 000000000000..16c446142c2a --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -0,0 +1,196 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/slab.h> +#include "ah.h" +#include "vt.h" /* for prints */ + +/** + * rvt_check_ah - validate the attributes of AH + * @ibdev: the ib device + * @ah_attr: the attributes of the AH + * + * If driver supports a more detailed check_ah function call back to it + * otherwise just check the basics. + * + * Return: 0 on success + */ +int rvt_check_ah(struct ib_device *ibdev, + struct ib_ah_attr *ah_attr) +{ + int err; + struct ib_port_attr port_attr; + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, + ah_attr->port_num); + + err = ib_query_port(ibdev, ah_attr->port_num, &port_attr); + if (err) + return -EINVAL; + if (ah_attr->port_num < 1 || + ah_attr->port_num > ibdev->phys_port_cnt) + return -EINVAL; + if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && + ib_rate_to_mbps(ah_attr->static_rate) < 0) + return -EINVAL; + if ((ah_attr->ah_flags & IB_AH_GRH) && + ah_attr->grh.sgid_index >= port_attr.gid_tbl_len) + return -EINVAL; + if (link != IB_LINK_LAYER_ETHERNET) { + if (ah_attr->dlid == 0) + return -EINVAL; + if (ah_attr->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) && + ah_attr->dlid != be16_to_cpu(IB_LID_PERMISSIVE) && + !(ah_attr->ah_flags & IB_AH_GRH)) + return -EINVAL; + } + if (rdi->driver_f.check_ah) + return rdi->driver_f.check_ah(ibdev, ah_attr); + return 0; +} +EXPORT_SYMBOL(rvt_check_ah); + +/** + * rvt_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + * + * Return: newly allocated ah + */ +struct ib_ah *rvt_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct rvt_ah *ah; + struct rvt_dev_info *dev = ib_to_rvt(pd->device); + unsigned long flags; + + if (rvt_check_ah(pd->device, ah_attr)) + return ERR_PTR(-EINVAL); + + ah = kmalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == dev->dparms.props.max_ah) { + spin_unlock(&dev->n_ahs_lock); + kfree(ah); + return ERR_PTR(-ENOMEM); + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + ah->attr = *ah_attr; + atomic_set(&ah->refcount, 0); + + if (dev->driver_f.notify_new_ah) + dev->driver_f.notify_new_ah(pd->device, ah_attr, ah); + + return &ah->ibah; +} + +/** + * rvt_destory_ah - Destory an address handle + * @ibah: address handle + * + * Return: 0 on success + */ +int rvt_destroy_ah(struct ib_ah *ibah) +{ + struct rvt_dev_info *dev = ib_to_rvt(ibah->device); + struct rvt_ah *ah = ibah_to_rvtah(ibah); + unsigned long flags; + + if (atomic_read(&ah->refcount) != 0) + return -EBUSY; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +/** + * rvt_modify_ah - modify an ah with given attrs + * @ibah: address handle to modify + * @ah_attr: attrs to apply + * + * Return: 0 on success + */ +int rvt_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct rvt_ah *ah = ibah_to_rvtah(ibah); + + if (rvt_check_ah(ibah->device, ah_attr)) + return -EINVAL; + + ah->attr = *ah_attr; + + return 0; +} + +/** + * rvt_query_ah - return attrs for ah + * @ibah: address handle to query + * @ah_attr: return info in this + * + * Return: always 0 + */ +int rvt_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct rvt_ah *ah = ibah_to_rvtah(ibah); + + *ah_attr = ah->attr; + + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h new file mode 100644 index 000000000000..e9c36be87d79 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/ah.h @@ -0,0 +1,59 @@ +#ifndef DEF_RVTAH_H +#define DEF_RVTAH_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +struct ib_ah *rvt_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); +int rvt_destroy_ah(struct ib_ah *ibah); +int rvt_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); +int rvt_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); + +#endif /* DEF_RVTAH_H */ diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 4f046ffe7e60..b1ffc8b4a6c0 100644 --- a/drivers/staging/rdma/hfi1/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -48,25 +45,23 @@ * */ -#include <linux/err.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/kthread.h> - -#include "verbs.h" -#include "hfi.h" +#include "cq.h" +#include "vt.h" /** - * hfi1_cq_enter - add a new entry to the completion queue + * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue * @entry: work completion entry to add - * @sig: true if @entry is a solicited entry + * @sig: true if @entry is solicited * * This may be called with qp->s_lock held. */ -void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { - struct hfi1_cq_wc *wc; + struct rvt_cq_wc *wc; unsigned long flags; u32 head; u32 next; @@ -79,11 +74,13 @@ void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) */ wc = cq->queue; head = wc->head; - if (head >= (unsigned) cq->ibcq.cqe) { + if (head >= (unsigned)cq->ibcq.cqe) { head = cq->ibcq.cqe; next = 0; - } else + } else { next = head + 1; + } + if (unlikely(next == wc->tail)) { spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { @@ -114,8 +111,9 @@ void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) wc->uqueue[head].port_num = entry->port_num; /* Make sure entry is written before the head index. */ smp_wmb(); - } else + } else { wc->kqueue[head] = *entry; + } wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || @@ -126,10 +124,10 @@ void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) * This will cause send_complete() to be called in * another thread. */ - smp_read_barrier_depends(); /* see hfi1_cq_exit */ - worker = cq->dd->worker; + smp_read_barrier_depends(); /* see rvt_cq_exit */ + worker = cq->rdi->worker; if (likely(worker)) { - cq->notify = IB_CQ_NONE; + cq->notify = RVT_CQ_NONE; cq->triggered++; queue_kthread_work(worker, &cq->comptask); } @@ -137,59 +135,11 @@ void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) spin_unlock_irqrestore(&cq->lock, flags); } - -/** - * hfi1_poll_cq - poll for work completion entries - * @ibcq: the completion queue to poll - * @num_entries: the maximum number of entries to return - * @entry: pointer to array where work completions are placed - * - * Returns the number of completion entries polled. - * - * This may be called from interrupt context. Also called by ib_poll_cq() - * in the generic verbs code. - */ -int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) -{ - struct hfi1_cq *cq = to_icq(ibcq); - struct hfi1_cq_wc *wc; - unsigned long flags; - int npolled; - u32 tail; - - /* The kernel can only poll a kernel completion queue */ - if (cq->ip) { - npolled = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&cq->lock, flags); - - wc = cq->queue; - tail = wc->tail; - if (tail > (u32) cq->ibcq.cqe) - tail = (u32) cq->ibcq.cqe; - for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { - if (tail == wc->head) - break; - /* The kernel doesn't need a RMB since it has the lock. */ - *entry = wc->kqueue[tail]; - if (tail >= cq->ibcq.cqe) - tail = 0; - else - tail++; - } - wc->tail = tail; - - spin_unlock_irqrestore(&cq->lock, flags); - -bail: - return npolled; -} +EXPORT_SYMBOL(rvt_cq_enter); static void send_complete(struct kthread_work *work) { - struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask); + struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); /* * The completion handler will most likely rearm the notification @@ -217,26 +167,25 @@ static void send_complete(struct kthread_work *work) } /** - * hfi1_create_cq - create a completion queue + * rvt_create_cq - create a completion queue * @ibdev: the device this completion queue is attached to * @attr: creation attributes - * @context: unused by the driver + * @context: unused by the QLogic_IB driver * @udata: user data for libibverbs.so * - * Returns a pointer to the completion queue or negative errno values - * for failure. - * * Called by ib_create_cq() in the generic verbs code. + * + * Return: pointer to the completion queue or negative errno values + * for failure. */ -struct ib_cq *hfi1_create_cq( - struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *context, - struct ib_udata *udata) +struct ib_cq *rvt_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) { - struct hfi1_ibdev *dev = to_idev(ibdev); - struct hfi1_cq *cq; - struct hfi1_cq_wc *wc; + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_cq *cq; + struct rvt_cq_wc *wc; struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; @@ -244,11 +193,11 @@ struct ib_cq *hfi1_create_cq( if (attr->flags) return ERR_PTR(-EINVAL); - if (entries < 1 || entries > hfi1_max_cqes) + if (entries < 1 || entries > rdi->dparms.props.max_cqe) return ERR_PTR(-EINVAL); /* Allocate the completion queue structure. */ - cq = kmalloc(sizeof(*cq), GFP_KERNEL); + cq = kzalloc(sizeof(*cq), GFP_KERNEL); if (!cq) return ERR_PTR(-ENOMEM); @@ -272,12 +221,12 @@ struct ib_cq *hfi1_create_cq( /* * Return the address of the WC as the offset to mmap. - * See hfi1_mmap() for details. + * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { int err; - cq->ip = hfi1_create_mmap_info(dev, sz, context, wc); + cq->ip = rvt_create_mmap_info(rdi, sz, context, wc); if (!cq->ip) { ret = ERR_PTR(-ENOMEM); goto bail_wc; @@ -289,23 +238,22 @@ struct ib_cq *hfi1_create_cq( ret = ERR_PTR(err); goto bail_ip; } - } else - cq->ip = NULL; + } - spin_lock(&dev->n_cqs_lock); - if (dev->n_cqs_allocated == hfi1_max_cqs) { - spin_unlock(&dev->n_cqs_lock); + spin_lock(&rdi->n_cqs_lock); + if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) { + spin_unlock(&rdi->n_cqs_lock); ret = ERR_PTR(-ENOMEM); goto bail_ip; } - dev->n_cqs_allocated++; - spin_unlock(&dev->n_cqs_lock); + rdi->n_cqs_allocated++; + spin_unlock(&rdi->n_cqs_lock); if (cq->ip) { - spin_lock_irq(&dev->pending_lock); - list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); + spin_lock_irq(&rdi->pending_lock); + list_add(&cq->ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); } /* @@ -313,14 +261,11 @@ struct ib_cq *hfi1_create_cq( * The number of entries should be >= the number requested or return * an error. */ - cq->dd = dd_from_dev(dev); + cq->rdi = rdi; cq->ibcq.cqe = entries; - cq->notify = IB_CQ_NONE; - cq->triggered = 0; + cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); init_kthread_work(&cq->comptask, send_complete); - wc->head = 0; - wc->tail = 0; cq->queue = wc; ret = &cq->ibcq; @@ -338,24 +283,24 @@ done: } /** - * hfi1_destroy_cq - destroy a completion queue + * rvt_destroy_cq - destroy a completion queue * @ibcq: the completion queue to destroy. * - * Returns 0 for success. - * * Called by ib_destroy_cq() in the generic verbs code. + * + * Return: always 0 */ -int hfi1_destroy_cq(struct ib_cq *ibcq) +int rvt_destroy_cq(struct ib_cq *ibcq) { - struct hfi1_ibdev *dev = to_idev(ibcq->device); - struct hfi1_cq *cq = to_icq(ibcq); + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_dev_info *rdi = cq->rdi; flush_kthread_work(&cq->comptask); - spin_lock(&dev->n_cqs_lock); - dev->n_cqs_allocated--; - spin_unlock(&dev->n_cqs_lock); + spin_lock(&rdi->n_cqs_lock); + rdi->n_cqs_allocated--; + spin_unlock(&rdi->n_cqs_lock); if (cq->ip) - kref_put(&cq->ip->ref, hfi1_release_mmap_info); + kref_put(&cq->ip->ref, rvt_release_mmap_info); else vfree(cq->queue); kfree(cq); @@ -364,18 +309,18 @@ int hfi1_destroy_cq(struct ib_cq *ibcq) } /** - * hfi1_req_notify_cq - change the notification type for a completion queue + * rvt_req_notify_cq - change the notification type for a completion queue * @ibcq: the completion queue * @notify_flags: the type of notification to request * - * Returns 0 for success. - * * This may be called from interrupt context. Also called by * ib_req_notify_cq() in the generic verbs code. + * + * Return: 0 for success. */ -int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) { - struct hfi1_cq *cq = to_icq(ibcq); + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); unsigned long flags; int ret = 0; @@ -397,24 +342,23 @@ int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) } /** - * hfi1_resize_cq - change the size of the CQ + * rvt_resize_cq - change the size of the CQ * @ibcq: the completion queue * - * Returns 0 for success. + * Return: 0 for success. */ -int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { - struct hfi1_cq *cq = to_icq(ibcq); - struct hfi1_cq_wc *old_wc; - struct hfi1_cq_wc *wc; + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *old_wc; + struct rvt_cq_wc *wc; u32 head, tail, n; int ret; u32 sz; + struct rvt_dev_info *rdi = cq->rdi; - if (cqe < 1 || cqe > hfi1_max_cqes) { - ret = -EINVAL; - goto bail; - } + if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) + return -EINVAL; /* * Need to use vmalloc() if we want to support large #s of entries. @@ -425,10 +369,8 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) else sz += sizeof(struct ib_wc) * (cqe + 1); wc = vmalloc_user(sz); - if (!wc) { - ret = -ENOMEM; - goto bail; - } + if (!wc) + return -ENOMEM; /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { @@ -446,11 +388,11 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) */ old_wc = cq->queue; head = old_wc->head; - if (head > (u32) cq->ibcq.cqe) - head = (u32) cq->ibcq.cqe; + if (head > (u32)cq->ibcq.cqe) + head = (u32)cq->ibcq.cqe; tail = old_wc->tail; - if (tail > (u32) cq->ibcq.cqe) - tail = (u32) cq->ibcq.cqe; + if (tail > (u32)cq->ibcq.cqe) + tail = (u32)cq->ibcq.cqe; if (head < tail) n = cq->ibcq.cqe + 1 + head - tail; else @@ -464,7 +406,7 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) wc->uqueue[n] = old_wc->uqueue[tail]; else wc->kqueue[n] = old_wc->kqueue[tail]; - if (tail == (u32) cq->ibcq.cqe) + if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; @@ -478,80 +420,131 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) vfree(old_wc); if (cq->ip) { - struct hfi1_ibdev *dev = to_idev(ibcq->device); - struct hfi1_mmap_info *ip = cq->ip; + struct rvt_mmap_info *ip = cq->ip; - hfi1_update_mmap_info(dev, ip, sz, wc); + rvt_update_mmap_info(rdi, ip, sz, wc); /* * Return the offset to mmap. - * See hfi1_mmap() for details. + * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { ret = ib_copy_to_udata(udata, &ip->offset, sizeof(ip->offset)); if (ret) - goto bail; + return ret; } - spin_lock_irq(&dev->pending_lock); + spin_lock_irq(&rdi->pending_lock); if (list_empty(&ip->pending_mmaps)) - list_add(&ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); + list_add(&ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); } - ret = 0; - goto bail; + return 0; bail_unlock: spin_unlock_irq(&cq->lock); bail_free: vfree(wc); -bail: return ret; } -int hfi1_cq_init(struct hfi1_devdata *dd) +/** + * rvt_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + * + * Return: the number of completion entries polled. + */ +int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) + return -EINVAL; + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32)cq->ibcq.cqe) + tail = (u32)cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +/** + * rvt_driver_cq_init - Init cq resources on behalf of driver + * @rdi: rvt dev structure + * + * Return: 0 on success + */ +int rvt_driver_cq_init(struct rvt_dev_info *rdi) { int ret = 0; int cpu; struct task_struct *task; - if (dd->worker) + if (rdi->worker) return 0; - dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); - if (!dd->worker) + rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL); + if (!rdi->worker) return -ENOMEM; - init_kthread_worker(dd->worker); + init_kthread_worker(rdi->worker); task = kthread_create_on_node( kthread_worker_fn, - dd->worker, - dd->assigned_node_id, - "hfi1_cq%d", dd->unit); - if (IS_ERR(task)) - goto task_fail; - cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); + rdi->worker, + rdi->dparms.node, + "%s", rdi->dparms.cq_name); + if (IS_ERR(task)) { + kfree(rdi->worker); + rdi->worker = NULL; + return PTR_ERR(task); + } + + cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); kthread_bind(task, cpu); wake_up_process(task); -out: return ret; -task_fail: - ret = PTR_ERR(task); - kfree(dd->worker); - dd->worker = NULL; - goto out; } -void hfi1_cq_exit(struct hfi1_devdata *dd) +/** + * rvt_cq_exit - tear down cq reources + * @rdi: rvt dev structure + */ +void rvt_cq_exit(struct rvt_dev_info *rdi) { struct kthread_worker *worker; - worker = dd->worker; + worker = rdi->worker; if (!worker) return; /* blocks future queuing from send_complete() */ - dd->worker = NULL; - smp_wmb(); /* See hfi1_cq_enter */ + rdi->worker = NULL; + smp_wmb(); /* See rdi_cq_enter */ flush_kthread_worker(worker); kthread_stop(worker->task); kfree(worker); diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h new file mode 100644 index 000000000000..6182c29eff66 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -0,0 +1,64 @@ +#ifndef DEF_RVTCQ_H +#define DEF_RVTCQ_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_cq.h> + +struct ib_cq *rvt_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); +int rvt_destroy_cq(struct ib_cq *ibcq); +int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); +int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); +int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); +int rvt_driver_cq_init(struct rvt_dev_info *rdi); +void rvt_cq_exit(struct rvt_dev_info *rdi); +#endif /* DEF_RVTCQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/dma.c b/drivers/infiniband/sw/rdmavt/dma.c new file mode 100644 index 000000000000..33076a5eee2f --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/dma.c @@ -0,0 +1,184 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/types.h> +#include <linux/scatterlist.h> +#include <rdma/ib_verbs.h> + +#include "dma.h" + +#define BAD_DMA_ADDRESS ((u64)0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int rvt_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 rvt_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + return (u64)cpu_addr; +} + +static void rvt_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static u64 rvt_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + if (offset + size > PAGE_SIZE) + return BAD_DMA_ADDRESS; + + addr = (u64)page_address(page); + if (addr) + addr += offset; + + return addr; +} + +static void rvt_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static int rvt_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + if (WARN_ON(!valid_dma_direction(direction))) + return 0; + + for_each_sg(sgl, sg, nents, i) { + addr = (u64)page_address(sg_page(sg)); + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void rvt_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static void rvt_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void rvt_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *rvt_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64)addr; + return addr; +} + +static void rvt_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops rvt_default_dma_mapping_ops = { + .mapping_error = rvt_mapping_error, + .map_single = rvt_dma_map_single, + .unmap_single = rvt_dma_unmap_single, + .map_page = rvt_dma_map_page, + .unmap_page = rvt_dma_unmap_page, + .map_sg = rvt_map_sg, + .unmap_sg = rvt_unmap_sg, + .sync_single_for_cpu = rvt_sync_single_for_cpu, + .sync_single_for_device = rvt_sync_single_for_device, + .alloc_coherent = rvt_dma_alloc_coherent, + .free_coherent = rvt_dma_free_coherent +}; diff --git a/drivers/infiniband/sw/rdmavt/dma.h b/drivers/infiniband/sw/rdmavt/dma.h new file mode 100644 index 000000000000..979f07e09195 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/dma.h @@ -0,0 +1,53 @@ +#ifndef DEF_RDMAVTDMA_H +#define DEF_RDMAVTDMA_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +extern struct ib_dma_mapping_ops rvt_default_dma_mapping_ops; + +#endif /* DEF_RDMAVTDMA_H */ diff --git a/drivers/infiniband/sw/rdmavt/mad.c b/drivers/infiniband/sw/rdmavt/mad.c new file mode 100644 index 000000000000..f6e99778d7ca --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mad.c @@ -0,0 +1,171 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/ib_mad.h> +#include "mad.h" +#include "vt.h" + +/** + * rvt_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on, 1 based from ib core + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + * + * Return: IB_MAD_RESULT_SUCCESS or error + */ +int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + /* + * MAD processing is quite different between hfi1 and qib. Therfore this + * is expected to be provided by the driver. Other drivers in the future + * may chose to implement this but it should not be made into a + * requirement. + */ + if (ibport_num_to_idx(ibdev, port_num) < 0) + return -EINVAL; + + return IB_MAD_RESULT_FAILURE; +} + +static void rvt_send_mad_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +/** + * rvt_create_mad_agents - create mad agents + * @rdi: rvt dev struct + * + * If driver needs to be notified of mad agent creation then call back + * + * Return 0 on success + */ +int rvt_create_mad_agents(struct rvt_dev_info *rdi) +{ + struct ib_mad_agent *agent; + struct rvt_ibport *rvp; + int p; + int ret; + + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + agent = ib_register_mad_agent(&rdi->ibdev, p + 1, + IB_QPT_SMI, + NULL, 0, rvt_send_mad_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + + rvp->send_agent = agent; + + if (rdi->driver_f.notify_create_mad_agent) + rdi->driver_f.notify_create_mad_agent(rdi, p); + } + + return 0; + +err: + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + if (rvp->send_agent) { + agent = rvp->send_agent; + rvp->send_agent = NULL; + ib_unregister_mad_agent(agent); + if (rdi->driver_f.notify_free_mad_agent) + rdi->driver_f.notify_free_mad_agent(rdi, p); + } + } + + return ret; +} + +/** + * rvt_free_mad_agents - free up mad agents + * @rdi: rvt dev struct + * + * If driver needs notification of mad agent removal make the call back + */ +void rvt_free_mad_agents(struct rvt_dev_info *rdi) +{ + struct ib_mad_agent *agent; + struct rvt_ibport *rvp; + int p; + + for (p = 0; p < rdi->dparms.nports; p++) { + rvp = rdi->ports[p]; + if (rvp->send_agent) { + agent = rvp->send_agent; + rvp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + if (rvp->sm_ah) { + ib_destroy_ah(&rvp->sm_ah->ibah); + rvp->sm_ah = NULL; + } + + if (rdi->driver_f.notify_free_mad_agent) + rdi->driver_f.notify_free_mad_agent(rdi, p); + } +} + diff --git a/drivers/infiniband/sw/rdmavt/mad.h b/drivers/infiniband/sw/rdmavt/mad.h new file mode 100644 index 000000000000..a9d6eecc3723 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mad.h @@ -0,0 +1,60 @@ +#ifndef DEF_RVTMAD_H +#define DEF_RVTMAD_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int rvt_create_mad_agents(struct rvt_dev_info *rdi); +void rvt_free_mad_agents(struct rvt_dev_info *rdi); +#endif /* DEF_RVTMAD_H */ diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c index afc6b4c61a1d..983d319ac976 100644 --- a/drivers/staging/rdma/hfi1/verbs_mcast.c +++ b/drivers/infiniband/sw/rdmavt/mcast.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -48,17 +45,36 @@ * */ +#include <linux/slab.h> +#include <linux/sched.h> #include <linux/rculist.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> -#include "hfi.h" +#include "mcast.h" + +/** + * rvt_driver_mcast - init resources for multicast + * @rdi: rvt dev struct + * + * This is per device that registers with rdmavt + */ +void rvt_driver_mcast_init(struct rvt_dev_info *rdi) +{ + /* + * Anything that needs setup for multicast on a per driver or per rdi + * basis should be done in here. + */ + spin_lock_init(&rdi->n_mcast_grps_lock); +} /** * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct * @qp: the QP to link */ -static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp) +static struct rvt_mcast_qp *rvt_mcast_qp_alloc(struct rvt_qp *qp) { - struct hfi1_mcast_qp *mqp; + struct rvt_mcast_qp *mqp; mqp = kmalloc(sizeof(*mqp), GFP_KERNEL); if (!mqp) @@ -71,9 +87,9 @@ bail: return mqp; } -static void mcast_qp_free(struct hfi1_mcast_qp *mqp) +static void rvt_mcast_qp_free(struct rvt_mcast_qp *mqp) { - struct hfi1_qp *qp = mqp->qp; + struct rvt_qp *qp = mqp->qp; /* Notify hfi1_destroy_qp() if it is waiting. */ if (atomic_dec_and_test(&qp->refcount)) @@ -88,11 +104,11 @@ static void mcast_qp_free(struct hfi1_mcast_qp *mqp) * * A list of QPs will be attached to this structure. */ -static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid) +static struct rvt_mcast *rvt_mcast_alloc(union ib_gid *mgid) { - struct hfi1_mcast *mcast; + struct rvt_mcast *mcast; - mcast = kmalloc(sizeof(*mcast), GFP_KERNEL); + mcast = kzalloc(sizeof(*mcast), GFP_KERNEL); if (!mcast) goto bail; @@ -100,75 +116,72 @@ static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid) INIT_LIST_HEAD(&mcast->qp_list); init_waitqueue_head(&mcast->wait); atomic_set(&mcast->refcount, 0); - mcast->n_attached = 0; bail: return mcast; } -static void mcast_free(struct hfi1_mcast *mcast) +static void rvt_mcast_free(struct rvt_mcast *mcast) { - struct hfi1_mcast_qp *p, *tmp; + struct rvt_mcast_qp *p, *tmp; list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) - mcast_qp_free(p); + rvt_mcast_qp_free(p); kfree(mcast); } /** - * hfi1_mcast_find - search the global table for the given multicast GID + * rvt_mcast_find - search the global table for the given multicast GID * @ibp: the IB port structure * @mgid: the multicast GID to search for * - * Returns NULL if not found. - * * The caller is responsible for decrementing the reference count if found. + * + * Return: NULL if not found. */ -struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid) +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid) { struct rb_node *n; unsigned long flags; - struct hfi1_mcast *mcast; + struct rvt_mcast *found = NULL; spin_lock_irqsave(&ibp->lock, flags); n = ibp->mcast_tree.rb_node; while (n) { int ret; + struct rvt_mcast *mcast; - mcast = rb_entry(n, struct hfi1_mcast, rb_node); + mcast = rb_entry(n, struct rvt_mcast, rb_node); ret = memcmp(mgid->raw, mcast->mgid.raw, sizeof(union ib_gid)); - if (ret < 0) + if (ret < 0) { n = n->rb_left; - else if (ret > 0) + } else if (ret > 0) { n = n->rb_right; - else { + } else { atomic_inc(&mcast->refcount); - spin_unlock_irqrestore(&ibp->lock, flags); - goto bail; + found = mcast; + break; } } spin_unlock_irqrestore(&ibp->lock, flags); - - mcast = NULL; - -bail: - return mcast; + return found; } +EXPORT_SYMBOL(rvt_mcast_find); /** * mcast_add - insert mcast GID into table and attach QP struct * @mcast: the mcast GID table * @mqp: the QP to attach * - * Return zero if both were added. Return EEXIST if the GID was already in + * Return: zero if both were added. Return EEXIST if the GID was already in * the table but the QP was added. Return ESRCH if the QP was already * attached and neither structure was added. */ -static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp, - struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp) +static int rvt_mcast_add(struct rvt_dev_info *rdi, struct rvt_ibport *ibp, + struct rvt_mcast *mcast, struct rvt_mcast_qp *mqp) { struct rb_node **n = &ibp->mcast_tree.rb_node; struct rb_node *pn = NULL; @@ -177,11 +190,11 @@ static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp, spin_lock_irq(&ibp->lock); while (*n) { - struct hfi1_mcast *tmcast; - struct hfi1_mcast_qp *p; + struct rvt_mcast *tmcast; + struct rvt_mcast_qp *p; pn = *n; - tmcast = rb_entry(pn, struct hfi1_mcast, rb_node); + tmcast = rb_entry(pn, struct rvt_mcast, rb_node); ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, sizeof(union ib_gid)); @@ -201,7 +214,8 @@ static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp, goto bail; } } - if (tmcast->n_attached == hfi1_max_mcast_qp_attached) { + if (tmcast->n_attached == + rdi->dparms.props.max_mcast_qp_attach) { ret = ENOMEM; goto bail; } @@ -213,15 +227,15 @@ static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp, goto bail; } - spin_lock(&dev->n_mcast_grps_lock); - if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) { - spin_unlock(&dev->n_mcast_grps_lock); + spin_lock(&rdi->n_mcast_grps_lock); + if (rdi->n_mcast_grps_allocated == rdi->dparms.props.max_mcast_grp) { + spin_unlock(&rdi->n_mcast_grps_lock); ret = ENOMEM; goto bail; } - dev->n_mcast_grps_allocated++; - spin_unlock(&dev->n_mcast_grps_lock); + rdi->n_mcast_grps_allocated++; + spin_unlock(&rdi->n_mcast_grps_lock); mcast->n_attached++; @@ -239,92 +253,98 @@ bail: return ret; } -int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +/** + * rvt_attach_mcast - attach a qp to a multicast group + * @ibqp: Infiniband qp + * @igd: multicast guid + * @lid: multicast lid + * + * Return: 0 on success + */ +int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { - struct hfi1_qp *qp = to_iqp(ibqp); - struct hfi1_ibdev *dev = to_idev(ibqp->device); - struct hfi1_ibport *ibp; - struct hfi1_mcast *mcast; - struct hfi1_mcast_qp *mqp; - int ret; + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1]; + struct rvt_mcast *mcast; + struct rvt_mcast_qp *mqp; + int ret = -ENOMEM; - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { - ret = -EINVAL; - goto bail; - } + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) + return -EINVAL; /* * Allocate data structures since its better to do this outside of * spin locks and it will most likely be needed. */ - mcast = mcast_alloc(gid); - if (mcast == NULL) { - ret = -ENOMEM; - goto bail; - } - mqp = mcast_qp_alloc(qp); - if (mqp == NULL) { - mcast_free(mcast); - ret = -ENOMEM; - goto bail; - } - ibp = to_iport(ibqp->device, qp->port_num); - switch (mcast_add(dev, ibp, mcast, mqp)) { - case ESRCH: - /* Neither was used: OK to attach the same QP twice. */ - mcast_qp_free(mqp); - mcast_free(mcast); - break; + mcast = rvt_mcast_alloc(gid); + if (!mcast) + return -ENOMEM; - case EEXIST: /* The mcast wasn't used */ - mcast_free(mcast); - break; + mqp = rvt_mcast_qp_alloc(qp); + if (!mqp) + goto bail_mcast; + switch (rvt_mcast_add(rdi, ibp, mcast, mqp)) { + case ESRCH: + /* Neither was used: OK to attach the same QP twice. */ + ret = 0; + goto bail_mqp; + case EEXIST: /* The mcast wasn't used */ + ret = 0; + goto bail_mcast; case ENOMEM: /* Exceeded the maximum number of mcast groups. */ - mcast_qp_free(mqp); - mcast_free(mcast); ret = -ENOMEM; - goto bail; - + goto bail_mqp; default: break; } - ret = 0; + return 0; + +bail_mqp: + rvt_mcast_qp_free(mqp); + +bail_mcast: + rvt_mcast_free(mcast); -bail: return ret; } -int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +/** + * rvt_detach_mcast - remove a qp from a multicast group + * @ibqp: Infiniband qp + * @igd: multicast guid + * @lid: multicast lid + * + * Return: 0 on success + */ +int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { - struct hfi1_qp *qp = to_iqp(ibqp); - struct hfi1_ibdev *dev = to_idev(ibqp->device); - struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num); - struct hfi1_mcast *mcast = NULL; - struct hfi1_mcast_qp *p, *tmp; + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1]; + struct rvt_mcast *mcast = NULL; + struct rvt_mcast_qp *p, *tmp, *delp = NULL; struct rb_node *n; int last = 0; - int ret; + int ret = 0; - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { - ret = -EINVAL; - goto bail; - } + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) + return -EINVAL; spin_lock_irq(&ibp->lock); /* Find the GID in the mcast table. */ n = ibp->mcast_tree.rb_node; while (1) { - if (n == NULL) { + if (!n) { spin_unlock_irq(&ibp->lock); - ret = -EINVAL; - goto bail; + return -EINVAL; } - mcast = rb_entry(n, struct hfi1_mcast, rb_node); + mcast = rb_entry(n, struct rvt_mcast, rb_node); ret = memcmp(gid->raw, mcast->mgid.raw, sizeof(union ib_gid)); if (ret < 0) @@ -345,6 +365,7 @@ int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) */ list_del_rcu(&p->list); mcast->n_attached--; + delp = p; /* If this was the last attached QP, remove the GID too. */ if (list_empty(&mcast->qp_list)) { @@ -355,31 +376,42 @@ int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } spin_unlock_irq(&ibp->lock); + /* QP not attached */ + if (!delp) + return -EINVAL; + + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + rvt_mcast_qp_free(delp); - if (p) { - /* - * Wait for any list walkers to finish before freeing the - * list element. - */ - wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); - mcast_qp_free(p); - } if (last) { atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); - mcast_free(mcast); - spin_lock_irq(&dev->n_mcast_grps_lock); - dev->n_mcast_grps_allocated--; - spin_unlock_irq(&dev->n_mcast_grps_lock); + rvt_mcast_free(mcast); + spin_lock_irq(&rdi->n_mcast_grps_lock); + rdi->n_mcast_grps_allocated--; + spin_unlock_irq(&rdi->n_mcast_grps_lock); } - ret = 0; - -bail: - return ret; + return 0; } -int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp) +/** + *rvt_mast_tree_empty - determine if any qps are attached to any mcast group + *@rdi: rvt dev struct + * + * Return: in use count + */ +int rvt_mcast_tree_empty(struct rvt_dev_info *rdi) { - return ibp->mcast_tree.rb_node == NULL; + int i; + int in_use = 0; + + for (i = 0; i < rdi->dparms.nports; i++) + if (rdi->ports[i]->mcast_tree.rb_node) + in_use++; + return in_use; } diff --git a/drivers/infiniband/sw/rdmavt/mcast.h b/drivers/infiniband/sw/rdmavt/mcast.h new file mode 100644 index 000000000000..29f579267608 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mcast.h @@ -0,0 +1,58 @@ +#ifndef DEF_RVTMCAST_H +#define DEF_RVTMCAST_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +void rvt_driver_mcast_init(struct rvt_dev_info *rdi); +int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); +int rvt_mcast_tree_empty(struct rvt_dev_info *rdi); + +#endif /* DEF_RVTMCAST_H */ diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c index 5173b1c60b3d..e202b8142759 100644 --- a/drivers/staging/rdma/hfi1/mmap.c +++ b/drivers/infiniband/sw/rdmavt/mmap.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -48,68 +45,74 @@ * */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/errno.h> #include <asm/pgtable.h> +#include "mmap.h" -#include "verbs.h" +/** + * rvt_mmap_init - init link list and lock for mem map + * @rdi: rvt dev struct + */ +void rvt_mmap_init(struct rvt_dev_info *rdi) +{ + INIT_LIST_HEAD(&rdi->pending_mmaps); + spin_lock_init(&rdi->pending_lock); + rdi->mmap_offset = PAGE_SIZE; + spin_lock_init(&rdi->mmap_offset_lock); +} /** - * hfi1_release_mmap_info - free mmap info structure - * @ref: a pointer to the kref within struct hfi1_mmap_info + * rvt_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct rvt_mmap_info */ -void hfi1_release_mmap_info(struct kref *ref) +void rvt_release_mmap_info(struct kref *ref) { - struct hfi1_mmap_info *ip = - container_of(ref, struct hfi1_mmap_info, ref); - struct hfi1_ibdev *dev = to_idev(ip->context->device); + struct rvt_mmap_info *ip = + container_of(ref, struct rvt_mmap_info, ref); + struct rvt_dev_info *rdi = ib_to_rvt(ip->context->device); - spin_lock_irq(&dev->pending_lock); + spin_lock_irq(&rdi->pending_lock); list_del(&ip->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); + spin_unlock_irq(&rdi->pending_lock); vfree(ip->obj); kfree(ip); } -/* - * open and close keep track of how many times the CQ is mapped, - * to avoid releasing it. - */ -static void hfi1_vma_open(struct vm_area_struct *vma) +static void rvt_vma_open(struct vm_area_struct *vma) { - struct hfi1_mmap_info *ip = vma->vm_private_data; + struct rvt_mmap_info *ip = vma->vm_private_data; kref_get(&ip->ref); } -static void hfi1_vma_close(struct vm_area_struct *vma) +static void rvt_vma_close(struct vm_area_struct *vma) { - struct hfi1_mmap_info *ip = vma->vm_private_data; + struct rvt_mmap_info *ip = vma->vm_private_data; - kref_put(&ip->ref, hfi1_release_mmap_info); + kref_put(&ip->ref, rvt_release_mmap_info); } -static struct vm_operations_struct hfi1_vm_ops = { - .open = hfi1_vma_open, - .close = hfi1_vma_close, +static const struct vm_operations_struct rvt_vm_ops = { + .open = rvt_vma_open, + .close = rvt_vma_close, }; /** - * hfi1_mmap - create a new mmap region + * rvt_mmap - create a new mmap region * @context: the IB user context of the process making the mmap() call * @vma: the VMA to be initialized - * Return zero if the mmap is OK. Otherwise, return an errno. + * + * Return: zero if the mmap is OK. Otherwise, return an errno. */ -int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { - struct hfi1_ibdev *dev = to_idev(context->device); + struct rvt_dev_info *rdi = ib_to_rvt(context->device); unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; - struct hfi1_mmap_info *ip, *pp; + struct rvt_mmap_info *ip, *pp; int ret = -EINVAL; /* @@ -117,53 +120,60 @@ int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) * Normally, this list is very short since a call to create a * CQ, QP, or SRQ is soon followed by a call to mmap(). */ - spin_lock_irq(&dev->pending_lock); - list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + spin_lock_irq(&rdi->pending_lock); + list_for_each_entry_safe(ip, pp, &rdi->pending_mmaps, pending_mmaps) { /* Only the creator is allowed to mmap the object */ - if (context != ip->context || (__u64) offset != ip->offset) + if (context != ip->context || (__u64)offset != ip->offset) continue; /* Don't allow a mmap larger than the object. */ if (size > ip->size) break; list_del_init(&ip->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); + spin_unlock_irq(&rdi->pending_lock); ret = remap_vmalloc_range(vma, ip->obj, 0); if (ret) goto done; - vma->vm_ops = &hfi1_vm_ops; + vma->vm_ops = &rvt_vm_ops; vma->vm_private_data = ip; - hfi1_vma_open(vma); + rvt_vma_open(vma); goto done; } - spin_unlock_irq(&dev->pending_lock); + spin_unlock_irq(&rdi->pending_lock); done: return ret; } -/* - * Allocate information for hfi1_mmap +/** + * rvt_create_mmap_info - allocate information for hfi1_mmap + * @rdi: rvt dev struct + * @size: size in bytes to map + * @context: user context + * @obj: opaque pointer to a cq, wq etc + * + * Return: rvt_mmap struct on success */ -struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, - u32 size, - struct ib_ucontext *context, - void *obj) { - struct hfi1_mmap_info *ip; +struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, + u32 size, + struct ib_ucontext *context, + void *obj) +{ + struct rvt_mmap_info *ip; - ip = kmalloc(sizeof(*ip), GFP_KERNEL); + ip = kmalloc_node(sizeof(*ip), GFP_KERNEL, rdi->dparms.node); if (!ip) - goto bail; + return ip; size = PAGE_ALIGN(size); - spin_lock_irq(&dev->mmap_offset_lock); - if (dev->mmap_offset == 0) - dev->mmap_offset = PAGE_SIZE; - ip->offset = dev->mmap_offset; - dev->mmap_offset += size; - spin_unlock_irq(&dev->mmap_offset_lock); + spin_lock_irq(&rdi->mmap_offset_lock); + if (rdi->mmap_offset == 0) + rdi->mmap_offset = PAGE_SIZE; + ip->offset = rdi->mmap_offset; + rdi->mmap_offset += size; + spin_unlock_irq(&rdi->mmap_offset_lock); INIT_LIST_HEAD(&ip->pending_mmaps); ip->size = size; @@ -171,21 +181,27 @@ struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, ip->obj = obj; kref_init(&ip->ref); -bail: return ip; } -void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip, - u32 size, void *obj) +/** + * rvt_update_mmap_info - update a mem map + * @rdi: rvt dev struct + * @ip: mmap info pointer + * @size: size to grow by + * @obj: opaque pointer to cq, wq, etc. + */ +void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, + u32 size, void *obj) { size = PAGE_ALIGN(size); - spin_lock_irq(&dev->mmap_offset_lock); - if (dev->mmap_offset == 0) - dev->mmap_offset = PAGE_SIZE; - ip->offset = dev->mmap_offset; - dev->mmap_offset += size; - spin_unlock_irq(&dev->mmap_offset_lock); + spin_lock_irq(&rdi->mmap_offset_lock); + if (rdi->mmap_offset == 0) + rdi->mmap_offset = PAGE_SIZE; + ip->offset = rdi->mmap_offset; + rdi->mmap_offset += size; + spin_unlock_irq(&rdi->mmap_offset_lock); ip->size = size; ip->obj = obj; diff --git a/drivers/infiniband/sw/rdmavt/mmap.h b/drivers/infiniband/sw/rdmavt/mmap.h new file mode 100644 index 000000000000..fab0e7b1daf9 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mmap.h @@ -0,0 +1,63 @@ +#ifndef DEF_RDMAVTMMAP_H +#define DEF_RDMAVTMMAP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +void rvt_mmap_init(struct rvt_dev_info *rdi); +void rvt_release_mmap_info(struct kref *ref); +int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, + u32 size, + struct ib_ucontext *context, + void *obj); +void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip, + u32 size, void *obj); + +#endif /* DEF_RDMAVTMMAP_H */ diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c new file mode 100644 index 000000000000..0ff765bfd619 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -0,0 +1,830 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <rdma/ib_umem.h> +#include <rdma/rdma_vt.h> +#include "vt.h" +#include "mr.h" + +/** + * rvt_driver_mr_init - Init MR resources per driver + * @rdi: rvt dev struct + * + * Do any intilization needed when a driver registers with rdmavt. + * + * Return: 0 on success or errno on failure + */ +int rvt_driver_mr_init(struct rvt_dev_info *rdi) +{ + unsigned int lkey_table_size = rdi->dparms.lkey_table_size; + unsigned lk_tab_size; + int i; + + /* + * The top hfi1_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + if (!lkey_table_size) + return -EINVAL; + + spin_lock_init(&rdi->lkey_table.lock); + + /* ensure generation is at least 4 bits */ + if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) { + rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n", + lkey_table_size, RVT_MAX_LKEY_TABLE_BITS); + rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS; + lkey_table_size = rdi->dparms.lkey_table_size; + } + rdi->lkey_table.max = 1 << lkey_table_size; + lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table); + rdi->lkey_table.table = (struct rvt_mregion __rcu **) + vmalloc_node(lk_tab_size, rdi->dparms.node); + if (!rdi->lkey_table.table) + return -ENOMEM; + + RCU_INIT_POINTER(rdi->dma_mr, NULL); + for (i = 0; i < rdi->lkey_table.max; i++) + RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL); + + return 0; +} + +/** + *rvt_mr_exit: clean up MR + *@rdi: rvt dev structure + * + * called when drivers have unregistered or perhaps failed to register with us + */ +void rvt_mr_exit(struct rvt_dev_info *rdi) +{ + if (rdi->dma_mr) + rvt_pr_err(rdi, "DMA MR not null!\n"); + + vfree(rdi->lkey_table.table); +} + +static void rvt_deinit_mregion(struct rvt_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); +} + +static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, + int count) +{ + int m, i = 0; + + mr->mapsz = 0; + m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); + if (!mr->map[i]) { + rvt_deinit_mregion(mr); + return -ENOMEM; + } + mr->mapsz++; + } + init_completion(&mr->comp); + /* count returning the ptr to user */ + atomic_set(&mr->refcount, 1); + mr->pd = pd; + mr->max_segs = count; + return 0; +} + +/** + * rvt_alloc_lkey - allocate an lkey + * @mr: memory region that this lkey protects + * @dma_region: 0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count as required. + * + * Sets the lkey field mr for non-dma regions. + * + */ +static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region) +{ + unsigned long flags; + u32 r; + u32 n; + int ret = 0; + struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + + rvt_get_mr(mr); + spin_lock_irqsave(&rkt->lock, flags); + + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + struct rvt_mregion *tmr; + + tmr = rcu_access_pointer(dev->dma_mr); + if (!tmr) { + rcu_assign_pointer(dev->dma_mr, mr); + mr->lkey_published = 1; + } else { + rvt_put_mr(mr); + } + goto success; + } + + /* Find the next available LKEY */ + r = rkt->next; + n = r; + for (;;) { + if (!rcu_access_pointer(rkt->table[r])) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) + goto bail; + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + /* + * bits are capped to ensure enough bits for generation number + */ + mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) | + ((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rcu_assign_pointer(rkt->table[r], mr); + mr->lkey_published = 1; +success: + spin_unlock_irqrestore(&rkt->lock, flags); +out: + return ret; +bail: + rvt_put_mr(mr); + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; +} + +/** + * rvt_free_lkey - free an lkey + * @mr: mr to free from tables + */ +static void rvt_free_lkey(struct rvt_mregion *mr) +{ + unsigned long flags; + u32 lkey = mr->lkey; + u32 r; + struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + int freed = 0; + + spin_lock_irqsave(&rkt->lock, flags); + if (!mr->lkey_published) + goto out; + if (lkey == 0) { + RCU_INIT_POINTER(dev->dma_mr, NULL); + } else { + r = lkey >> (32 - dev->dparms.lkey_table_size); + RCU_INIT_POINTER(rkt->table[r], NULL); + } + mr->lkey_published = 0; + freed++; +out: + spin_unlock_irqrestore(&rkt->lock, flags); + if (freed) { + synchronize_rcu(); + rvt_put_mr(mr); + } +} + +static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd) +{ + struct rvt_mr *mr; + int rval = -ENOMEM; + int m; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; + mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); + if (!mr) + goto bail; + + rval = rvt_init_mregion(&mr->mr, pd, count); + if (rval) + goto bail; + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + rval = rvt_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; + mr->ibmr.lkey = mr->mr.lkey; + mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; + +bail_mregion: + rvt_deinit_mregion(&mr->mr); +bail: + kfree(mr); + mr = ERR_PTR(rval); + goto done; +} + +static void __rvt_free_mr(struct rvt_mr *mr) +{ + rvt_deinit_mregion(&mr->mr); + rvt_free_lkey(&mr->mr); + vfree(mr); +} + +/** + * rvt_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Return: the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see dma.c). + */ +struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct rvt_mr *mr; + struct ib_mr *ret; + int rval; + + if (ibpd_to_rvtpd(pd)->user) + return ERR_PTR(-EPERM); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + rval = rvt_init_mregion(&mr->mr, pd, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } + + rval = rvt_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; +done: + return ret; + +bail_mregion: + rvt_deinit_mregion(&mr->mr); +bail: + kfree(mr); + goto done; +} + +/** + * rvt_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @mr_access_flags: access flags for this memory region + * @udata: unused by the driver + * + * Return: the memory region on success, otherwise returns an errno. + */ +struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct rvt_mr *mr; + struct ib_umem *umem; + struct scatterlist *sg; + int n, m, entry; + struct ib_mr *ret; + + if (length == 0) + return ERR_PTR(-EINVAL); + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *)umem; + + n = umem->nmap; + + mr = __rvt_alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + goto bail_umem; + } + + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->umem = umem; + + if (is_power_of_2(umem->page_size)) + mr->mr.page_shift = ilog2(umem->page_size); + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail_inval; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == RVT_SEGSZ) { + m++; + n = 0; + } + } + return &mr->ibmr; + +bail_inval: + __rvt_free_mr(mr); + +bail_umem: + ib_umem_release(umem); + + return ret; +} + +/** + * rvt_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * + * Note that this is called to free MRs created by rvt_get_dma_mr() + * or rvt_reg_user_mr(). + * + * Returns 0 on success. + */ +int rvt_dereg_mr(struct ib_mr *ibmr) +{ + struct rvt_mr *mr = to_imr(ibmr); + struct rvt_dev_info *rdi = ib_to_rvt(ibmr->pd->device); + int ret = 0; + unsigned long timeout; + + rvt_free_lkey(&mr->mr); + + rvt_put_mr(&mr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&mr->mr.comp, 5 * HZ); + if (!timeout) { + rvt_pr_err(rdi, + "rvt_dereg_mr timeout mr %p pd %p refcount %u\n", + mr, mr->mr.pd, atomic_read(&mr->mr.refcount)); + rvt_get_mr(&mr->mr); + ret = -EBUSY; + goto out; + } + rvt_deinit_mregion(&mr->mr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); +out: + return ret; +} + +/** + * rvt_alloc_mr - Allocate a memory region usable with the + * @pd: protection domain for this memory region + * @mr_type: mem region type + * @max_num_sg: Max number of segments allowed + * + * Return: the memory region on success, otherwise return an errno. + */ +struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rvt_mr *mr; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = __rvt_alloc_mr(max_num_sg, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; + + return &mr->ibmr; +} + +/** + * rvt_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Return: the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct rvt_fmr *fmr; + int m; + struct ib_fmr *ret; + int rval = -ENOMEM; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ; + fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); + if (!fmr) + goto bail; + + rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages); + if (rval) + goto bail; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + rval = rvt_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; + fmr->ibfmr.rkey = fmr->mr.lkey; + fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->mr.page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; +done: + return ret; + +bail_mregion: + rvt_deinit_mregion(&fmr->mr); +bail: + kfree(fmr); + ret = ERR_PTR(rval); + goto done; +} + +/** + * rvt_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + * + * Return: 0 on success + */ + +int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct rvt_fmr *fmr = to_ifmr(ibfmr); + struct rvt_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device); + + i = atomic_read(&fmr->mr.refcount); + if (i > 2) + return -EBUSY; + + if (list_len > fmr->mr.max_segs) + return -EINVAL; + + rkt = &rdi->lkey_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->mr.page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == RVT_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + return 0; +} + +/** + * rvt_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Return: 0 on success. + */ +int rvt_unmap_fmr(struct list_head *fmr_list) +{ + struct rvt_fmr *fmr; + struct rvt_lkey_table *rkt; + unsigned long flags; + struct rvt_dev_info *rdi; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rdi = ib_to_rvt(fmr->ibfmr.device); + rkt = &rdi->lkey_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * rvt_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Return: 0 on success. + */ +int rvt_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct rvt_fmr *fmr = to_ifmr(ibfmr); + int ret = 0; + unsigned long timeout; + + rvt_free_lkey(&fmr->mr); + rvt_put_mr(&fmr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&fmr->mr.comp, 5 * HZ); + if (!timeout) { + rvt_get_mr(&fmr->mr); + ret = -EBUSY; + goto out; + } + rvt_deinit_mregion(&fmr->mr); + kfree(fmr); +out: + return ret; +} + +/** + * rvt_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @pd: protection domain + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Check the IB SGE for validity and initialize our internal version + * of it. + * + * Return: 1 if valid and successful, otherwise returns 0. + * + * increments the reference count upon success + * + */ +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct ib_sge *sge, int acc) +{ + struct rvt_mregion *mr; + unsigned n, m; + size_t off; + struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device); + + /* + * We use LKEY == zero for kernel virtual addresses + * (see rvt_get_dma_mr and dma.c). + */ + rcu_read_lock(); + if (sge->lkey == 0) { + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + isge->mr = mr; + isge->vaddr = (void *)sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + isge->m = 0; + isge->n = 0; + goto ok; + } + mr = rcu_dereference( + rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + goto bail; + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + * page sizes are uniform power of 2 so no loop is necessary + * entries_spanned_by_off is the number of times the loop below + * would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / RVT_SEGSZ; + n = entries_spanned_by_off % RVT_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= RVT_SEGSZ) { + m++; + n = 0; + } + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rvt_lkey_ok); + +/** + * rvt_rkey_ok - check the IB virtual address, length, and RKEY + * @qp: qp for validation + * @sge: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return: 1 if successful, otherwise 0. + * + * increments the reference count upon success + */ +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see rvt_get_dma_mr and dma.c). + */ + rcu_read_lock(); + if (rkey == 0) { + struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd); + struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(rdi->dma_mr); + if (!mr) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + sge->mr = mr; + sge->vaddr = (void *)vaddr; + sge->length = len; + sge->sge_length = len; + sge->m = 0; + sge->n = 0; + goto ok; + } + + mr = rcu_dereference( + rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + * page sizes are uniform power of 2 so no loop is necessary + * entries_spanned_by_off is the number of times the loop below + * would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / RVT_SEGSZ; + n = entries_spanned_by_off % RVT_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= RVT_SEGSZ) { + m++; + n = 0; + } + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rvt_rkey_ok); diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h new file mode 100644 index 000000000000..69380512c6d1 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -0,0 +1,92 @@ +#ifndef DEF_RVTMR_H +#define DEF_RVTMR_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> +struct rvt_fmr { + struct ib_fmr ibfmr; + struct rvt_mregion mr; /* must be last */ +}; + +struct rvt_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct rvt_mregion mr; /* must be last */ +}; + +static inline struct rvt_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct rvt_fmr, ibfmr); +} + +static inline struct rvt_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct rvt_mr, ibmr); +} + +int rvt_driver_mr_init(struct rvt_dev_info *rdi); +void rvt_mr_exit(struct rvt_dev_info *rdi); + +/* Mem Regions */ +struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); +int rvt_dereg_mr(struct ib_mr *ibmr); +struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); +int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); +int rvt_unmap_fmr(struct list_head *fmr_list); +int rvt_dealloc_fmr(struct ib_fmr *ibfmr); + +#endif /* DEF_RVTMR_H */ diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c new file mode 100644 index 000000000000..d1292f324c67 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/pd.c @@ -0,0 +1,119 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/slab.h> +#include "pd.h" + +/** + * rvt_alloc_pd - allocate a protection domain + * @ibdev: ib device + * @context: optional user context + * @udata: optional user data + * + * Allocate and keep track of a PD. + * + * Return: 0 on success + */ +struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rvt_dev_info *dev = ib_to_rvt(ibdev); + struct rvt_pd *pd; + struct ib_pd *ret; + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + /* + * While we could continue allocating protecetion domains, being + * constrained only by system resources. The IBTA spec defines that + * there is a max_pd limit that can be set and we need to check for + * that. + */ + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == dev->dparms.props.max_pd) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata ? 1 : 0; + + ret = &pd->ibpd; + +bail: + return ret; +} + +/** + * rvt_dealloc_pd - Free PD + * @ibpd: Free up PD + * + * Return: always 0 + */ +int rvt_dealloc_pd(struct ib_pd *ibpd) +{ + struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); + struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h new file mode 100644 index 000000000000..1892ca4a9746 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/pd.h @@ -0,0 +1,58 @@ +#ifndef DEF_RDMAVTPD_H +#define DEF_RDMAVTPD_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata); +int rvt_dealloc_pd(struct ib_pd *ibpd); + +#endif /* DEF_RDMAVTPD_H */ diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c new file mode 100644 index 000000000000..bd82a6948dc8 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -0,0 +1,1696 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/hash.h> +#include <linux/bitops.h> +#include <linux/lockdep.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <rdma/ib_verbs.h> +#include "qp.h" +#include "vt.h" +#include "trace.h" + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; rvt_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = RVT_POST_RECV_OK, + [IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK, + [IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK | + RVT_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK, + [IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK | + RVT_POST_SEND_OK | RVT_FLUSH_SEND, + [IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV | + RVT_POST_SEND_OK | RVT_FLUSH_SEND, +}; +EXPORT_SYMBOL(ib_rvt_state_ops); + +static void get_map_page(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, + gfp_t gfp) +{ + unsigned long page = get_zeroed_page(gfp); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/** + * init_qpn_table - initialize the QP number table for a device + * @qpt: the QPN table + */ +static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt) +{ + u32 offset, i; + struct rvt_qpn_map *map; + int ret = 0; + + if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start)) + return -EINVAL; + + spin_lock_init(&qpt->lock); + + qpt->last = rdi->dparms.qpn_start; + qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift; + + /* + * Drivers may want some QPs beyond what we need for verbs let them use + * our qpn table. No need for two. Lets go ahead and mark the bitmaps + * for those. The reserved range must be *after* the range which verbs + * will pick from. + */ + + /* Figure out number of bit maps needed before reserved range */ + qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE; + + /* This should always be zero */ + offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK; + + /* Starting with the first reserved bit map */ + map = &qpt->map[qpt->nmaps]; + + rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n", + rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end); + for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) { + if (!map->page) { + get_map_page(qpt, map, GFP_KERNEL); + if (!map->page) { + ret = -ENOMEM; + break; + } + } + set_bit(offset, map->page); + offset++; + if (offset == RVT_BITS_PER_PAGE) { + /* next page */ + qpt->nmaps++; + map++; + offset = 0; + } + } + return ret; +} + +/** + * free_qpn_table - free the QP number table for a device + * @qpt: the QPN table + */ +static void free_qpn_table(struct rvt_qpn_table *qpt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qpt->map); i++) + free_page((unsigned long)qpt->map[i].page); +} + +/** + * rvt_driver_qp_init - Init driver qp resources + * @rdi: rvt dev strucutre + * + * Return: 0 on success + */ +int rvt_driver_qp_init(struct rvt_dev_info *rdi) +{ + int i; + int ret = -ENOMEM; + + if (!rdi->dparms.qp_table_size) + return -EINVAL; + + /* + * If driver is not doing any QP allocation then make sure it is + * providing the necessary QP functions. + */ + if (!rdi->driver_f.free_all_qps || + !rdi->driver_f.qp_priv_alloc || + !rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset) + return -EINVAL; + + /* allocate parent object */ + rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL, + rdi->dparms.node); + if (!rdi->qp_dev) + return -ENOMEM; + + /* allocate hash table */ + rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; + rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); + rdi->qp_dev->qp_table = + kmalloc_node(rdi->qp_dev->qp_table_size * + sizeof(*rdi->qp_dev->qp_table), + GFP_KERNEL, rdi->dparms.node); + if (!rdi->qp_dev->qp_table) + goto no_qp_table; + + for (i = 0; i < rdi->qp_dev->qp_table_size; i++) + RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL); + + spin_lock_init(&rdi->qp_dev->qpt_lock); + + /* initialize qpn map */ + if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table)) + goto fail_table; + + spin_lock_init(&rdi->n_qps_lock); + + return 0; + +fail_table: + kfree(rdi->qp_dev->qp_table); + free_qpn_table(&rdi->qp_dev->qpn_table); + +no_qp_table: + kfree(rdi->qp_dev); + + return ret; +} + +/** + * free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi) +{ + unsigned long flags; + struct rvt_qp *qp; + unsigned n, qp_inuse = 0; + spinlock_t *ql; /* work around too long line below */ + + if (rdi->driver_f.free_all_qps) + qp_inuse = rdi->driver_f.free_all_qps(rdi); + + qp_inuse += rvt_mcast_tree_empty(rdi); + + if (!rdi->qp_dev) + return qp_inuse; + + ql = &rdi->qp_dev->qpt_lock; + spin_lock_irqsave(ql, flags); + for (n = 0; n < rdi->qp_dev->qp_table_size; n++) { + qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n], + lockdep_is_held(ql)); + RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL); + + for (; qp; qp = rcu_dereference_protected(qp->next, + lockdep_is_held(ql))) + qp_inuse++; + } + spin_unlock_irqrestore(ql, flags); + synchronize_rcu(); + return qp_inuse; +} + +/** + * rvt_qp_exit - clean up qps on device exit + * @rdi: rvt dev structure + * + * Check for qp leaks and free resources. + */ +void rvt_qp_exit(struct rvt_dev_info *rdi) +{ + u32 qps_inuse = rvt_free_all_qps(rdi); + + if (qps_inuse) + rvt_pr_err(rdi, "QP memory leak! %u still in use\n", + qps_inuse); + if (!rdi->qp_dev) + return; + + kfree(rdi->qp_dev->qp_table); + free_qpn_table(&rdi->qp_dev->qpn_table); + kfree(rdi->qp_dev); +} + +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) +{ + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; +} + +/** + * alloc_qpn - Allocate the next available qpn or zero/one for QP type + * IB_QPT_SMI/IB_QPT_GSI + *@rdi: rvt device info structure + *@qpt: queue pair number table pointer + *@port_num: IB port number, 1 based, comes from core + * + * Return: The queue pair number + */ +static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num, gfp_t gfp) +{ + u32 i, offset, max_scan, qpn; + struct rvt_qpn_map *map; + u32 ret; + + if (rdi->driver_f.alloc_qpn) + return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp); + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port_num - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + qpt->incr; + if (qpn >= RVT_QPN_MAX) + qpn = qpt->incr | ((qpt->last & 1) ^ 1); + /* offset carries bit 0 */ + offset = qpn & RVT_BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / RVT_BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map, gfp); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset += qpt->incr; + /* + * This qpn might be bogus if offset >= BITS_PER_PAGE. + * That is OK. It gets re-assigned below + */ + qpn = mk_qpn(qpt, map, offset); + } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == RVT_QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else { + map = &qpt->map[0]; + /* wrap to first map page, invert bit 0 */ + offset = qpt->incr | ((offset & 1) ^ 1); + } + /* there can be no bits at shift and below */ + WARN_ON(offset & (rdi->dparms.qos_shift - 1)); + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn) +{ + struct rvt_qpn_map *map; + + map = qpt->map + qpn / RVT_BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); +} + +/** + * rvt_clear_mr_refs - Drop help mr refs + * @qp: rvt qp data structure + * @clr_sends: If shoudl clear send side or not + */ +static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) +{ + unsigned n; + + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) + rvt_put_ss(&qp->s_rdma_read_sge); + + rvt_put_ss(&qp->r_sge); + + if (clr_sends) { + while (qp->s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last); + unsigned i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah( + wqe->ud_wr.ah)->refcount); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + smp_wmb(); /* see qp_set_savail */ + } + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + } + + if (qp->ibqp.qp_type != IB_QPT_RC) + return; + + for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[n]; + + if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && + e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + } +} + +/** + * rvt_remove_qp - remove qp form table + * @rdi: rvt dev struct + * @qp: qp to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive routine. + */ +static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1]; + u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits); + unsigned long flags; + int removed = 1; + + spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags); + + if (rcu_dereference_protected(rvp->qp[0], + lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(rvp->qp[0], NULL); + } else if (rcu_dereference_protected(rvp->qp[1], + lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(rvp->qp[1], NULL); + } else { + struct rvt_qp *q; + struct rvt_qp __rcu **qpp; + + removed = 0; + qpp = &rdi->qp_dev->qp_table[n]; + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL; + qpp = &q->next) { + if (q == qp) { + RCU_INIT_POINTER(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&rdi->qp_dev->qpt_lock))); + removed = 1; + trace_rvt_qpremove(qp, n); + break; + } + } + } + + spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags); + if (removed) { + synchronize_rcu(); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/** + * reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + * r and s lock are required to be held by the caller + */ +static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type) +{ + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + + /* Let drivers flush their waitlist */ + rdi->driver_f.flush_qp_waiters(qp); + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + /* Stop the send queue and the retry timer */ + rdi->driver_f.stop_send_queue(qp); + + /* Wait for things to stop */ + rdi->driver_f.quiesce_qp(qp); + + /* take qp out the hash and wait for it to be unused */ + rvt_remove_qp(rdi, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + + /* grab the lock b/c it was locked at call time */ + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + rvt_clear_mr_refs(qp, 1); + } + + /* + * Let the driver do any tear down it needs to for a qp + * that has been reset + */ + rdi->driver_f.notify_qp_reset(qp); + + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + qp->s_flags &= RVT_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_draining = 0; + qp->s_next_psn = 0; + qp->s_last_psn = 0; + qp->s_sending_psn = 0; + qp->s_sending_hpsn = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_acked = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + qp->s_mig_state = IB_MIG_MIGRATED; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } + qp->r_sge.num_sge = 0; +} + +/** + * rvt_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Queue pair creation is mostly an rvt issue. However, drivers have their own + * unique idea of what queue pair numbers mean. For instance there is a reserved + * range for PSM. + * + * Return: the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct rvt_qp *qp; + int err; + struct rvt_swqe *swq = NULL; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret = ERR_PTR(-ENOMEM); + struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); + void *priv = NULL; + gfp_t gfp; + + if (!rdi) + return ERR_PTR(-EINVAL); + + if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge || + init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr || + init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO)) + return ERR_PTR(-EINVAL); + + /* GFP_NOIO is applicable to RC QP's only */ + + if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO && + init_attr->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ? + GFP_NOIO : GFP_KERNEL; + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge || + init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr) + return ERR_PTR(-EINVAL); + + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) + return ERR_PTR(-EINVAL); + } + + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > ibpd->device->phys_port_cnt) + return ERR_PTR(-EINVAL); + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + sz = sizeof(struct rvt_sge) * + init_attr->cap.max_send_sge + + sizeof(struct rvt_swqe); + if (gfp == GFP_NOIO) + swq = __vmalloc( + (init_attr->cap.max_send_wr + 1) * sz, + gfp, PAGE_KERNEL); + else + swq = vmalloc_node( + (init_attr->cap.max_send_wr + 1) * sz, + rdi->dparms.node); + if (!swq) + return ERR_PTR(-ENOMEM); + + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node); + if (!qp) + goto bail_swq; + + RCU_INIT_POINTER(qp->next, NULL); + + /* + * Driver needs to set up it's private QP structure and do any + * initialization that is needed. + */ + priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp); + if (!priv) + goto bail_qp; + qp->priv = priv; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + if (init_attr->srq) { + sz = 0; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct rvt_rwqe); + if (udata) + qp->r_rq.wq = vmalloc_user( + sizeof(struct rvt_rwq) + + qp->r_rq.size * sz); + else if (gfp == GFP_NOIO) + qp->r_rq.wq = __vmalloc( + sizeof(struct rvt_rwq) + + qp->r_rq.size * sz, + gfp, PAGE_KERNEL); + else + qp->r_rq.wq = vmalloc_node( + sizeof(struct rvt_rwq) + + qp->r_rq.size * sz, + rdi->dparms.node); + if (!qp->r_rq.wq) + goto bail_driver_priv; + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->r_lock); + spin_lock_init(&qp->s_hlock); + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_timer(&qp->s_timer); + qp->s_timer.data = (unsigned long)qp; + INIT_LIST_HEAD(&qp->rspwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_avail = init_attr->cap.max_send_wr; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = RVT_S_SIGNAL_REQ_WR; + + err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, + init_attr->qp_type, + init_attr->port_num, gfp); + if (err < 0) { + ret = ERR_PTR(err); + goto bail_rq_wq; + } + qp->ibqp.qp_num = err; + qp->port_num = init_attr->port_num; + rvt_reset_qp(rdi, qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + return ERR_PTR(-EINVAL); + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See rvt_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_qpn; + } + } else { + u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz; + + qp->ip = rvt_create_mmap_info(rdi, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_qpn; + } + + err = ib_copy_to_udata(udata, &qp->ip->offset, + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + qp->pid = current->pid; + } + + spin_lock(&rdi->n_qps_lock); + if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) { + spin_unlock(&rdi->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + rdi->n_qps_allocated++; + /* + * Maintain a busy_jiffies variable that will be added to the timeout + * period in mod_retry_timer and add_retry_timer. This busy jiffies + * is scaled by the number of rc qps created for the device to reduce + * the number of timeouts occurring when there is a large number of + * qps. busy_jiffies is incremented every rc qp scaling interval. + * The scaling interval is selected based on extensive performance + * evaluation of targeted workloads. + */ + if (init_attr->qp_type == IB_QPT_RC) { + rdi->n_rc_qps++; + rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL; + } + spin_unlock(&rdi->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&rdi->pending_lock); + list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps); + spin_unlock_irq(&rdi->pending_lock); + } + + ret = &qp->ibqp; + + /* + * We have our QP and its good, now keep track of what types of opcodes + * can be processed on this QP. We do this by keeping track of what the + * 3 high order bits of the opcode are. + */ + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK; + break; + case IB_QPT_RC: + qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK; + break; + case IB_QPT_UC: + qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK; + break; + default: + ret = ERR_PTR(-EINVAL); + goto bail_ip; + } + + return ret; + +bail_ip: + kref_put(&qp->ip->ref, rvt_release_mmap_info); + +bail_qpn: + free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); + +bail_rq_wq: + vfree(qp->r_rq.wq); + +bail_driver_priv: + rdi->driver_f.qp_priv_free(rdi, qp); + +bail_qp: + kfree(qp); + +bail_swq: + vfree(swq); + + return ret; +} + +/** + * rvt_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * + * Return: true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) +{ + struct ib_wc wc; + int ret = 0; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto bail; + + qp->state = IB_QPS_ERR; + + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + if (qp->s_flags & RVT_S_ANY_WAIT_SEND) + qp->s_flags &= ~RVT_S_ANY_WAIT_SEND; + + rdi->driver_f.notify_error_qp(qp); + + /* Schedule the sending tasklet to drain the send work queue. */ + if (ACCESS_ONCE(qp->s_last) != qp->s_head) + rdi->driver_f.schedule_send(qp); + + rvt_clear_mr_refs(qp, 0); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct rvt_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) { + ret = 1; + } + +bail: + return ret; +} +EXPORT_SYMBOL(rvt_error_qp); + +/* + * Put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1]; + unsigned long flags; + + atomic_inc(&qp->refcount); + spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags); + + if (qp->ibqp.qp_num <= 1) { + rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp); + } else { + u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits); + + qp->next = rdi->qp_dev->qp_table[n]; + rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp); + trace_rvt_qpinsert(qp, n); + } + + spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags); +} + +/** + * qib_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Return: 0 on success, otherwise returns an errno. + */ +int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct ib_event ev; + int lastwqe = 0; + int mig = 0; + int pmtu = 0; /* for gcc warning only */ + enum rdma_link_layer link; + + link = rdma_port_get_link_layer(ibqp->device, qp->port_num); + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, link)) + goto inval; + + if (rdi->driver_f.check_modify_qp && + rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr)) + goto inval; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_ah_attr.dlid >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) + goto inval; + if (attr->alt_pkey_index >= rvt_get_npkeys(rdi)) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= rvt_get_npkeys(rdi)) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI || + attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + if (attr_mask & IB_QP_DEST_QPN) + if (attr->dest_qp_num > RVT_QPN_MASK) + goto inval; + + if (attr_mask & IB_QP_RETRY_CNT) + if (attr->retry_cnt > 7) + goto inval; + + if (attr_mask & IB_QP_RNR_RETRY) + if (attr->rnr_retry > 7) + goto inval; + + /* + * Don't allow invalid path_mtu values. OK to set greater + * than the active mtu (or even the max_cap, if we have tuned + * that to a small mtu. We'll set qp->path_mtu + * to the lesser of requested attribute mtu and active, + * for packetizing messages. + * Note that the QP port has to be set in INIT and MTU in RTR. + */ + if (attr_mask & IB_QP_PATH_MTU) { + pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr); + if (pmtu < 0) + goto inval; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state == IB_MIG_REARM) { + if (qp->s_mig_state == IB_MIG_ARMED) + goto inval; + if (new_state != IB_QPS_RTS) + goto inval; + } else if (attr->path_mig_state == IB_MIG_MIGRATED) { + if (qp->s_mig_state == IB_MIG_REARM) + goto inval; + if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) + goto inval; + if (qp->s_mig_state == IB_MIG_ARMED) + mig = 1; + } else { + goto inval; + } + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) + rvt_reset_qp(rdi, qp, ibqp->qp_type); + break; + + case IB_QPS_RTR: + /* Allow event to re-trigger if QP set to RTR more than once */ + qp->r_flags &= ~RVT_R_COMM_EST; + qp->state = new_state; + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_PORT) + qp->port_num = attr->port_num; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask; + qp->s_psn = qp->s_next_psn; + qp->s_sending_psn = qp->s_next_psn; + qp->s_last_psn = qp->s_next_psn - 1; + qp->s_sending_hpsn = qp->s_last_psn; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_srate = attr->ah_attr.static_rate; + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + } + + if (attr_mask & IB_QP_ALT_PATH) { + qp->alt_ah_attr = attr->alt_ah_attr; + qp->s_alt_pkey_index = attr->alt_pkey_index; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + qp->s_mig_state = attr->path_mig_state; + if (mig) { + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu); + qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); + qp->log_pmtu = ilog2(qp->pmtu); + } + + if (attr_mask & IB_QP_RETRY_CNT) { + qp->s_retry_cnt = attr->retry_cnt; + qp->s_retry = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry_cnt = attr->rnr_retry; + qp->s_rnr_retry = attr->rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) { + qp->timeout = attr->timeout; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + if (rdi->driver_f.modify_qp) + rdi->driver_f.modify_qp(qp, attr, attr_mask, udata); + + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + rvt_insert_qp(rdi, qp); + + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + if (mig) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + return 0; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + return -EINVAL; +} + +/** rvt_free_qpn - Free a qpn from the bit map + * @qpt: QP table + * @qpn: queue pair number to free + */ +static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) +{ + struct rvt_qpn_map *map; + + map = qpt->map + qpn / RVT_BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); +} + +/** + * rvt_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Note that this can be called while the QP is actively sending or + * receiving! + * + * Return: 0 on success. + */ +int rvt_destroy_qp(struct ib_qp *ibqp) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + rvt_reset_qp(rdi, qp, ibqp->qp_type); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + + /* qpn is now available for use again */ + rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); + + spin_lock(&rdi->n_qps_lock); + rdi->n_qps_allocated--; + if (qp->ibqp.qp_type == IB_QPT_RC) { + rdi->n_rc_qps--; + rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL; + } + spin_unlock(&rdi->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, rvt_release_mmap_info); + else + vfree(qp->r_rq.wq); + vfree(qp->s_wq); + rdi->driver_f.qp_priv_free(rdi, qp); + kfree(qp); + return 0; +} + +/** + * rvt_query_qp - query an ipbq + * @ibqp: IB qp to query + * @attr: attr struct to fill in + * @attr_mask: attr mask ignored + * @init_attr: struct to fill in + * + * Return: always 0 + */ +int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = qp->s_mig_state; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask; + attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + attr->alt_ah_attr = qp->alt_ah_attr; + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = qp->s_alt_pkey_index; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = qp->port_num; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = qp->alt_ah_attr.port_num; + attr->alt_timeout = qp->alt_timeout; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & RVT_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = qp->port_num; + return 0; +} + +/** + * rvt_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success otherwise errno + */ +int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && + !qp->ibqp.srq; + + /* Check that state is OK to post receive. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) { + *bad_wr = wr; + return -EINVAL; + } + + for (; wr; wr = wr->next) { + struct rvt_rwqe *wqe; + u32 next; + int i; + + if ((unsigned)wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + return -ENOMEM; + } + if (unlikely(qp_err_flush)) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + wc.wr_id = wr->wr_id; + wc.status = IB_WC_WR_FLUSH_ERR; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + } else { + wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* + * Make sure queue entry is written + * before the head index. + */ + smp_wmb(); + wq->head = next; + } + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + return 0; +} + +/** + * qp_get_savail - return number of avail send entries + * + * @qp - the qp + * + * This assumes the s_hlock is held but the s_last + * qp variable is uncontrolled. + */ +static inline u32 qp_get_savail(struct rvt_qp *qp) +{ + u32 slast; + u32 ret; + + smp_read_barrier_depends(); /* see rc.c */ + slast = ACCESS_ONCE(qp->s_last); + if (qp->s_head >= slast) + ret = qp->s_size - (qp->s_head - slast); + else + ret = slast - qp->s_head; + return ret - 1; +} + +/** + * rvt_post_one_wr - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int rvt_post_one_wr(struct rvt_qp *qp, + struct ib_send_wr *wr, + int *call_send) +{ + struct rvt_swqe *wqe; + u32 next; + int i; + int j; + int acc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + u8 log_pmtu; + int ret; + + /* IB spec says that num_sge == 0 is OK. */ + if (unlikely(wr->num_sge > qp->s_max_sge)) + return -EINVAL; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned)wr->opcode >= IB_WR_RDMA_READ) + return -EINVAL; + } else if (qp->ibqp.qp_type != IB_QPT_RC) { + /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + /* Check UD destination address PD */ + if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + return -EINVAL; + } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { + return -EINVAL; + } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) { + return -EINVAL; + } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) { + return -EINVAL; + } + /* check for avail */ + if (unlikely(!qp->s_avail)) { + qp->s_avail = qp_get_savail(qp); + if (WARN_ON(qp->s_avail > (qp->s_size - 1))) + rvt_pr_err(rdi, + "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", + qp->ibqp.qp_num, qp->s_size, qp->s_avail, + qp->s_head, qp->s_tail, qp->s_cur, + qp->s_acked, qp->s_last); + if (!qp->s_avail) + return -ENOMEM; + } + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + + rkt = &rdi->lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.pd); + wqe = rvt_get_swqe_ptr(qp, qp->s_head); + + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr)); + else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wr->opcode == IB_WR_RDMA_WRITE || + wr->opcode == IB_WR_RDMA_READ) + memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr)); + else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr)); + else + memcpy(&wqe->wr, wr, sizeof(wqe->wr)); + + wqe->length = 0; + j = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) { + ret = -EINVAL; + goto bail_inval_free; + } + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.check_send_wqe) { + ret = rdi->driver_f.check_send_wqe(qp, wqe); + if (ret < 0) + goto bail_inval_free; + if (ret) + *call_send = ret; + } + + log_pmtu = qp->log_pmtu; + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); + + log_pmtu = ah->log_pmtu; + atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + } + + wqe->ssn = qp->s_ssn++; + wqe->psn = qp->s_next_psn; + wqe->lpsn = wqe->psn + + (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0); + qp->s_next_psn = wqe->lpsn + 1; + trace_rvt_post_one_wr(qp, wqe); + smp_wmb(); /* see request builders */ + qp->s_avail--; + qp->s_head = next; + + return 0; + +bail_inval_free: + /* release mr holds */ + while (j) { + struct rvt_sge *sge = &wqe->sg_list[--j]; + + rvt_put_mr(sge->mr); + } + return ret; +} + +/** + * rvt_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success else errno + */ +int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + unsigned long flags = 0; + int call_send; + unsigned nreq = 0; + int err = 0; + + spin_lock_irqsave(&qp->s_hlock, flags); + + /* + * Ensure QP state is such that we can send. If not bail out early, + * there is no need to do this every time we post a send. + */ + if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) { + spin_unlock_irqrestore(&qp->s_hlock, flags); + return -EINVAL; + } + + /* + * If the send queue is empty, and we only have a single WR then just go + * ahead and kick the send engine into gear. Otherwise we will always + * just schedule the send to happen later. + */ + call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next; + + for (; wr; wr = wr->next) { + err = rvt_post_one_wr(qp, wr, &call_send); + if (unlikely(err)) { + *bad_wr = wr; + goto bail; + } + nreq++; + } +bail: + spin_unlock_irqrestore(&qp->s_hlock, flags); + if (nreq) { + if (call_send) + rdi->driver_f.schedule_send_no_lock(qp); + else + rdi->driver_f.do_send(qp); + } + return err; +} + +/** + * rvt_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: A pointer to the first WR to cause a problem is put here + * + * This may be called from interrupt context. + * + * Return: 0 on success else errno + */ +int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_rwq *wq; + unsigned long flags; + + for (; wr; wr = wr->next) { + struct rvt_rwqe *wqe; + u32 next; + int i; + + if ((unsigned)wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + return -EINVAL; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + return -ENOMEM; + } + + wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + return 0; +} diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h new file mode 100644 index 000000000000..8409f80d5f25 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -0,0 +1,69 @@ +#ifndef DEF_RVTQP_H +#define DEF_RVTQP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> + +int rvt_driver_qp_init(struct rvt_dev_info *rdi); +void rvt_qp_exit(struct rvt_dev_info *rdi); +struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int rvt_destroy_qp(struct ib_qp *ibqp); +int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); +int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +#endif /* DEF_RVTQP_H */ diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 67786d417493..f7c48e9023de 100644 --- a/drivers/staging/rdma/hfi1/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -52,96 +49,50 @@ #include <linux/slab.h> #include <linux/vmalloc.h> -#include "verbs.h" +#include "srq.h" +#include "vt.h" /** - * hfi1_post_srq_receive - post a receive on a shared receive queue - * @ibsrq: the SRQ to post the receive on - * @wr: the list of work requests to post - * @bad_wr: A pointer to the first WR to cause a problem is put here + * rvt_driver_srq_init - init srq resources on a per driver basis + * @rdi: rvt dev structure * - * This may be called from interrupt context. + * Do any initialization needed when a driver registers with rdmavt. */ -int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +void rvt_driver_srq_init(struct rvt_dev_info *rdi) { - struct hfi1_srq *srq = to_isrq(ibsrq); - struct hfi1_rwq *wq; - unsigned long flags; - int ret; - - for (; wr; wr = wr->next) { - struct hfi1_rwqe *wqe; - u32 next; - int i; - - if ((unsigned) wr->num_sge > srq->rq.max_sge) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; - next = wq->head + 1; - if (next >= srq->rq.size) - next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&srq->rq.lock, flags); - *bad_wr = wr; - ret = -ENOMEM; - goto bail; - } - - wqe = get_rwqe_ptr(&srq->rq, wq->head); - wqe->wr_id = wr->wr_id; - wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; - /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; - spin_unlock_irqrestore(&srq->rq.lock, flags); - } - ret = 0; - -bail: - return ret; + spin_lock_init(&rdi->n_srqs_lock); + rdi->n_srqs_allocated = 0; } /** - * hfi1_create_srq - create a shared receive queue + * rvt_create_srq - create a shared receive queue * @ibpd: the protection domain of the SRQ to create * @srq_init_attr: the attributes of the SRQ * @udata: data from libibverbs when creating a user SRQ + * + * Return: Allocated srq object */ -struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, - struct ib_srq_init_attr *srq_init_attr, - struct ib_udata *udata) +struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) { - struct hfi1_ibdev *dev = to_idev(ibpd->device); - struct hfi1_srq *srq; + struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); + struct rvt_srq *srq; u32 sz; struct ib_srq *ret; - if (srq_init_attr->srq_type != IB_SRQT_BASIC) { - ret = ERR_PTR(-ENOSYS); - goto done; - } + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); if (srq_init_attr->attr.max_sge == 0 || - srq_init_attr->attr.max_sge > hfi1_max_srq_sges || + srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge || srq_init_attr->attr.max_wr == 0 || - srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) { - ret = ERR_PTR(-EINVAL); - goto done; - } + srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr) + return ERR_PTR(-EINVAL); srq = kmalloc(sizeof(*srq), GFP_KERNEL); - if (!srq) { - ret = ERR_PTR(-ENOMEM); - goto done; - } + if (!srq) + return ERR_PTR(-ENOMEM); /* * Need to use vmalloc() if we want to support large #s of entries. @@ -149,8 +100,8 @@ struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, srq->rq.size = srq_init_attr->attr.max_wr + 1; srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + - sizeof(struct hfi1_rwqe); - srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz); + sizeof(struct rvt_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz); if (!srq->rq.wq) { ret = ERR_PTR(-ENOMEM); goto bail_srq; @@ -158,15 +109,15 @@ struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, /* * Return the address of the RWQ as the offset to mmap. - * See hfi1_mmap() for details. + * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { int err; - u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz; + u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz; srq->ip = - hfi1_create_mmap_info(dev, s, ibpd->uobject->context, - srq->rq.wq); + rvt_create_mmap_info(dev, s, ibpd->uobject->context, + srq->rq.wq); if (!srq->ip) { ret = ERR_PTR(-ENOMEM); goto bail_wq; @@ -178,8 +129,9 @@ struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, ret = ERR_PTR(err); goto bail_ip; } - } else + } else { srq->ip = NULL; + } /* * ib_create_srq() will initialize srq->ibsrq. @@ -190,7 +142,7 @@ struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, srq->limit = srq_init_attr->attr.srq_limit; spin_lock(&dev->n_srqs_lock); - if (dev->n_srqs_allocated == hfi1_max_srqs) { + if (dev->n_srqs_allocated == dev->dparms.props.max_srq) { spin_unlock(&dev->n_srqs_lock); ret = ERR_PTR(-ENOMEM); goto bail_ip; @@ -205,8 +157,7 @@ struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, spin_unlock_irq(&dev->pending_lock); } - ret = &srq->ibsrq; - goto done; + return &srq->ibsrq; bail_ip: kfree(srq->ip); @@ -214,46 +165,44 @@ bail_wq: vfree(srq->rq.wq); bail_srq: kfree(srq); -done: return ret; } /** - * hfi1_modify_srq - modify a shared receive queue + * rvt_modify_srq - modify a shared receive queue * @ibsrq: the SRQ to modify * @attr: the new attributes of the SRQ * @attr_mask: indicates which attributes to modify * @udata: user data for libibverbs.so + * + * Return: 0 on success */ -int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask, - struct ib_udata *udata) +int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) { - struct hfi1_srq *srq = to_isrq(ibsrq); - struct hfi1_rwq *wq; + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); + struct rvt_rwq *wq; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { - struct hfi1_rwq *owq; - struct hfi1_rwqe *p; + struct rvt_rwq *owq; + struct rvt_rwqe *p; u32 sz, size, n, head, tail; /* Check that the requested sizes are below the limits. */ - if ((attr->max_wr > hfi1_max_srq_wrs) || + if ((attr->max_wr > dev->dparms.props.max_srq_wr) || ((attr_mask & IB_SRQ_LIMIT) ? - attr->srq_limit : srq->limit) > attr->max_wr) { - ret = -EINVAL; - goto bail; - } + attr->srq_limit : srq->limit) > attr->max_wr) + return -EINVAL; - sz = sizeof(struct hfi1_rwqe) + + sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz); - if (!wq) { - ret = -ENOMEM; - goto bail; - } + wq = vmalloc_user(sizeof(struct rvt_rwq) + size * sz); + if (!wq) + return -ENOMEM; /* Check that we can write the offset to mmap. */ if (udata && udata->inlen >= sizeof(__u64)) { @@ -264,8 +213,8 @@ int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, sizeof(offset_addr)); if (ret) goto bail_free; - udata->outbuf = - (void __user *) (unsigned long) offset_addr; + udata->outbuf = (void __user *) + (unsigned long)offset_addr; ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); if (ret) @@ -296,16 +245,16 @@ int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, n = 0; p = wq->wq; while (tail != head) { - struct hfi1_rwqe *wqe; + struct rvt_rwqe *wqe; int i; - wqe = get_rwqe_ptr(&srq->rq, tail); + wqe = rvt_get_rwqe_ptr(&srq->rq, tail); p->wr_id = wqe->wr_id; p->num_sge = wqe->num_sge; for (i = 0; i < wqe->num_sge; i++) p->sg_list[i] = wqe->sg_list[i]; n++; - p = (struct hfi1_rwqe *)((char *)p + sz); + p = (struct rvt_rwqe *)((char *)p + sz); if (++tail >= srq->rq.size) tail = 0; } @@ -320,21 +269,21 @@ int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, vfree(owq); if (srq->ip) { - struct hfi1_mmap_info *ip = srq->ip; - struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device); - u32 s = sizeof(struct hfi1_rwq) + size * sz; + struct rvt_mmap_info *ip = srq->ip; + struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); + u32 s = sizeof(struct rvt_rwq) + size * sz; - hfi1_update_mmap_info(dev, ip, s, wq); + rvt_update_mmap_info(dev, ip, s, wq); /* * Return the offset to mmap. - * See hfi1_mmap() for details. + * See rvt_mmap() for details. */ if (udata && udata->inlen >= sizeof(__u64)) { ret = ib_copy_to_udata(udata, &ip->offset, sizeof(ip->offset)); if (ret) - goto bail; + return ret; } /* @@ -355,19 +304,24 @@ int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, srq->limit = attr->srq_limit; spin_unlock_irq(&srq->rq.lock); } - goto bail; + return ret; bail_unlock: spin_unlock_irq(&srq->rq.lock); bail_free: vfree(wq); -bail: return ret; } -int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +/** rvt_query_srq - query srq data + * @ibsrq: srq to query + * @attr: return info in attr + * + * Return: always 0 + */ +int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { - struct hfi1_srq *srq = to_isrq(ibsrq); + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); attr->max_wr = srq->rq.size - 1; attr->max_sge = srq->rq.max_sge; @@ -376,19 +330,21 @@ int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) } /** - * hfi1_destroy_srq - destroy a shared receive queue - * @ibsrq: the SRQ to destroy + * rvt_destroy_srq - destory an srq + * @ibsrq: srq object to destroy + * + * Return always 0 */ -int hfi1_destroy_srq(struct ib_srq *ibsrq) +int rvt_destroy_srq(struct ib_srq *ibsrq) { - struct hfi1_srq *srq = to_isrq(ibsrq); - struct hfi1_ibdev *dev = to_idev(ibsrq->device); + struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); + struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); spin_lock(&dev->n_srqs_lock); dev->n_srqs_allocated--; spin_unlock(&dev->n_srqs_lock); if (srq->ip) - kref_put(&srq->ip->ref, hfi1_release_mmap_info); + kref_put(&srq->ip->ref, rvt_release_mmap_info); else vfree(srq->rq.wq); kfree(srq); diff --git a/drivers/infiniband/sw/rdmavt/srq.h b/drivers/infiniband/sw/rdmavt/srq.h new file mode 100644 index 000000000000..bf0eaaf56465 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/srq.h @@ -0,0 +1,62 @@ +#ifndef DEF_RVTSRQ_H +#define DEF_RVTSRQ_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> +void rvt_driver_srq_init(struct rvt_dev_info *rdi); +struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); +int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); +int rvt_destroy_srq(struct ib_srq *ibsrq); + +#endif /* DEF_RVTSRQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace.c b/drivers/infiniband/sw/rdmavt/trace.c new file mode 100644 index 000000000000..d593285a349c --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace.c @@ -0,0 +1,49 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h new file mode 100644 index 000000000000..6c0457db5499 --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -0,0 +1,187 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR rdmavt + +#if !defined(__RDMAVT_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RDMAVT_TRACE_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_vt.h> + +#define RDI_DEV_ENTRY(rdi) __string(dev, rdi->driver_f.get_card_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rdi->driver_f.get_card_name(rdi)) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rdmavt + +TRACE_EVENT(rvt_dbg, + TP_PROTO(struct rvt_dev_info *rdi, + const char *msg), + TP_ARGS(rdi, msg), + TP_STRUCT__entry( + RDI_DEV_ENTRY(rdi) + __string(msg, msg) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(rdi); + __assign_str(msg, msg); + ), + TP_printk("[%s]: %s", __get_str(dev), __get_str(msg)) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_qphash +DECLARE_EVENT_CLASS(rvt_qphash_template, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, bucket) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->bucket = bucket; + ), + TP_printk( + "[%s] qpn 0x%x bucket %u", + __get_str(dev), + __entry->qpn, + __entry->bucket + ) +); + +DEFINE_EVENT(rvt_qphash_template, rvt_qpinsert, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +DEFINE_EVENT(rvt_qphash_template, rvt_qpremove, + TP_PROTO(struct rvt_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvt_tx + +#define wr_opcode_name(opcode) { IB_WR_##opcode, #opcode } +#define show_wr_opcode(opcode) \ +__print_symbolic(opcode, \ + wr_opcode_name(RDMA_WRITE), \ + wr_opcode_name(RDMA_WRITE_WITH_IMM), \ + wr_opcode_name(SEND), \ + wr_opcode_name(SEND_WITH_IMM), \ + wr_opcode_name(RDMA_READ), \ + wr_opcode_name(ATOMIC_CMP_AND_SWP), \ + wr_opcode_name(ATOMIC_FETCH_AND_ADD), \ + wr_opcode_name(LSO), \ + wr_opcode_name(SEND_WITH_INV), \ + wr_opcode_name(RDMA_READ_WITH_INV), \ + wr_opcode_name(LOCAL_INV), \ + wr_opcode_name(MASKED_ATOMIC_CMP_AND_SWP), \ + wr_opcode_name(MASKED_ATOMIC_FETCH_AND_ADD)) + +#define POS_PRN \ +"[%s] wr_id %llx qpn %x psn 0x%x lpsn 0x%x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u" + +TRACE_EVENT( + rvt_post_one_wr, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe), + TP_ARGS(qp, wqe), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, psn) + __field(u32, lpsn) + __field(u32, length) + __field(u32, opcode) + __field(u32, size) + __field(u32, avail) + __field(u32, head) + __field(u32, last) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->psn = wqe->psn; + __entry->lpsn = wqe->lpsn; + __entry->length = wqe->length; + __entry->opcode = wqe->wr.opcode; + __entry->size = qp->s_size; + __entry->avail = qp->s_avail; + __entry->head = qp->s_head; + __entry->last = qp->s_last; + ), + TP_printk( + POS_PRN, + __get_str(dev), + __entry->wr_id, + __entry->qpn, + __entry->psn, + __entry->lpsn, + __entry->length, + __entry->opcode, show_wr_opcode(__entry->opcode), + __entry->size, + __entry->avail, + __entry->head, + __entry->last + ) +); + +#endif /* __RDMAVT_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c new file mode 100644 index 000000000000..6caf5272ba1f --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -0,0 +1,873 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include "vt.h" +#include "trace.h" + +#define RVT_UVERBS_ABI_VERSION 2 + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("RDMA Verbs Transport Library"); + +static int rvt_init(void) +{ + /* + * rdmavt does not need to do anything special when it starts up. All it + * needs to do is sit and wait until a driver attempts registration. + */ + return 0; +} +module_init(rvt_init); + +static void rvt_cleanup(void) +{ + /* + * Nothing to do at exit time either. The module won't be able to be + * removed until all drivers are gone which means all the dev structs + * are gone so there is really nothing to do. + */ +} +module_exit(rvt_cleanup); + +/** + * rvt_alloc_device - allocate rdi + * @size: how big of a structure to allocate + * @nports: number of ports to allocate array slots for + * + * Use IB core device alloc to allocate space for the rdi which is assumed to be + * inside of the ib_device. Any extra space that drivers require should be + * included in size. + * + * We also allocate a port array based on the number of ports. + * + * Return: pointer to allocated rdi + */ +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) +{ + struct rvt_dev_info *rdi = ERR_PTR(-ENOMEM); + + rdi = (struct rvt_dev_info *)ib_alloc_device(size); + if (!rdi) + return rdi; + + rdi->ports = kcalloc(nports, + sizeof(struct rvt_ibport **), + GFP_KERNEL); + if (!rdi->ports) + ib_dealloc_device(&rdi->ibdev); + + return rdi; +} +EXPORT_SYMBOL(rvt_alloc_device); + +static int rvt_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + /* + * Return rvt_dev_info.dparms.props contents + */ + *props = rdi->dparms.props; + return 0; +} + +static int rvt_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + /* + * There is currently no need to supply this based on qib and hfi1. + * Future drivers may need to implement this though. + */ + + return -EOPNOTSUPP; +} + +/** + * rvt_query_port: Passes the query port call to the driver + * @ibdev: Verbs IB dev + * @port_num: port number, 1 based from ib core + * @props: structure to hold returned properties + * + * Return: 0 on success + */ +static int rvt_query_port(struct ib_device *ibdev, u8 port_num, + struct ib_port_attr *props) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_ibport *rvp; + int port_index = ibport_num_to_idx(ibdev, port_num); + + if (port_index < 0) + return -EINVAL; + + rvp = rdi->ports[port_index]; + memset(props, 0, sizeof(*props)); + props->sm_lid = rvp->sm_lid; + props->sm_sl = rvp->sm_sl; + props->port_cap_flags = rvp->port_cap_flags; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = rvt_get_npkeys(rdi); + props->bad_pkey_cntr = rvp->pkey_violations; + props->qkey_viol_cntr = rvp->qkey_violations; + props->subnet_timeout = rvp->subnet_timeout; + props->init_type_reply = 0; + + /* Populate the remaining ib_port_attr elements */ + return rdi->driver_f.query_port_state(rdi, port_num, props); +} + +/** + * rvt_modify_port + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @port_modify_mask: How to change the port + * @props: Structure to fill in + * + * Return: 0 on success + */ +static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, + int port_modify_mask, struct ib_port_modify *props) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct rvt_ibport *rvp; + int ret = 0; + int port_index = ibport_num_to_idx(ibdev, port_num); + + if (port_index < 0) + return -EINVAL; + + rvp = rdi->ports[port_index]; + rvp->port_cap_flags |= props->set_port_cap_mask; + rvp->port_cap_flags &= ~props->clr_port_cap_mask; + + if (props->set_port_cap_mask || props->clr_port_cap_mask) + rdi->driver_f.cap_mask_chg(rdi, port_num); + if (port_modify_mask & IB_PORT_SHUTDOWN) + ret = rdi->driver_f.shut_down_port(rdi, port_num); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + rvp->qkey_violations = 0; + + return ret; +} + +/** + * rvt_query_pkey - Return a pkey from the table at a given index + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @intex: Index into pkey table + * + * Return: 0 on failure pkey otherwise + */ +static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, + u16 *pkey) +{ + /* + * Driver will be responsible for keeping rvt_dev_info.pkey_table up to + * date. This function will just return that value. There is no need to + * lock, if a stale value is read and sent to the user so be it there is + * no way to protect against that anyway. + */ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + int port_index; + + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + if (index >= rvt_get_npkeys(rdi)) + return -EINVAL; + + *pkey = rvt_get_pkey(rdi, port_index, index); + return 0; +} + +/** + * rvt_query_gid - Return a gid from the table + * @ibdev: Verbs IB dev + * @port_num: Port number, 1 based from ib core + * @index: = Index in table + * @gid: Gid to return + * + * Return: 0 on success + */ +static int rvt_query_gid(struct ib_device *ibdev, u8 port_num, + int guid_index, union ib_gid *gid) +{ + struct rvt_dev_info *rdi; + struct rvt_ibport *rvp; + int port_index; + + /* + * Driver is responsible for updating the guid table. Which will be used + * to craft the return value. This will work similar to how query_pkey() + * is being done. + */ + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + rdi = ib_to_rvt(ibdev); + rvp = rdi->ports[port_index]; + + gid->global.subnet_prefix = rvp->gid_prefix; + + return rdi->driver_f.get_guid_be(rdi, rvp, guid_index, + &gid->global.interface_id); +} + +struct rvt_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct rvt_ucontext, ibucontext); +} + +/** + * rvt_alloc_ucontext - Allocate a user context + * @ibdev: Vers IB dev + * @data: User data allocated + */ +static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct rvt_ucontext *context; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + return &context->ibucontext; +} + +/** + *rvt_dealloc_ucontext - Free a user context + *@context - Free this + */ +static int rvt_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + struct ib_port_attr attr; + int err, port_index; + + port_index = ibport_num_to_idx(ibdev, port_num); + if (port_index < 0) + return -EINVAL; + + err = rvt_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = rdi->dparms.core_cap_flags; + immutable->max_mad_size = rdi->dparms.max_mad_size; + + return 0; +} + +enum { + MISC, + QUERY_DEVICE, + MODIFY_DEVICE, + QUERY_PORT, + MODIFY_PORT, + QUERY_PKEY, + QUERY_GID, + ALLOC_UCONTEXT, + DEALLOC_UCONTEXT, + GET_PORT_IMMUTABLE, + CREATE_QP, + MODIFY_QP, + DESTROY_QP, + QUERY_QP, + POST_SEND, + POST_RECV, + POST_SRQ_RECV, + CREATE_AH, + DESTROY_AH, + MODIFY_AH, + QUERY_AH, + CREATE_SRQ, + MODIFY_SRQ, + DESTROY_SRQ, + QUERY_SRQ, + ATTACH_MCAST, + DETACH_MCAST, + GET_DMA_MR, + REG_USER_MR, + DEREG_MR, + ALLOC_MR, + ALLOC_FMR, + MAP_PHYS_FMR, + UNMAP_FMR, + DEALLOC_FMR, + MMAP, + CREATE_CQ, + DESTROY_CQ, + POLL_CQ, + REQ_NOTFIY_CQ, + RESIZE_CQ, + ALLOC_PD, + DEALLOC_PD, + _VERB_IDX_MAX /* Must always be last! */ +}; + +static inline int check_driver_override(struct rvt_dev_info *rdi, + size_t offset, void *func) +{ + if (!*(void **)((void *)&rdi->ibdev + offset)) { + *(void **)((void *)&rdi->ibdev + offset) = func; + return 0; + } + + return 1; +} + +static noinline int check_support(struct rvt_dev_info *rdi, int verb) +{ + switch (verb) { + case MISC: + /* + * These functions are not part of verbs specifically but are + * required for rdmavt to function. + */ + if ((!rdi->driver_f.port_callback) || + (!rdi->driver_f.get_card_name) || + (!rdi->driver_f.get_pci_dev)) + return -EINVAL; + break; + + case QUERY_DEVICE: + check_driver_override(rdi, offsetof(struct ib_device, + query_device), + rvt_query_device); + break; + + case MODIFY_DEVICE: + /* + * rdmavt does not support modify device currently drivers must + * provide. + */ + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_device), + rvt_modify_device)) + return -EOPNOTSUPP; + break; + + case QUERY_PORT: + if (!check_driver_override(rdi, offsetof(struct ib_device, + query_port), + rvt_query_port)) + if (!rdi->driver_f.query_port_state) + return -EINVAL; + break; + + case MODIFY_PORT: + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_port), + rvt_modify_port)) + if (!rdi->driver_f.cap_mask_chg || + !rdi->driver_f.shut_down_port) + return -EINVAL; + break; + + case QUERY_PKEY: + check_driver_override(rdi, offsetof(struct ib_device, + query_pkey), + rvt_query_pkey); + break; + + case QUERY_GID: + if (!check_driver_override(rdi, offsetof(struct ib_device, + query_gid), + rvt_query_gid)) + if (!rdi->driver_f.get_guid_be) + return -EINVAL; + break; + + case ALLOC_UCONTEXT: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_ucontext), + rvt_alloc_ucontext); + break; + + case DEALLOC_UCONTEXT: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_ucontext), + rvt_dealloc_ucontext); + break; + + case GET_PORT_IMMUTABLE: + check_driver_override(rdi, offsetof(struct ib_device, + get_port_immutable), + rvt_get_port_immutable); + break; + + case CREATE_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + create_qp), + rvt_create_qp)) + if (!rdi->driver_f.qp_priv_alloc || + !rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp) + return -EINVAL; + break; + + case MODIFY_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + modify_qp), + rvt_modify_qp)) + if (!rdi->driver_f.notify_qp_reset || + !rdi->driver_f.schedule_send || + !rdi->driver_f.get_pmtu_from_attr || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp || + !rdi->driver_f.notify_error_qp || + !rdi->driver_f.mtu_from_qp || + !rdi->driver_f.mtu_to_path_mtu || + !rdi->driver_f.shut_down_port || + !rdi->driver_f.cap_mask_chg) + return -EINVAL; + break; + + case DESTROY_QP: + if (!check_driver_override(rdi, offsetof(struct ib_device, + destroy_qp), + rvt_destroy_qp)) + if (!rdi->driver_f.qp_priv_free || + !rdi->driver_f.notify_qp_reset || + !rdi->driver_f.flush_qp_waiters || + !rdi->driver_f.stop_send_queue || + !rdi->driver_f.quiesce_qp) + return -EINVAL; + break; + + case QUERY_QP: + check_driver_override(rdi, offsetof(struct ib_device, + query_qp), + rvt_query_qp); + break; + + case POST_SEND: + if (!check_driver_override(rdi, offsetof(struct ib_device, + post_send), + rvt_post_send)) + if (!rdi->driver_f.schedule_send || + !rdi->driver_f.do_send) + return -EINVAL; + break; + + case POST_RECV: + check_driver_override(rdi, offsetof(struct ib_device, + post_recv), + rvt_post_recv); + break; + case POST_SRQ_RECV: + check_driver_override(rdi, offsetof(struct ib_device, + post_srq_recv), + rvt_post_srq_recv); + break; + + case CREATE_AH: + check_driver_override(rdi, offsetof(struct ib_device, + create_ah), + rvt_create_ah); + break; + + case DESTROY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_ah), + rvt_destroy_ah); + break; + + case MODIFY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + modify_ah), + rvt_modify_ah); + break; + + case QUERY_AH: + check_driver_override(rdi, offsetof(struct ib_device, + query_ah), + rvt_query_ah); + break; + + case CREATE_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + create_srq), + rvt_create_srq); + break; + + case MODIFY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + modify_srq), + rvt_modify_srq); + break; + + case DESTROY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_srq), + rvt_destroy_srq); + break; + + case QUERY_SRQ: + check_driver_override(rdi, offsetof(struct ib_device, + query_srq), + rvt_query_srq); + break; + + case ATTACH_MCAST: + check_driver_override(rdi, offsetof(struct ib_device, + attach_mcast), + rvt_attach_mcast); + break; + + case DETACH_MCAST: + check_driver_override(rdi, offsetof(struct ib_device, + detach_mcast), + rvt_detach_mcast); + break; + + case GET_DMA_MR: + check_driver_override(rdi, offsetof(struct ib_device, + get_dma_mr), + rvt_get_dma_mr); + break; + + case REG_USER_MR: + check_driver_override(rdi, offsetof(struct ib_device, + reg_user_mr), + rvt_reg_user_mr); + break; + + case DEREG_MR: + check_driver_override(rdi, offsetof(struct ib_device, + dereg_mr), + rvt_dereg_mr); + break; + + case ALLOC_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_fmr), + rvt_alloc_fmr); + break; + + case ALLOC_MR: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_mr), + rvt_alloc_mr); + break; + + case MAP_PHYS_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + map_phys_fmr), + rvt_map_phys_fmr); + break; + + case UNMAP_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + unmap_fmr), + rvt_unmap_fmr); + break; + + case DEALLOC_FMR: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_fmr), + rvt_dealloc_fmr); + break; + + case MMAP: + check_driver_override(rdi, offsetof(struct ib_device, + mmap), + rvt_mmap); + break; + + case CREATE_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + create_cq), + rvt_create_cq); + break; + + case DESTROY_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + destroy_cq), + rvt_destroy_cq); + break; + + case POLL_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + poll_cq), + rvt_poll_cq); + break; + + case REQ_NOTFIY_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + req_notify_cq), + rvt_req_notify_cq); + break; + + case RESIZE_CQ: + check_driver_override(rdi, offsetof(struct ib_device, + resize_cq), + rvt_resize_cq); + break; + + case ALLOC_PD: + check_driver_override(rdi, offsetof(struct ib_device, + alloc_pd), + rvt_alloc_pd); + break; + + case DEALLOC_PD: + check_driver_override(rdi, offsetof(struct ib_device, + dealloc_pd), + rvt_dealloc_pd); + break; + + default: + return -EINVAL; + } + + return 0; +} + +/** + * rvt_register_device - register a driver + * @rdi: main dev structure for all of rdmavt operations + * + * It is up to drivers to allocate the rdi and fill in the appropriate + * information. + * + * Return: 0 on success otherwise an errno. + */ +int rvt_register_device(struct rvt_dev_info *rdi) +{ + int ret = 0, i; + + if (!rdi) + return -EINVAL; + + /* + * Check to ensure drivers have setup the required helpers for the verbs + * they want rdmavt to handle + */ + for (i = 0; i < _VERB_IDX_MAX; i++) + if (check_support(rdi, i)) { + pr_err("Driver support req not met at %d\n", i); + return -EINVAL; + } + + + /* Once we get past here we can use rvt_pr macros and tracepoints */ + trace_rvt_dbg(rdi, "Driver attempting registration"); + rvt_mmap_init(rdi); + + /* Queue Pairs */ + ret = rvt_driver_qp_init(rdi); + if (ret) { + pr_err("Error in driver QP init.\n"); + return -EINVAL; + } + + /* Address Handle */ + spin_lock_init(&rdi->n_ahs_lock); + rdi->n_ahs_allocated = 0; + + /* Shared Receive Queue */ + rvt_driver_srq_init(rdi); + + /* Multicast */ + rvt_driver_mcast_init(rdi); + + /* Mem Region */ + ret = rvt_driver_mr_init(rdi); + if (ret) { + pr_err("Error in driver MR init.\n"); + goto bail_no_mr; + } + + /* Completion queues */ + ret = rvt_driver_cq_init(rdi); + if (ret) { + pr_err("Error in driver CQ init.\n"); + goto bail_mr; + } + + /* DMA Operations */ + rdi->ibdev.dma_ops = + rdi->ibdev.dma_ops ? : &rvt_default_dma_mapping_ops; + + /* Protection Domain */ + spin_lock_init(&rdi->n_pds_lock); + rdi->n_pds_allocated = 0; + + /* + * There are some things which could be set by underlying drivers but + * really should be up to rdmavt to set. For instance drivers can't know + * exactly which functions rdmavt supports, nor do they know the ABI + * version, so we do all of this sort of stuff here. + */ + rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION; + rdi->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + rdi->ibdev.node_type = RDMA_NODE_IB_CA; + rdi->ibdev.num_comp_vectors = 1; + + /* We are now good to announce we exist */ + ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + if (ret) { + rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); + goto bail_cq; + } + + rvt_create_mad_agents(rdi); + + rvt_pr_info(rdi, "Registration with rdmavt done.\n"); + return ret; + +bail_cq: + rvt_cq_exit(rdi); + +bail_mr: + rvt_mr_exit(rdi); + +bail_no_mr: + rvt_qp_exit(rdi); + + return ret; +} +EXPORT_SYMBOL(rvt_register_device); + +/** + * rvt_unregister_device - remove a driver + * @rdi: rvt dev struct + */ +void rvt_unregister_device(struct rvt_dev_info *rdi) +{ + trace_rvt_dbg(rdi, "Driver is unregistering."); + if (!rdi) + return; + + rvt_free_mad_agents(rdi); + + ib_unregister_device(&rdi->ibdev); + rvt_cq_exit(rdi); + rvt_mr_exit(rdi); + rvt_qp_exit(rdi); +} +EXPORT_SYMBOL(rvt_unregister_device); + +/** + * rvt_init_port - init internal data for driver port + * @rdi: rvt dev strut + * @port: rvt port + * @port_index: 0 based index of ports, different from IB core port num + * + * Keep track of a list of ports. No need to have a detach port. + * They persist until the driver goes away. + * + * Return: always 0 + */ +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table) +{ + + rdi->ports[port_index] = port; + rdi->ports[port_index]->pkey_table = pkey_table; + + return 0; +} +EXPORT_SYMBOL(rvt_init_port); diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h new file mode 100644 index 000000000000..6b01eaa4461b --- /dev/null +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -0,0 +1,104 @@ +#ifndef DEF_RDMAVT_H +#define DEF_RDMAVT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> +#include <linux/pci.h> +#include "dma.h" +#include "pd.h" +#include "qp.h" +#include "ah.h" +#include "mr.h" +#include "srq.h" +#include "mcast.h" +#include "mmap.h" +#include "cq.h" +#include "mad.h" +#include "mmap.h" + +#define rvt_pr_info(rdi, fmt, ...) \ + __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \ + rdi->driver_f.get_card_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define rvt_pr_warn(rdi, fmt, ...) \ + __rvt_pr_warn(rdi->driver_f.get_pci_dev(rdi), \ + rdi->driver_f.get_card_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define rvt_pr_err(rdi, fmt, ...) \ + __rvt_pr_err(rdi->driver_f.get_pci_dev(rdi), \ + rdi->driver_f.get_card_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + +#define __rvt_pr_info(pdev, name, fmt, ...) \ + dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +#define __rvt_pr_warn(pdev, name, fmt, ...) \ + dev_warn(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +#define __rvt_pr_err(pdev, name, fmt, ...) \ + dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) + +static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num) +{ + struct rvt_dev_info *rdi = ib_to_rvt(ibdev); + int port_index; + + port_index = port_num - 1; /* IB ports start at 1 our arrays at 0 */ + if ((port_index < 0) || (port_index >= rdi->dparms.nports)) + return -EINVAL; + + return port_index; +} + +#endif /* DEF_RDMAVT_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index a6f3eab0f350..caec8e9c4666 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -244,6 +244,7 @@ struct ipoib_cm_tx { unsigned tx_tail; unsigned long flags; u32 mtu; + unsigned max_send_sge; }; struct ipoib_cm_rx_buf { @@ -387,9 +388,10 @@ struct ipoib_dev_priv { struct dentry *mcg_dentry; struct dentry *path_dentry; #endif - int hca_caps; + u64 hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; + unsigned max_send_sge; }; struct ipoib_ah { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 917e46ea3bf6..c8ed53562c9b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; int rc; + unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -719,7 +720,23 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); return; } - + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", tx->tx_head, skb->len, tx->qp->qp_num); @@ -1031,7 +1048,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ struct ib_qp *tx_qp; if (dev->features & NETIF_F_SG) - attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + attr.cap.max_send_sge = + min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); tx_qp = ib_create_qp(priv->pd, &attr); if (PTR_ERR(tx_qp) == -EINVAL) { @@ -1040,6 +1058,7 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; tx_qp = ib_create_qp(priv->pd, &attr); } + tx->max_send_sge = attr.cap.max_send_sge; return tx_qp; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index fa9c42ff1fb0..f0e55e47eb54 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -180,6 +180,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) struct sk_buff *skb; u64 mapping[IPOIB_UD_RX_SG]; union ib_gid *dgid; + union ib_gid *sgid; ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", wr_id, wc->status); @@ -203,13 +204,6 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) return; } - /* - * Drop packets that this interface sent, ie multicast packets - * that the HCA has replicated. - */ - if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) - goto repost; - memcpy(mapping, priv->rx_ring[wr_id].mapping, IPOIB_UD_RX_SG * sizeof *mapping); @@ -239,6 +233,25 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) else skb->pkt_type = PACKET_MULTICAST; + sgid = &((struct ib_grh *)skb->data)->sgid; + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) { + int need_repost = 1; + + if ((wc->wc_flags & IB_WC_GRH) && + sgid->global.interface_id != priv->local_gid.global.interface_id) + need_repost = 0; + + if (need_repost) { + dev_kfree_skb_any(skb); + goto repost; + } + } + skb_pull(skb, IB_GRH_BYTES); skb->protocol = ((struct ipoib_header *) skb->data)->proto; @@ -538,6 +551,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; int hlen, rc; void *phead; + unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb); if (skb_is_gso(skb)) { hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); @@ -561,6 +575,23 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, phead = NULL; hlen = 0; } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", skb->len, address, qpn); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 25509bbd4a05..80807d6e5c4c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -51,6 +51,7 @@ #include <net/addrconf.h> #include <linux/inetdevice.h> #include <rdma/ib_cache.h> +#include <linux/pci.h> #define DRV_VERSION "1.0.0" @@ -1590,11 +1591,67 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->tx_ring = NULL; } +static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); +} + +static int ipoib_get_vf_config(struct net_device *dev, int vf, + struct ifla_vf_info *ivf) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int err; + + err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); + if (err) + return err; + + ivf->vf = vf; + + return 0; +} + +static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) + return -EINVAL; + + return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); +} + +static int ipoib_get_vf_stats(struct net_device *dev, int vf, + struct ifla_vf_stats *vf_stats) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); +} + static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; -static const struct net_device_ops ipoib_netdev_ops = { +static const struct net_device_ops ipoib_netdev_ops_pf = { + .ndo_uninit = ipoib_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_set_vf_link_state = ipoib_set_vf_link_state, + .ndo_get_vf_config = ipoib_get_vf_config, + .ndo_get_vf_stats = ipoib_get_vf_stats, + .ndo_set_vf_guid = ipoib_set_vf_guid, +}; + +static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_uninit = ipoib_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, @@ -1610,7 +1667,11 @@ void ipoib_setup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - dev->netdev_ops = &ipoib_netdev_ops; + if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION) + dev->netdev_ops = &ipoib_netdev_ops_vf; + else + dev->netdev_ops = &ipoib_netdev_ops_pf; + dev->header_ops = &ipoib_header_ops; ipoib_set_ethtool_ops(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index d48c5bae7877..b809c373e40e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -206,7 +206,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; if (dev->features & NETIF_F_SG) - init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + init_attr.cap.max_send_sge = + min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { @@ -233,6 +234,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; + priv->max_send_sge = init_attr.cap.max_send_sge; + return 0; out_free_send_cq: diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index c827c93f46c5..80b6bedc172f 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -969,7 +969,16 @@ static umode_t iser_attr_is_visible(int param_type, int param) static int iscsi_iser_slave_alloc(struct scsi_device *sdev) { - blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); + struct iscsi_session *session; + struct iser_conn *iser_conn; + struct ib_device *ib_dev; + + session = starget_to_session(scsi_target(sdev))->dd_data; + iser_conn = session->leadconn->dd_data; + ib_dev = iser_conn->ib_conn.device->ib_device; + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, ~MASK_4K); return 0; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 95f0a64e076b..0351059783b1 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -458,9 +458,6 @@ struct iser_fr_pool { * @comp: iser completion context * @fr_pool: connection fast registration poool * @pi_support: Indicate device T10-PI support - * @last: last send wr to signal all flush errors were drained - * @last_cqe: cqe handler for last wr - * @last_comp: completes when all connection completions consumed */ struct ib_conn { struct rdma_cm_id *cma_id; @@ -472,10 +469,7 @@ struct ib_conn { struct iser_comp *comp; struct iser_fr_pool fr_pool; bool pi_support; - struct ib_send_wr last; - struct ib_cqe last_cqe; struct ib_cqe reg_cqe; - struct completion last_comp; }; /** @@ -617,7 +611,6 @@ void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); -void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_task_rdma_init(struct iscsi_iser_task *task); diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index ed54b388e7ad..81ae2e30dd12 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -729,13 +729,6 @@ void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) kmem_cache_free(ig.desc_cache, desc); } -void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_conn *ib_conn = wc->qp->qp_context; - - complete(&ib_conn->last_comp); -} - void iser_task_rdma_init(struct iscsi_iser_task *iser_task) { diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 40c0f4978e2f..1b4945367e4f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -252,14 +252,21 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) } static int -iser_alloc_reg_res(struct ib_device *ib_device, +iser_alloc_reg_res(struct iser_device *device, struct ib_pd *pd, struct iser_reg_resources *res, unsigned int size) { + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; int ret; - res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size); + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + res->mr = ib_alloc_mr(pd, mr_type, size); if (IS_ERR(res->mr)) { ret = PTR_ERR(res->mr); iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); @@ -277,7 +284,7 @@ iser_free_reg_res(struct iser_reg_resources *rsc) } static int -iser_alloc_pi_ctx(struct ib_device *ib_device, +iser_alloc_pi_ctx(struct iser_device *device, struct ib_pd *pd, struct iser_fr_desc *desc, unsigned int size) @@ -291,7 +298,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, pi_ctx = desc->pi_ctx; - ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size); + ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); if (ret) { iser_err("failed to allocate reg_resources\n"); goto alloc_reg_res_err; @@ -324,7 +331,7 @@ iser_free_pi_ctx(struct iser_pi_context *pi_ctx) } static struct iser_fr_desc * -iser_create_fastreg_desc(struct ib_device *ib_device, +iser_create_fastreg_desc(struct iser_device *device, struct ib_pd *pd, bool pi_enable, unsigned int size) @@ -336,12 +343,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, if (!desc) return ERR_PTR(-ENOMEM); - ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size); + ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); if (ret) goto reg_res_alloc_failure; if (pi_enable) { - ret = iser_alloc_pi_ctx(ib_device, pd, desc, size); + ret = iser_alloc_pi_ctx(device, pd, desc, size); if (ret) goto pi_ctx_alloc_failure; } @@ -374,7 +381,7 @@ int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, spin_lock_init(&fr_pool->lock); fr_pool->size = 0; for (i = 0; i < cmds_max; i++) { - desc = iser_create_fastreg_desc(device->ib_device, device->pd, + desc = iser_create_fastreg_desc(device, device->pd, ib_conn->pi_support, size); if (IS_ERR(desc)) { ret = PTR_ERR(desc); @@ -663,7 +670,6 @@ void iser_conn_release(struct iser_conn *iser_conn) int iser_conn_terminate(struct iser_conn *iser_conn) { struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct ib_send_wr *bad_wr; int err = 0; /* terminate the iser conn only if the conn state is UP */ @@ -688,14 +694,8 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_err("Failed to disconnect, conn: 0x%p err %d\n", iser_conn, err); - /* post an indication that all flush errors were consumed */ - err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr); - if (err) { - iser_err("conn %p failed to post last wr", ib_conn); - return 1; - } - - wait_for_completion(&ib_conn->last_comp); + /* block until all flush errors are consumed */ + ib_drain_sq(ib_conn->qp); } return 1; @@ -954,10 +954,6 @@ void iser_conn_init(struct iser_conn *iser_conn) ib_conn->post_recv_buf_count = 0; ib_conn->reg_cqe.done = iser_reg_comp; - ib_conn->last_cqe.done = iser_last_comp; - ib_conn->last.wr_cqe = &ib_conn->last_cqe; - ib_conn->last.opcode = IB_WR_SEND; - init_completion(&ib_conn->last_comp); } /** diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 03022f6420d7..b6bf20496021 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -446,49 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) dev->max_pages_per_mr); } -static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct srp_rdma_ch *ch = cq->cq_context; - - complete(&ch->done); -} - -static struct ib_cqe srp_drain_cqe = { - .done = srp_drain_done, -}; - /** * srp_destroy_qp() - destroy an RDMA queue pair * @ch: SRP RDMA channel. * - * Change a queue pair into the error state and wait until all receive - * completions have been processed before destroying it. This avoids that - * the receive completion handler can access the queue pair while it is + * Drain the qp before destroying it. This avoids that the receive + * completion handler can access the queue pair while it is * being destroyed. */ static void srp_destroy_qp(struct srp_rdma_ch *ch) { - static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; - static struct ib_recv_wr wr = { 0 }; - struct ib_recv_wr *bad_wr; - int ret; - - wr.wr_cqe = &srp_drain_cqe; - /* Destroying a QP and reusing ch->done is only safe if not connected */ - WARN_ON_ONCE(ch->connected); - - ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE); - WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret); - if (ret) - goto out; - - init_completion(&ch->done); - ret = ib_post_recv(ch->qp, &wr, &bad_wr); - WARN_ONCE(ret, "ib_post_recv() returned %d\n", ret); - if (ret == 0) - wait_for_completion(&ch->done); - -out: + ib_drain_rq(ch->qp); ib_destroy_qp(ch->qp); } @@ -508,7 +476,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (!init_attr) return -ENOMEM; - /* queue_size + 1 for ib_drain_qp */ + /* queue_size + 1 for ib_drain_rq() */ recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, ch->comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(recv_cq)) { diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 0c37fee363b1..578c3703421d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -91,76 +91,32 @@ MODULE_PARM_DESC(srpt_service_guid, " instead of using the node_guid of the first HCA."); static struct ib_client srpt_client; -static void srpt_release_channel(struct srpt_rdma_ch *ch); +static void srpt_release_cmd(struct se_cmd *se_cmd); +static void srpt_free_ch(struct kref *kref); static int srpt_queue_status(struct se_cmd *cmd); static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_process_wait_list(struct srpt_rdma_ch *ch); -/** - * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. - */ -static inline -enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir) -{ - switch (dir) { - case DMA_TO_DEVICE: return DMA_FROM_DEVICE; - case DMA_FROM_DEVICE: return DMA_TO_DEVICE; - default: return dir; - } -} - -/** - * srpt_sdev_name() - Return the name associated with the HCA. - * - * Examples are ib0, ib1, ... - */ -static inline const char *srpt_sdev_name(struct srpt_device *sdev) -{ - return sdev->device->name; -} - -static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch) -{ - unsigned long flags; - enum rdma_ch_state state; - - spin_lock_irqsave(&ch->spinlock, flags); - state = ch->state; - spin_unlock_irqrestore(&ch->spinlock, flags); - return state; -} - -static enum rdma_ch_state -srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state) -{ - unsigned long flags; - enum rdma_ch_state prev; - - spin_lock_irqsave(&ch->spinlock, flags); - prev = ch->state; - ch->state = new_state; - spin_unlock_irqrestore(&ch->spinlock, flags); - return prev; -} - -/** - * srpt_test_and_set_ch_state() - Test and set the channel state. - * - * Returns true if and only if the channel state has been set to the new state. +/* + * The only allowed channel state changes are those that change the channel + * state into a state with a higher numerical value. Hence the new > prev test. */ -static bool -srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old, - enum rdma_ch_state new) +static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) { unsigned long flags; enum rdma_ch_state prev; + bool changed = false; spin_lock_irqsave(&ch->spinlock, flags); prev = ch->state; - if (prev == old) + if (new > prev) { ch->state = new; + changed = true; + } spin_unlock_irqrestore(&ch->spinlock, flags); - return prev == old; + + return changed; } /** @@ -182,7 +138,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, return; pr_debug("ASYNC event= %d on device= %s\n", event->event, - srpt_sdev_name(sdev)); + sdev->device->name); switch (event->event) { case IB_EVENT_PORT_ERR: @@ -220,25 +176,39 @@ static void srpt_srq_event(struct ib_event *event, void *ctx) pr_info("SRQ event %d\n", event->event); } +static const char *get_ch_state_name(enum rdma_ch_state s) +{ + switch (s) { + case CH_CONNECTING: + return "connecting"; + case CH_LIVE: + return "live"; + case CH_DISCONNECTING: + return "disconnecting"; + case CH_DRAINING: + return "draining"; + case CH_DISCONNECTED: + return "disconnected"; + } + return "???"; +} + /** * srpt_qp_event() - QP event callback function. */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", - event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch)); + event->event, ch->cm_id, ch->sess_name, ch->state); switch (event->event) { case IB_EVENT_COMM_EST: ib_cm_notify(ch->cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: - if (srpt_test_and_set_ch_state(ch, CH_DRAINING, - CH_RELEASING)) - srpt_release_channel(ch); - else - pr_debug("%s: state %d - ignored LAST_WQE.\n", - ch->sess_name, srpt_get_ch_state(ch)); + pr_debug("%s-%d, state %s: received Last WQE event.\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); break; default: pr_err("received unrecognized IB QP event %d\n", event->event); @@ -281,7 +251,7 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) struct ib_class_port_info *cif; cif = (struct ib_class_port_info *)mad->data; - memset(cif, 0, sizeof *cif); + memset(cif, 0, sizeof(*cif)); cif->base_version = 1; cif->class_version = 1; cif->resp_time_value = 20; @@ -340,7 +310,7 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, return; } - memset(iocp, 0, sizeof *iocp); + memset(iocp, 0, sizeof(*iocp)); strcpy(iocp->id_string, SRPT_ID_STRING); iocp->guid = cpu_to_be64(srpt_service_guid); iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); @@ -390,7 +360,7 @@ static void srpt_get_svc_entries(u64 ioc_guid, } svc_entries = (struct ib_dm_svc_entries *)mad->data; - memset(svc_entries, 0, sizeof *svc_entries); + memset(svc_entries, 0, sizeof(*svc_entries)); svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid); snprintf(svc_entries->service_entries[0].name, sizeof(svc_entries->service_entries[0].name), @@ -484,7 +454,7 @@ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, rsp->ah = ah; dm_mad = rsp->mad; - memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad); + memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof(*dm_mad)); dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; dm_mad->mad_hdr.status = 0; @@ -532,7 +502,7 @@ static int srpt_refresh_port(struct srpt_port *sport) struct ib_port_attr port_attr; int ret; - memset(&port_modify, 0, sizeof port_modify); + memset(&port_modify, 0, sizeof(port_modify)); port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; port_modify.clr_port_cap_mask = 0; @@ -553,7 +523,7 @@ static int srpt_refresh_port(struct srpt_port *sport) goto err_query_port; if (!sport->mad_agent) { - memset(®_req, 0, sizeof reg_req); + memset(®_req, 0, sizeof(reg_req)); reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT; reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION; set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask); @@ -841,6 +811,39 @@ out: } /** + * srpt_zerolength_write() - Perform a zero-length RDMA write. + * + * A quote from the InfiniBand specification: C9-88: For an HCA responder + * using Reliable Connection service, for each zero-length RDMA READ or WRITE + * request, the R_Key shall not be validated, even if the request includes + * Immediate data. + */ +static int srpt_zerolength_write(struct srpt_rdma_ch *ch) +{ + struct ib_send_wr wr, *bad_wr; + + memset(&wr, 0, sizeof(wr)); + wr.opcode = IB_WR_RDMA_WRITE; + wr.wr_cqe = &ch->zw_cqe; + wr.send_flags = IB_SEND_SIGNALED; + return ib_post_send(ch->qp, &wr, &bad_wr); +} + +static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srpt_rdma_ch *ch = cq->cq_context; + + if (wc->status == IB_WC_SUCCESS) { + srpt_process_wait_list(ch); + } else { + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + WARN_ONCE(1, "%s-%d\n", ch->sess_name, ch->qp->qp_num); + } +} + +/** * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. * @ioctx: Pointer to the I/O context associated with the request. * @srp_cmd: Pointer to the SRP_CMD request data. @@ -903,14 +906,14 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, db = (struct srp_direct_buf *)(srp_cmd->add_data + add_cdb_offset); - memcpy(ioctx->rbufs, db, sizeof *db); + memcpy(ioctx->rbufs, db, sizeof(*db)); *data_len = be32_to_cpu(db->len); } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { idb = (struct srp_indirect_buf *)(srp_cmd->add_data + add_cdb_offset); - ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db; + ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db); if (ioctx->n_rbuf > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { @@ -929,7 +932,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, ioctx->rbufs = &ioctx->single_rbuf; else { ioctx->rbufs = - kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC); + kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC); if (!ioctx->rbufs) { ioctx->n_rbuf = 0; ret = -ENOMEM; @@ -938,7 +941,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, } db = idb->desc_list; - memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db); + memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db)); *data_len = be32_to_cpu(idb->len); } out: @@ -956,7 +959,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) struct ib_qp_attr *attr; int ret; - attr = kzalloc(sizeof *attr, GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; @@ -1070,7 +1073,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, dir = ioctx->cmd.data_direction; BUG_ON(dir == DMA_NONE); ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, - opposite_dma_dir(dir)); + target_reverse_dma_direction(&ioctx->cmd)); ioctx->mapped_sg_count = 0; } } @@ -1107,7 +1110,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, - opposite_dma_dir(dir)); + target_reverse_dma_direction(cmd)); if (unlikely(!count)) return -EAGAIN; @@ -1313,10 +1316,7 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) /* * If the command is in a state where the target core is waiting for - * the ib_srpt driver, change the state to the next state. Changing - * the state of the command from SRPT_STATE_NEED_DATA to - * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this - * function a second time. + * the ib_srpt driver, change the state to the next state. */ spin_lock_irqsave(&ioctx->spinlock, flags); @@ -1325,25 +1325,17 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) case SRPT_STATE_NEED_DATA: ioctx->state = SRPT_STATE_DATA_IN; break; - case SRPT_STATE_DATA_IN: case SRPT_STATE_CMD_RSP_SENT: case SRPT_STATE_MGMT_RSP_SENT: ioctx->state = SRPT_STATE_DONE; break; default: + WARN_ONCE(true, "%s: unexpected I/O context state %d\n", + __func__, state); break; } spin_unlock_irqrestore(&ioctx->spinlock, flags); - if (state == SRPT_STATE_DONE) { - struct srpt_rdma_ch *ch = ioctx->ch; - - BUG_ON(ch->sess == NULL); - - target_put_sess_cmd(&ioctx->cmd); - goto out; - } - pr_debug("Aborting cmd with state %d and tag %lld\n", state, ioctx->cmd.tag); @@ -1351,19 +1343,16 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) case SRPT_STATE_NEW: case SRPT_STATE_DATA_IN: case SRPT_STATE_MGMT: + case SRPT_STATE_DONE: /* * Do nothing - defer abort processing until * srpt_queue_response() is invoked. */ - WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false)); break; case SRPT_STATE_NEED_DATA: - /* DMA_TO_DEVICE (write) - RDMA read error. */ - - /* XXX(hch): this is a horrible layering violation.. */ - spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags); - ioctx->cmd.transport_state &= ~CMD_T_ACTIVE; - spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags); + pr_debug("tag %#llx: RDMA read error\n", ioctx->cmd.tag); + transport_generic_request_failure(&ioctx->cmd, + TCM_CHECK_CONDITION_ABORT_CMD); break; case SRPT_STATE_CMD_RSP_SENT: /* @@ -1371,18 +1360,16 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * not been received in time. */ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); - target_put_sess_cmd(&ioctx->cmd); + transport_generic_free_cmd(&ioctx->cmd, 0); break; case SRPT_STATE_MGMT_RSP_SENT: - srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - target_put_sess_cmd(&ioctx->cmd); + transport_generic_free_cmd(&ioctx->cmd, 0); break; default: WARN(1, "Unexpected command state (%d)", state); break; } -out: return state; } @@ -1422,9 +1409,14 @@ static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); if (unlikely(wc->status != IB_WC_SUCCESS)) { + /* + * Note: if an RDMA write error completion is received that + * means that a SEND also has been posted. Defer further + * processing of the associated command until the send error + * completion has been received. + */ pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", ioctx, wc->status); - srpt_abort_cmd(ioctx); } } @@ -1464,7 +1456,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, sense_data_len = ioctx->cmd.scsi_sense_length; WARN_ON(sense_data_len > sizeof(ioctx->sense_data)); - memset(srp_rsp, 0, sizeof *srp_rsp); + memset(srp_rsp, 0, sizeof(*srp_rsp)); srp_rsp->opcode = SRP_RSP; srp_rsp->req_lim_delta = cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0)); @@ -1514,7 +1506,7 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, srp_rsp = ioctx->ioctx.buf; BUG_ON(!srp_rsp); - memset(srp_rsp, 0, sizeof *srp_rsp); + memset(srp_rsp, 0, sizeof(*srp_rsp)); srp_rsp->opcode = SRP_RSP; srp_rsp->req_lim_delta = @@ -1528,80 +1520,6 @@ static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch, return resp_len; } -#define NO_SUCH_LUN ((uint64_t)-1LL) - -/* - * SCSI LUN addressing method. See also SAM-2 and the section about - * eight byte LUNs. - */ -enum scsi_lun_addr_method { - SCSI_LUN_ADDR_METHOD_PERIPHERAL = 0, - SCSI_LUN_ADDR_METHOD_FLAT = 1, - SCSI_LUN_ADDR_METHOD_LUN = 2, - SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3, -}; - -/* - * srpt_unpack_lun() - Convert from network LUN to linear LUN. - * - * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte - * order (big endian) to a linear LUN. Supports three LUN addressing methods: - * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40). - */ -static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) -{ - uint64_t res = NO_SUCH_LUN; - int addressing_method; - - if (unlikely(len < 2)) { - pr_err("Illegal LUN length %d, expected 2 bytes or more\n", - len); - goto out; - } - - switch (len) { - case 8: - if ((*((__be64 *)lun) & - cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0) - goto out_err; - break; - case 4: - if (*((__be16 *)&lun[2]) != 0) - goto out_err; - break; - case 6: - if (*((__be32 *)&lun[2]) != 0) - goto out_err; - break; - case 2: - break; - default: - goto out_err; - } - - addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */ - switch (addressing_method) { - case SCSI_LUN_ADDR_METHOD_PERIPHERAL: - case SCSI_LUN_ADDR_METHOD_FLAT: - case SCSI_LUN_ADDR_METHOD_LUN: - res = *(lun + 1) | (((*lun) & 0x3f) << 8); - break; - - case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: - default: - pr_err("Unimplemented LUN addressing method %u\n", - addressing_method); - break; - } - -out: - return res; - -out_err: - pr_err("Support for multi-level LUNs has not yet been implemented\n"); - goto out; -} - static int srpt_check_stop_free(struct se_cmd *cmd) { struct srpt_send_ioctx *ioctx = container_of(cmd, @@ -1613,16 +1531,14 @@ static int srpt_check_stop_free(struct se_cmd *cmd) /** * srpt_handle_cmd() - Process SRP_CMD. */ -static int srpt_handle_cmd(struct srpt_rdma_ch *ch, - struct srpt_recv_ioctx *recv_ioctx, - struct srpt_send_ioctx *send_ioctx) +static void srpt_handle_cmd(struct srpt_rdma_ch *ch, + struct srpt_recv_ioctx *recv_ioctx, + struct srpt_send_ioctx *send_ioctx) { struct se_cmd *cmd; struct srp_cmd *srp_cmd; - uint64_t unpacked_lun; u64 data_len; enum dma_data_direction dir; - sense_reason_t ret; int rc; BUG_ON(!send_ioctx); @@ -1650,65 +1566,23 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { pr_err("0x%llx: parsing SRP descriptor table failed.\n", srp_cmd->tag); - ret = TCM_INVALID_CDB_FIELD; - goto send_sense; + goto release_ioctx; } - unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun, - sizeof(srp_cmd->lun)); rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, - &send_ioctx->sense_data[0], unpacked_lun, data_len, - TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + &send_ioctx->sense_data[0], + scsilun_to_int(&srp_cmd->lun), data_len, + TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); if (rc != 0) { - ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - goto send_sense; + pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc, + srp_cmd->tag); + goto release_ioctx; } - return 0; - -send_sense: - transport_send_check_condition_and_sense(cmd, ret, 0); - return -1; -} - -/** - * srpt_rx_mgmt_fn_tag() - Process a task management function by tag. - * @ch: RDMA channel of the task management request. - * @fn: Task management function to perform. - * @req_tag: Tag of the SRP task management request. - * @mgmt_ioctx: I/O context of the task management request. - * - * Returns zero if the target core will process the task management - * request asynchronously. - * - * Note: It is assumed that the initiator serializes tag-based task management - * requests. - */ -static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag) -{ - struct srpt_device *sdev; - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *target; - int ret, i; + return; - ret = -EINVAL; - ch = ioctx->ch; - BUG_ON(!ch); - BUG_ON(!ch->sport); - sdev = ch->sport->sdev; - BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - for (i = 0; i < ch->rq_size; ++i) { - target = ch->ioctx_ring[i]; - if (target->cmd.se_lun == ioctx->cmd.se_lun && - target->cmd.tag == tag && - srpt_get_cmd_state(target) != SRPT_STATE_DONE) { - ret = 0; - /* now let the target core abort &target->cmd; */ - break; - } - } - spin_unlock_irq(&sdev->spinlock); - return ret; +release_ioctx: + send_ioctx->state = SRPT_STATE_DONE; + srpt_release_cmd(cmd); } static int srp_tmr_to_tcm(int fn) @@ -1744,8 +1618,6 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, struct srp_tsk_mgmt *srp_tsk; struct se_cmd *cmd; struct se_session *sess = ch->sess; - uint64_t unpacked_lun; - uint32_t tag = 0; int tcm_tmr; int rc; @@ -1761,26 +1633,10 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); send_ioctx->cmd.tag = srp_tsk->tag; tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func); - if (tcm_tmr < 0) { - send_ioctx->cmd.se_tmr_req->response = - TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; - goto fail; - } - unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun, - sizeof(srp_tsk->lun)); - - if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) { - rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag); - if (rc < 0) { - send_ioctx->cmd.se_tmr_req->response = - TMR_TASK_DOES_NOT_EXIST; - goto fail; - } - tag = srp_tsk->task_tag; - } - rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun, - srp_tsk, tcm_tmr, GFP_KERNEL, tag, - TARGET_SCF_ACK_KREF); + rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, + scsilun_to_int(&srp_tsk->lun), srp_tsk, tcm_tmr, + GFP_KERNEL, srp_tsk->task_tag, + TARGET_SCF_ACK_KREF); if (rc != 0) { send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED; goto fail; @@ -1800,7 +1656,6 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *send_ioctx) { struct srp_cmd *srp_cmd; - enum rdma_ch_state ch_state; BUG_ON(!ch); BUG_ON(!recv_ioctx); @@ -1809,13 +1664,12 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - ch_state = srpt_get_ch_state(ch); - if (unlikely(ch_state == CH_CONNECTING)) { + if (unlikely(ch->state == CH_CONNECTING)) { list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); goto out; } - if (unlikely(ch_state != CH_LIVE)) + if (unlikely(ch->state != CH_LIVE)) goto out; srp_cmd = recv_ioctx->ioctx.buf; @@ -1878,6 +1732,28 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) } } +/* + * This function must be called from the context in which RDMA completions are + * processed because it accesses the wait list without protection against + * access from other threads. + */ +static void srpt_process_wait_list(struct srpt_rdma_ch *ch) +{ + struct srpt_send_ioctx *ioctx; + + while (!list_empty(&ch->cmd_wait_list) && + ch->state >= CH_LIVE && + (ioctx = srpt_get_send_ioctx(ch)) != NULL) { + struct srpt_recv_ioctx *recv_ioctx; + + recv_ioctx = list_first_entry(&ch->cmd_wait_list, + struct srpt_recv_ioctx, + wait_list); + list_del(&recv_ioctx->wait_list); + srpt_handle_new_iu(ch, recv_ioctx, ioctx); + } +} + /** * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by @@ -1905,15 +1781,10 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) atomic_inc(&ch->sq_wr_avail); - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) pr_info("sending response for ioctx 0x%p failed" " with status %d\n", ioctx, wc->status); - atomic_dec(&ch->req_lim); - srpt_abort_cmd(ioctx); - goto out; - } - if (state != SRPT_STATE_DONE) { srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); @@ -1922,18 +1793,7 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) " wr_id = %u.\n", ioctx->ioctx.index); } -out: - while (!list_empty(&ch->cmd_wait_list) && - srpt_get_ch_state(ch) == CH_LIVE && - (ioctx = srpt_get_send_ioctx(ch)) != NULL) { - struct srpt_recv_ioctx *recv_ioctx; - - recv_ioctx = list_first_entry(&ch->cmd_wait_list, - struct srpt_recv_ioctx, - wait_list); - list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, ioctx); - } + srpt_process_wait_list(ch); } /** @@ -1950,7 +1810,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) WARN_ON(ch->rq_size < 1); ret = -ENOMEM; - qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL); + qp_init = kzalloc(sizeof(*qp_init), GFP_KERNEL); if (!qp_init) goto out; @@ -2017,168 +1877,102 @@ static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) } /** - * __srpt_close_ch() - Close an RDMA channel by setting the QP error state. + * srpt_close_ch() - Close an RDMA channel. * - * Reset the QP and make sure all resources associated with the channel will - * be deallocated at an appropriate time. + * Make sure all resources associated with the channel will be deallocated at + * an appropriate time. * - * Note: The caller must hold ch->sport->sdev->spinlock. + * Returns true if and only if the channel state has been modified into + * CH_DRAINING. */ -static void __srpt_close_ch(struct srpt_rdma_ch *ch) +static bool srpt_close_ch(struct srpt_rdma_ch *ch) { - enum rdma_ch_state prev_state; - unsigned long flags; + int ret; - spin_lock_irqsave(&ch->spinlock, flags); - prev_state = ch->state; - switch (prev_state) { - case CH_CONNECTING: - case CH_LIVE: - ch->state = CH_DISCONNECTING; - break; - default: - break; + if (!srpt_set_ch_state(ch, CH_DRAINING)) { + pr_debug("%s-%d: already closed\n", ch->sess_name, + ch->qp->qp_num); + return false; } - spin_unlock_irqrestore(&ch->spinlock, flags); - - switch (prev_state) { - case CH_CONNECTING: - ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0, - NULL, 0); - /* fall through */ - case CH_LIVE: - if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) - pr_err("sending CM DREQ failed.\n"); - break; - case CH_DISCONNECTING: - break; - case CH_DRAINING: - case CH_RELEASING: - break; - } -} - -/** - * srpt_close_ch() - Close an RDMA channel. - */ -static void srpt_close_ch(struct srpt_rdma_ch *ch) -{ - struct srpt_device *sdev; - sdev = ch->sport->sdev; - spin_lock_irq(&sdev->spinlock); - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); -} + kref_get(&ch->kref); -/** - * srpt_shutdown_session() - Whether or not a session may be shut down. - */ -static int srpt_shutdown_session(struct se_session *se_sess) -{ - struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; - unsigned long flags; + ret = srpt_ch_qp_err(ch); + if (ret < 0) + pr_err("%s-%d: changing queue pair into error state failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); - spin_lock_irqsave(&ch->spinlock, flags); - if (ch->in_shutdown) { - spin_unlock_irqrestore(&ch->spinlock, flags); - return true; + pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, + ch->qp->qp_num); + ret = srpt_zerolength_write(ch); + if (ret < 0) { + pr_err("%s-%d: queuing zero-length write failed: %d\n", + ch->sess_name, ch->qp->qp_num, ret); + if (srpt_set_ch_state(ch, CH_DISCONNECTED)) + schedule_work(&ch->release_work); + else + WARN_ON_ONCE(true); } - ch->in_shutdown = true; - target_sess_cmd_list_set_waiting(se_sess); - spin_unlock_irqrestore(&ch->spinlock, flags); + kref_put(&ch->kref, srpt_free_ch); return true; } -/** - * srpt_drain_channel() - Drain a channel by resetting the IB queue pair. - * @cm_id: Pointer to the CM ID of the channel to be drained. - * - * Note: Must be called from inside srpt_cm_handler to avoid a race between - * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one() - * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one() - * waits until all target sessions for the associated IB device have been - * unregistered and target session registration involves a call to - * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until - * this function has finished). +/* + * Change the channel state into CH_DISCONNECTING. If a channel has not yet + * reached the connected state, close it. If a channel is in the connected + * state, send a DREQ. If a DREQ has been received, send a DREP. Note: it is + * the responsibility of the caller to ensure that this function is not + * invoked concurrently with the code that accepts a connection. This means + * that this function must either be invoked from inside a CM callback + * function or that it must be invoked with the srpt_port.mutex held. */ -static void srpt_drain_channel(struct ib_cm_id *cm_id) +static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) { - struct srpt_device *sdev; - struct srpt_rdma_ch *ch; int ret; - bool do_reset = false; - WARN_ON_ONCE(irqs_disabled()); + if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) + return -ENOTCONN; - sdev = cm_id->context; - BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - list_for_each_entry(ch, &sdev->rch_list, list) { - if (ch->cm_id == cm_id) { - do_reset = srpt_test_and_set_ch_state(ch, - CH_CONNECTING, CH_DRAINING) || - srpt_test_and_set_ch_state(ch, - CH_LIVE, CH_DRAINING) || - srpt_test_and_set_ch_state(ch, - CH_DISCONNECTING, CH_DRAINING); - break; - } - } - spin_unlock_irq(&sdev->spinlock); + ret = ib_send_cm_dreq(ch->cm_id, NULL, 0); + if (ret < 0) + ret = ib_send_cm_drep(ch->cm_id, NULL, 0); - if (do_reset) { - if (ch->sess) - srpt_shutdown_session(ch->sess); + if (ret < 0 && srpt_close_ch(ch)) + ret = 0; - ret = srpt_ch_qp_err(ch); - if (ret < 0) - pr_err("Setting queue pair in error state" - " failed: %d\n", ret); - } + return ret; } -/** - * srpt_find_channel() - Look up an RDMA channel. - * @cm_id: Pointer to the CM ID of the channel to be looked up. - * - * Return NULL if no matching RDMA channel has been found. - */ -static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev, - struct ib_cm_id *cm_id) +static void __srpt_close_all_ch(struct srpt_device *sdev) { struct srpt_rdma_ch *ch; - bool found; - WARN_ON_ONCE(irqs_disabled()); - BUG_ON(!sdev); + lockdep_assert_held(&sdev->mutex); - found = false; - spin_lock_irq(&sdev->spinlock); list_for_each_entry(ch, &sdev->rch_list, list) { - if (ch->cm_id == cm_id) { - found = true; - break; - } + if (srpt_disconnect_ch(ch) >= 0) + pr_info("Closing channel %s-%d because target %s has been disabled\n", + ch->sess_name, ch->qp->qp_num, + sdev->device->name); + srpt_close_ch(ch); } - spin_unlock_irq(&sdev->spinlock); - - return found ? ch : NULL; } /** - * srpt_release_channel() - Release channel resources. - * - * Schedules the actual release because: - * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would - * trigger a deadlock. - * - It is not safe to call TCM transport_* functions from interrupt context. + * srpt_shutdown_session() - Whether or not a session may be shut down. */ -static void srpt_release_channel(struct srpt_rdma_ch *ch) +static int srpt_shutdown_session(struct se_session *se_sess) +{ + return 1; +} + +static void srpt_free_ch(struct kref *kref) { - schedule_work(&ch->release_work); + struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); + + kfree(ch); } static void srpt_release_channel_work(struct work_struct *w) @@ -2188,8 +1982,8 @@ static void srpt_release_channel_work(struct work_struct *w) struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); - pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess, - ch->release_done); + pr_debug("%s: %s-%d; release_done = %p\n", __func__, ch->sess_name, + ch->qp->qp_num, ch->release_done); sdev = ch->sport->sdev; BUG_ON(!sdev); @@ -2197,6 +1991,7 @@ static void srpt_release_channel_work(struct work_struct *w) se_sess = ch->sess; BUG_ON(!se_sess); + target_sess_cmd_list_set_waiting(se_sess); target_wait_for_sess_cmds(se_sess); transport_deregister_session_configfs(se_sess); @@ -2211,16 +2006,15 @@ static void srpt_release_channel_work(struct work_struct *w) ch->sport->sdev, ch->rq_size, ch->rsp_size, DMA_TO_DEVICE); - spin_lock_irq(&sdev->spinlock); - list_del(&ch->list); - spin_unlock_irq(&sdev->spinlock); - + mutex_lock(&sdev->mutex); + list_del_init(&ch->list); if (ch->release_done) complete(ch->release_done); + mutex_unlock(&sdev->mutex); wake_up(&sdev->ch_releaseQ); - kfree(ch); + kref_put(&ch->kref, srpt_free_ch); } /** @@ -2266,9 +2060,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); - rsp = kzalloc(sizeof *rsp, GFP_KERNEL); - rej = kzalloc(sizeof *rej, GFP_KERNEL); - rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL); + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + rej = kzalloc(sizeof(*rej), GFP_KERNEL); + rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); if (!rsp || !rej || !rep_param) { ret = -ENOMEM; @@ -2297,7 +2091,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) @@ -2305,26 +2099,16 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, && param->port == ch->sport->port && param->listen_id == ch->sport->sdev->cm_id && ch->cm_id) { - enum rdma_ch_state ch_state; - - ch_state = srpt_get_ch_state(ch); - if (ch_state != CH_CONNECTING - && ch_state != CH_LIVE) + if (srpt_disconnect_ch(ch) < 0) continue; - - /* found an existing channel */ - pr_debug("Found existing channel %s" - " cm_id= %p state= %d\n", - ch->sess_name, ch->cm_id, ch_state); - - __srpt_close_ch(ch); - + pr_info("Relogin - closed existing channel %s\n", + ch->sess_name); rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; } } - spin_unlock_irq(&sdev->spinlock); + mutex_unlock(&sdev->mutex); } else rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; @@ -2340,7 +2124,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } - ch = kzalloc(sizeof *ch, GFP_KERNEL); + ch = kzalloc(sizeof(*ch), GFP_KERNEL); if (!ch) { rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); @@ -2349,11 +2133,14 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } + kref_init(&ch->kref); + ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); memcpy(ch->i_port_id, req->initiator_port_id, 16); memcpy(ch->t_port_id, req->target_port_id, 16); ch->sport = &sdev->port[param->port - 1]; ch->cm_id = cm_id; + cm_id->context = ch; /* * Avoid QUEUE_FULL conditions by limiting the number of buffers used * for the SRP protocol to the command queue size. @@ -2453,7 +2240,7 @@ try_again: /* create cm reply */ rep_param->qp_num = ch->qp->qp_num; rep_param->private_data = (void *)rsp; - rep_param->private_data_len = sizeof *rsp; + rep_param->private_data_len = sizeof(*rsp); rep_param->rnr_retry_count = 7; rep_param->flow_control = 1; rep_param->failover_accepted = 0; @@ -2468,14 +2255,14 @@ try_again: goto release_channel; } - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); list_add_tail(&ch->list, &sdev->rch_list); - spin_unlock_irq(&sdev->spinlock); + mutex_unlock(&sdev->mutex); goto out; release_channel: - srpt_set_ch_state(ch, CH_RELEASING); + srpt_disconnect_ch(ch); transport_deregister_session_configfs(ch->sess); transport_deregister_session(ch->sess); ch->sess = NULL; @@ -2497,7 +2284,7 @@ reject: | SRP_BUF_FORMAT_INDIRECT); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, - (void *)rej, sizeof *rej); + (void *)rej, sizeof(*rej)); out: kfree(rep_param); @@ -2507,10 +2294,23 @@ out: return ret; } -static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) +static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, + enum ib_cm_rej_reason reason, + const u8 *private_data, + u8 private_data_len) { - pr_info("Received IB REJ for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); + char *priv = NULL; + int i; + + if (private_data_len && (priv = kmalloc(private_data_len * 3 + 1, + GFP_KERNEL))) { + for (i = 0; i < private_data_len; i++) + sprintf(priv + 3 * i, " %02x", private_data[i]); + } + pr_info("Received CM REJ for ch %s-%d; reason %d%s%s.\n", + ch->sess_name, ch->qp->qp_num, reason, private_data_len ? + "; private data" : "", priv ? priv : " (?)"); + kfree(priv); } /** @@ -2519,87 +2319,23 @@ static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) * An IB_CM_RTU_RECEIVED message indicates that the connection is established * and that the recipient may begin transmitting (RTU = ready to use). */ -static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) +static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) { - struct srpt_rdma_ch *ch; int ret; - ch = srpt_find_channel(cm_id->context, cm_id); - BUG_ON(!ch); - - if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) { - struct srpt_recv_ioctx *ioctx, *ioctx_tmp; - + if (srpt_set_ch_state(ch, CH_LIVE)) { ret = srpt_ch_qp_rts(ch, ch->qp); - list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list, - wait_list) { - list_del(&ioctx->wait_list); - srpt_handle_new_iu(ch, ioctx, NULL); - } - if (ret) + if (ret == 0) { + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); + } else { srpt_close_ch(ch); + } } } -static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) -{ - pr_info("Received IB TimeWait exit for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - -static void srpt_cm_rep_error(struct ib_cm_id *cm_id) -{ - pr_info("Received IB REP error for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - -/** - * srpt_cm_dreq_recv() - Process reception of a DREQ message. - */ -static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) -{ - struct srpt_rdma_ch *ch; - unsigned long flags; - bool send_drep = false; - - ch = srpt_find_channel(cm_id->context, cm_id); - BUG_ON(!ch); - - pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch)); - - spin_lock_irqsave(&ch->spinlock, flags); - switch (ch->state) { - case CH_CONNECTING: - case CH_LIVE: - send_drep = true; - ch->state = CH_DISCONNECTING; - break; - case CH_DISCONNECTING: - case CH_DRAINING: - case CH_RELEASING: - WARN(true, "unexpected channel state %d\n", ch->state); - break; - } - spin_unlock_irqrestore(&ch->spinlock, flags); - - if (send_drep) { - if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) - pr_err("Sending IB DREP failed.\n"); - pr_info("Received DREQ and sent DREP for session %s.\n", - ch->sess_name); - } -} - -/** - * srpt_cm_drep_recv() - Process reception of a DREP message. - */ -static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) -{ - pr_info("Received InfiniBand DREP message for cm_id %p.\n", cm_id); - srpt_drain_channel(cm_id); -} - /** * srpt_cm_handler() - IB connection manager callback function. * @@ -2612,6 +2348,7 @@ static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) */ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { + struct srpt_rdma_ch *ch = cm_id->context; int ret; ret = 0; @@ -2621,32 +2358,39 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) event->private_data); break; case IB_CM_REJ_RECEIVED: - srpt_cm_rej_recv(cm_id); + srpt_cm_rej_recv(ch, event->param.rej_rcvd.reason, + event->private_data, + IB_CM_REJ_PRIVATE_DATA_SIZE); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: - srpt_cm_rtu_recv(cm_id); + srpt_cm_rtu_recv(ch); break; case IB_CM_DREQ_RECEIVED: - srpt_cm_dreq_recv(cm_id); + srpt_disconnect_ch(ch); break; case IB_CM_DREP_RECEIVED: - srpt_cm_drep_recv(cm_id); + pr_info("Received CM DREP message for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); break; case IB_CM_TIMEWAIT_EXIT: - srpt_cm_timewait_exit(cm_id); + pr_info("Received CM TimeWait exit for ch %s-%d.\n", + ch->sess_name, ch->qp->qp_num); + srpt_close_ch(ch); break; case IB_CM_REP_ERROR: - srpt_cm_rep_error(cm_id); + pr_info("Received CM REP error for ch %s-%d.\n", ch->sess_name, + ch->qp->qp_num); break; case IB_CM_DREQ_ERROR: - pr_info("Received IB DREQ ERROR event.\n"); + pr_info("Received CM DREQ ERROR event.\n"); break; case IB_CM_MRA_RECEIVED: - pr_info("Received IB MRA event\n"); + pr_info("Received CM MRA event\n"); break; default: - pr_err("received unrecognized IB CM event %d\n", event->event); + pr_err("received unrecognized CM event %d\n", event->event); break; } @@ -2755,41 +2499,14 @@ static int srpt_write_pending_status(struct se_cmd *se_cmd) */ static int srpt_write_pending(struct se_cmd *se_cmd) { - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *ioctx; + struct srpt_send_ioctx *ioctx = + container_of(se_cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; enum srpt_command_state new_state; - enum rdma_ch_state ch_state; - int ret; - - ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); WARN_ON(new_state == SRPT_STATE_DONE); - - ch = ioctx->ch; - BUG_ON(!ch); - - ch_state = srpt_get_ch_state(ch); - switch (ch_state) { - case CH_CONNECTING: - WARN(true, "unexpected channel state %d\n", ch_state); - ret = -EINVAL; - goto out; - case CH_LIVE: - break; - case CH_DISCONNECTING: - case CH_DRAINING: - case CH_RELEASING: - pr_debug("cmd with tag %lld: channel disconnecting\n", - ioctx->cmd.tag); - srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN); - ret = -EINVAL; - goto out; - } - ret = srpt_xfer_data(ch, ioctx); - -out: - return ret; + return srpt_xfer_data(ch, ioctx); } static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) @@ -2920,36 +2637,25 @@ static void srpt_refresh_port_work(struct work_struct *work) srpt_refresh_port(sport); } -static int srpt_ch_list_empty(struct srpt_device *sdev) -{ - int res; - - spin_lock_irq(&sdev->spinlock); - res = list_empty(&sdev->rch_list); - spin_unlock_irq(&sdev->spinlock); - - return res; -} - /** * srpt_release_sdev() - Free the channel resources associated with a target. */ static int srpt_release_sdev(struct srpt_device *sdev) { - struct srpt_rdma_ch *ch, *tmp_ch; - int res; + int i, res; WARN_ON_ONCE(irqs_disabled()); BUG_ON(!sdev); - spin_lock_irq(&sdev->spinlock); - list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); + for (i = 0; i < ARRAY_SIZE(sdev->port); i++) + sdev->port[i].enabled = false; + __srpt_close_all_ch(sdev); + mutex_unlock(&sdev->mutex); res = wait_event_interruptible(sdev->ch_releaseQ, - srpt_ch_list_empty(sdev)); + list_empty_careful(&sdev->rch_list)); if (res) pr_err("%s: interrupted.\n", __func__); @@ -3003,14 +2709,14 @@ static void srpt_add_one(struct ib_device *device) pr_debug("device = %p, device->dma_ops = %p\n", device, device->dma_ops); - sdev = kzalloc(sizeof *sdev, GFP_KERNEL); + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) goto err; sdev->device = device; INIT_LIST_HEAD(&sdev->rch_list); init_waitqueue_head(&sdev->ch_releaseQ); - spin_lock_init(&sdev->spinlock); + mutex_init(&sdev->mutex); sdev->pd = ib_alloc_pd(device); if (IS_ERR(sdev->pd)) @@ -3082,7 +2788,7 @@ static void srpt_add_one(struct ib_device *device) if (srpt_refresh_port(sport)) { pr_err("MAD registration failed for %s-%d.\n", - srpt_sdev_name(sdev), i); + sdev->device->name, i); goto err_ring; } snprintf(sport->port_guid, sizeof(sport->port_guid), @@ -3231,24 +2937,26 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) static void srpt_close_session(struct se_session *se_sess) { DECLARE_COMPLETION_ONSTACK(release_done); - struct srpt_rdma_ch *ch; - struct srpt_device *sdev; - unsigned long res; - - ch = se_sess->fabric_sess_ptr; - WARN_ON(ch->sess != se_sess); + struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; + struct srpt_device *sdev = ch->sport->sdev; + bool wait; - pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch)); + pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, + ch->state); - sdev = ch->sport->sdev; - spin_lock_irq(&sdev->spinlock); + mutex_lock(&sdev->mutex); BUG_ON(ch->release_done); ch->release_done = &release_done; - __srpt_close_ch(ch); - spin_unlock_irq(&sdev->spinlock); + wait = !list_empty(&ch->list); + srpt_disconnect_ch(ch); + mutex_unlock(&sdev->mutex); - res = wait_for_completion_timeout(&release_done, 60 * HZ); - WARN_ON(res == 0); + if (!wait) + return; + + while (wait_for_completion_timeout(&release_done, 180 * HZ) == 0) + pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, + ch->sess_name, ch->qp->qp_num, ch->state); } /** @@ -3456,6 +3164,8 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, { struct se_portal_group *se_tpg = to_tpg(item); struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1); + struct srpt_device *sdev = sport->sdev; + struct srpt_rdma_ch *ch; unsigned long tmp; int ret; @@ -3469,11 +3179,24 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp); return -EINVAL; } - if (tmp == 1) - sport->enabled = true; - else - sport->enabled = false; + if (sport->enabled == tmp) + goto out; + sport->enabled = tmp; + if (sport->enabled) + goto out; + + mutex_lock(&sdev->mutex); + list_for_each_entry(ch, &sdev->rch_list, list) { + if (ch->sport == sport) { + pr_debug("%s: ch %p %s-%d\n", __func__, ch, + ch->sess_name, ch->qp->qp_num); + srpt_disconnect_ch(ch); + srpt_close_ch(ch); + } + } + mutex_unlock(&sdev->mutex); +out: return count; } @@ -3565,7 +3288,6 @@ static struct configfs_attribute *srpt_wwn_attrs[] = { static const struct target_core_fabric_ops srpt_template = { .module = THIS_MODULE, .name = "srpt", - .node_acl_size = sizeof(struct srpt_node_acl), .get_fabric_name = srpt_get_fabric_name, .tpg_get_wwn = srpt_get_fabric_wwn, .tpg_get_tag = srpt_get_tag, diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 09037f2b0b51..af9b8b527340 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -218,20 +218,20 @@ struct srpt_send_ioctx { /** * enum rdma_ch_state - SRP channel state. - * @CH_CONNECTING: QP is in RTR state; waiting for RTU. - * @CH_LIVE: QP is in RTS state. - * @CH_DISCONNECTING: DREQ has been received; waiting for DREP - * or DREQ has been send and waiting for DREP - * or . - * @CH_DRAINING: QP is in ERR state; waiting for last WQE event. - * @CH_RELEASING: Last WQE event has been received; releasing resources. + * @CH_CONNECTING: QP is in RTR state; waiting for RTU. + * @CH_LIVE: QP is in RTS state. + * @CH_DISCONNECTING: DREQ has been sent and waiting for DREP or DREQ has + * been received. + * @CH_DRAINING: DREP has been received or waiting for DREP timed out + * and last work request has been queued. + * @CH_DISCONNECTED: Last completion has been received. */ enum rdma_ch_state { CH_CONNECTING, CH_LIVE, CH_DISCONNECTING, CH_DRAINING, - CH_RELEASING + CH_DISCONNECTED, }; /** @@ -267,6 +267,8 @@ struct srpt_rdma_ch { struct ib_cm_id *cm_id; struct ib_qp *qp; struct ib_cq *cq; + struct ib_cqe zw_cqe; + struct kref kref; int rq_size; u32 rsp_size; atomic_t sq_wr_avail; @@ -286,7 +288,6 @@ struct srpt_rdma_ch { u8 sess_name[36]; struct work_struct release_work; struct completion *release_done; - bool in_shutdown; }; /** @@ -343,7 +344,7 @@ struct srpt_port { * @ioctx_ring: Per-HCA SRQ. * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. * @ch_releaseQ: Enables waiting for removal from rch_list. - * @spinlock: Protects rch_list and tpg. + * @mutex: Protects rch_list. * @port: Information about the ports owned by this HCA. * @event_handler: Per-HCA asynchronous IB event handler. * @list: Node in srpt_dev_list. @@ -357,18 +358,10 @@ struct srpt_device { struct srpt_recv_ioctx **ioctx_ring; struct list_head rch_list; wait_queue_head_t ch_releaseQ; - spinlock_t spinlock; + struct mutex mutex; struct srpt_port port[2]; struct ib_event_handler event_handler; struct list_head list; }; -/** - * struct srpt_node_acl - Per-initiator ACL data (managed via configfs). - * @nacl: Target core node ACL information. - */ -struct srpt_node_acl { - struct se_node_acl nacl; -}; - #endif /* IB_SRPT_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h index a072d341e205..1d2d1da40c80 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -1021,6 +1021,8 @@ struct cpl_l2t_write_req { #define L2T_W_NOREPLY_V(x) ((x) << L2T_W_NOREPLY_S) #define L2T_W_NOREPLY_F L2T_W_NOREPLY_V(1U) +#define CPL_L2T_VLAN_NONE 0xfff + struct cpl_l2t_write_rpl { union opcode_tid ot; u8 status; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index a32de30ea663..c8661c77b4e3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -561,6 +561,7 @@ enum fw_flowc_mnem { FW_FLOWC_MNEM_SNDBUF, FW_FLOWC_MNEM_MSS, FW_FLOWC_MNEM_TXDATAPLEN_MAX, + FW_FLOWC_MNEM_SCHEDCLASS = 11, }; struct fw_flowc_mnemval { diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index d66c690a8597..e97094598b2d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -157,7 +157,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [29] = "802.1ad offload support", [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", - [33] = "RoCEv2 support" + [33] = "RoCEv2 support", + [34] = "DMFS Sniffer support (UC & MC)" }; int i; @@ -810,6 +811,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN; dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f; + if (field & 0x20) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER; MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET); if (field & 0x80) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON; diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index 1d4e2e054647..42d8de892bfe 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -752,8 +752,10 @@ static const u8 __promisc_mode[] = { [MLX4_FS_REGULAR] = 0x0, [MLX4_FS_ALL_DEFAULT] = 0x1, [MLX4_FS_MC_DEFAULT] = 0x3, - [MLX4_FS_UC_SNIFFER] = 0x4, - [MLX4_FS_MC_SNIFFER] = 0x5, + [MLX4_FS_MIRROR_RX_PORT] = 0x4, + [MLX4_FS_MIRROR_SX_PORT] = 0x5, + [MLX4_FS_UC_SNIFFER] = 0x6, + [MLX4_FS_MC_SNIFFER] = 0x7, }; int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 037fc4cdf5af..ebb4036b98e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -407,6 +407,12 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, const char *mlx5_command_str(int command) { switch (command) { + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + return "QUERY_HCA_VPORT_CONTEXT"; + + case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT: + return "MODIFY_HCA_VPORT_CONTEXT"; + case MLX5_CMD_OP_QUERY_HCA_CAP: return "QUERY_HCA_CAP"; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index aac071a7e830..6ef0bfded2ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -515,7 +515,7 @@ struct mlx5e_priv { struct mlx5_uar cq_uar; u32 pdn; u32 tdn; - struct mlx5_core_mr mr; + struct mlx5_core_mkey mkey; struct mlx5e_rq drop_rq; struct mlx5e_channel **channel; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d4e1c3045200..43a148939557 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -982,7 +982,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; - c->mkey_be = cpu_to_be32(priv->mr.key); + c->mkey_be = cpu_to_be32(priv->mkey.key); c->num_tc = priv->params.num_tc; mlx5e_build_channeltc_to_txq_map(priv, ix); @@ -2194,7 +2194,7 @@ static void mlx5e_build_netdev(struct net_device *netdev) } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, - struct mlx5_core_mr *mr) + struct mlx5_core_mkey *mkey) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_create_mkey_mbox_in *in; @@ -2210,7 +2210,7 @@ static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - err = mlx5_core_create_mkey(mdev, mr, in, sizeof(*in), NULL, NULL, + err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL, NULL); kvfree(in); @@ -2259,7 +2259,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_dealloc_pd; } - err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); + err = mlx5e_create_mkey(priv, priv->pdn, &priv->mkey); if (err) { mlx5_core_err(mdev, "create mkey failed, %d\n", err); goto err_dealloc_transport_domain; @@ -2333,7 +2333,7 @@ err_destroy_tises: mlx5e_destroy_tises(priv); err_destroy_mkey: - mlx5_core_destroy_mkey(mdev, &priv->mr); + mlx5_core_destroy_mkey(mdev, &priv->mkey); err_dealloc_transport_domain: mlx5_core_dealloc_transport_domain(mdev, priv->tdn); @@ -2367,7 +2367,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5e_destroy_rqt(priv, MLX5E_INDIRECTION_RQT); mlx5e_close_drop_rq(priv); mlx5e_destroy_tises(priv); - mlx5_core_destroy_mkey(priv->mdev, &priv->mr); + mlx5_core_destroy_mkey(priv->mdev, &priv->mkey); mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 6f68dba8d7ed..bf3446794bd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -77,6 +77,9 @@ #define KERNEL_NUM_PRIOS 1 #define KENREL_MIN_LEVEL 2 +#define ANCHOR_MAX_FT 1 +#define ANCHOR_NUM_PRIOS 1 +#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) struct node_caps { size_t arr_sz; long *caps; @@ -92,7 +95,7 @@ static struct init_tree_node { int max_ft; } root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 3, + .ar_size = 4, .children = (struct init_tree_node[]) { ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), @@ -108,6 +111,8 @@ static struct init_tree_node { FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), FS_CAP(flow_table_properties_nic_receive.flow_table_modify)), ADD_NS(ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, LEFTOVERS_MAX_FT))), + ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {}, + ADD_NS(ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, ANCHOR_MAX_FT))), } }; @@ -196,8 +201,10 @@ static void tree_put_node(struct fs_node *node) static int tree_remove_node(struct fs_node *node) { - if (atomic_read(&node->refcount) > 1) - return -EPERM; + if (atomic_read(&node->refcount) > 1) { + atomic_dec(&node->refcount); + return -EEXIST; + } tree_put_node(node); return 0; } @@ -360,6 +367,11 @@ static void del_rule(struct fs_node *node) memcpy(match_value, fte->val, sizeof(fte->val)); fs_get_obj(ft, fg->node.parent); list_del(&rule->node.list); + if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); + mutex_unlock(&rule->dest_attr.ft->lock); + } fte->dests_size--; if (fte->dests_size) { err = mlx5_cmd_update_fte(dev, ft, @@ -465,6 +477,8 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte, ft->node.type = FS_TYPE_FLOW_TABLE; ft->type = table_type; ft->max_fte = max_fte; + INIT_LIST_HEAD(&ft->fwd_rules); + mutex_init(&ft->lock); return ft; } @@ -601,9 +615,63 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio return err; } +static int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int err = 0; + + fs_get_obj(fte, rule->node.parent); + if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return -EINVAL; + lock_ref_node(&fte->node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + err = mlx5_cmd_update_fte(get_dev(&ft->node), + ft, fg->id, fte); + unlock_ref_node(&fte->node); + + return err; +} + +/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft */ +static int connect_fwd_rules(struct mlx5_core_dev *dev, + struct mlx5_flow_table *new_next_ft, + struct mlx5_flow_table *old_next_ft) +{ + struct mlx5_flow_destination dest; + struct mlx5_flow_rule *iter; + int err = 0; + + /* new_next_ft and old_next_ft could be NULL only + * when we create/destroy the anchor flow table. + */ + if (!new_next_ft || !old_next_ft) + return 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = new_next_ft; + + mutex_lock(&old_next_ft->lock); + list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); + mutex_unlock(&old_next_ft->lock); + list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + err = mlx5_modify_rule_destination(iter, &dest); + if (err) + pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", + new_next_ft->id); + } + return 0; +} + static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, struct fs_prio *prio) { + struct mlx5_flow_table *next_ft; int err = 0; /* Connect_prev_fts and update_root_ft_create are mutually exclusive */ @@ -612,6 +680,11 @@ static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table err = connect_prev_fts(dev, ft, prio); if (err) return err; + + next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, ft, next_ft); + if (err) + return err; } if (MLX5_CAP_FLOWTABLE(dev, @@ -762,6 +835,7 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest) if (!rule) return NULL; + INIT_LIST_HEAD(&rule->next_ft); rule->node.type = FS_TYPE_FLOW_DEST; memcpy(&rule->dest_attr, dest, sizeof(*dest)); @@ -782,9 +856,14 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte, return ERR_PTR(-ENOMEM); fs_get_obj(ft, fg->node.parent); - /* Add dest to dests list- added as first element after the head */ + /* Add dest to dests list- we need flow tables to be in the + * end of the list for forward to next prio rules. + */ tree_init_node(&rule->node, 1, del_rule); - list_add_tail(&rule->node.list, &fte->node.children); + if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + list_add(&rule->node.list, &fte->node.children); + else + list_add_tail(&rule->node.list, &fte->node.children); fte->dests_size++; if (fte->dests_size == 1) err = mlx5_cmd_create_fte(get_dev(&ft->node), @@ -903,6 +982,25 @@ out: return fg; } +static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + list_for_each_entry(rule, &fte->node.children, node.list) { + if (rule->dest_attr.type == dest->type) { + if ((dest->type == MLX5_FLOW_DESTINATION_TYPE_VPORT && + dest->vport_num == rule->dest_attr.vport_num) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + dest->ft == rule->dest_attr.ft) || + (dest->type == MLX5_FLOW_DESTINATION_TYPE_TIR && + dest->tir_num == rule->dest_attr.tir_num)) + return rule; + } + } + return NULL; +} + static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, u32 *match_value, u8 action, @@ -919,6 +1017,13 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg, nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD); if (compare_match_value(&fg->mask, match_value, &fte->val) && action == fte->action && flow_tag == fte->flow_tag) { + rule = find_flow_rule(fte, dest); + if (rule) { + atomic_inc(&rule->node.refcount); + unlock_ref_node(&fte->node); + unlock_ref_node(&fg->node); + return rule; + } rule = add_rule_fte(fte, fg, dest); unlock_ref_node(&fte->node); if (IS_ERR(rule)) @@ -984,14 +1089,14 @@ static struct mlx5_flow_rule *add_rule_to_auto_fg(struct mlx5_flow_table *ft, return rule; } -struct mlx5_flow_rule * -mlx5_add_flow_rule(struct mlx5_flow_table *ft, - u8 match_criteria_enable, - u32 *match_criteria, - u32 *match_value, - u32 action, - u32 flow_tag, - struct mlx5_flow_destination *dest) +static struct mlx5_flow_rule * +_mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) { struct mlx5_flow_group *g; struct mlx5_flow_rule *rule; @@ -1014,6 +1119,63 @@ unlock: unlock_ref_node(&ft->node); return rule; } + +static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) +{ + return ((ft->type == FS_FT_NIC_RX) && + (MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs))); +} + +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_flow_destination gen_dest; + struct mlx5_flow_table *next_ft = NULL; + struct mlx5_flow_rule *rule = NULL; + u32 sw_action = action; + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + if (action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!fwd_next_prio_supported(ft)) + return ERR_PTR(-EOPNOTSUPP); + if (dest) + return ERR_PTR(-EINVAL); + mutex_lock(&root->chain_lock); + next_ft = find_next_chained_ft(prio); + if (next_ft) { + gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + gen_dest.ft = next_ft; + dest = &gen_dest; + action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + mutex_unlock(&root->chain_lock); + return ERR_PTR(-EOPNOTSUPP); + } + } + + rule = _mlx5_add_flow_rule(ft, match_criteria_enable, match_criteria, + match_value, action, flow_tag, dest); + + if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { + if (!IS_ERR_OR_NULL(rule) && + (list_empty(&rule->next_ft))) { + mutex_lock(&next_ft->lock); + list_add(&rule->next_ft, &next_ft->fwd_rules); + mutex_unlock(&next_ft->lock); + rule->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + mutex_unlock(&root->chain_lock); + } + return rule; +} EXPORT_SYMBOL(mlx5_add_flow_rule); void mlx5_del_flow_rule(struct mlx5_flow_rule *rule) @@ -1077,6 +1239,10 @@ static int disconnect_flow_table(struct mlx5_flow_table *ft) return 0; next_ft = find_next_chained_ft(prio); + err = connect_fwd_rules(dev, next_ft, ft); + if (err) + return err; + err = connect_prev_fts(dev, next_ft, prio); if (err) mlx5_core_warn(dev, "Failed to disconnect flow table %d\n", @@ -1126,6 +1292,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, case MLX5_FLOW_NAMESPACE_BYPASS: case MLX5_FLOW_NAMESPACE_KERNEL: case MLX5_FLOW_NAMESPACE_LEFTOVERS: + case MLX5_FLOW_NAMESPACE_ANCHOR: prio = type; break; case MLX5_FLOW_NAMESPACE_FDB: @@ -1351,6 +1518,25 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) } } +#define ANCHOR_PRIO 0 +#define ANCHOR_SIZE 1 +static int create_anchor_flow_table(struct mlx5_core_dev + *dev) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table *ft; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ANCHOR); + if (!ns) + return -EINVAL; + ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE); + if (IS_ERR(ft)) { + mlx5_core_err(dev, "Failed to create last anchor flow table"); + return PTR_ERR(ft); + } + return 0; +} + static int init_root_ns(struct mlx5_core_dev *dev) { @@ -1363,6 +1549,9 @@ static int init_root_ns(struct mlx5_core_dev *dev) set_prio_attrs(dev->priv.root_ns); + if (create_anchor_flow_table(dev)) + goto cleanup; + return 0; cleanup: @@ -1392,6 +1581,15 @@ static void cleanup_single_prio_root_ns(struct mlx5_core_dev *dev, root_ns = NULL; } +static void destroy_flow_tables(struct fs_prio *prio) +{ + struct mlx5_flow_table *iter; + struct mlx5_flow_table *tmp; + + fs_for_each_ft_safe(iter, tmp, prio) + mlx5_destroy_flow_table(iter); +} + static void cleanup_root_ns(struct mlx5_core_dev *dev) { struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; @@ -1420,6 +1618,7 @@ static void cleanup_root_ns(struct mlx5_core_dev *dev) list); fs_get_obj(obj_iter_prio2, iter_prio2); + destroy_flow_tables(obj_iter_prio2); if (tree_remove_node(iter_prio2)) { mlx5_core_warn(dev, "Priority %d wasn't destroyed, refcount > 1\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 00245fd7e4bc..f37a6248a27b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -68,6 +68,11 @@ struct fs_node { struct mlx5_flow_rule { struct fs_node node; struct mlx5_flow_destination dest_attr; + /* next_ft should be accessed under chain_lock and only of + * destination type is FWD_NEXT_fT. + */ + struct list_head next_ft; + u32 sw_action; }; /* Type of children is mlx5_flow_group */ @@ -82,6 +87,10 @@ struct mlx5_flow_table { unsigned int required_groups; unsigned int num_groups; } autogroup; + /* Protect fwd_rules */ + struct mutex lock; + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; }; /* Type of children is mlx5_flow_rule */ @@ -142,6 +151,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_list_for_each_entry(pos, root) \ list_for_each_entry(pos, root, node.list) +#define fs_list_for_each_entry_safe(pos, tmp, root) \ + list_for_each_entry_safe(pos, tmp, root, node.list) + #define fs_for_each_ns_or_ft_reverse(pos, prio) \ list_for_each_entry_reverse(pos, &(prio)->node.children, list) @@ -157,6 +169,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_for_each_ft(pos, prio) \ fs_list_for_each_entry(pos, &(prio)->node.children) +#define fs_for_each_ft_safe(pos, tmp, prio) \ + fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children) + #define fs_for_each_fg(pos, ft) \ fs_list_for_each_entry(pos, &(ft)->node.children) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index aa1ab4702385..75c7ae6a5cc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -98,88 +98,55 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) { int err; - err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - - err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); if (err) return err; if (MLX5_CAP_GEN(dev, eth_net_offloads)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS); if (err) return err; } if (MLX5_CAP_GEN(dev, pg)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ODP, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ODP, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); if (err) return err; } if (MLX5_CAP_GEN(dev, atomic)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); if (err) return err; } if (MLX5_CAP_GEN(dev, roce)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); if (err) return err; } if (MLX5_CAP_GEN(dev, nic_flow_table)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE); if (err) return err; } if (MLX5_CAP_GEN(dev, vport_group_manager) && MLX5_CAP_GEN(dev, eswitch_flow_table)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE, - HCA_CAP_OPMOD_GET_CUR); - if (err) - return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE, - HCA_CAP_OPMOD_GET_MAX); + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE); if (err) return err; } if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH, - HCA_CAP_OPMOD_GET_CUR); + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH); if (err) return err; - err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH, - HCA_CAP_OPMOD_GET_MAX); + } + + if (MLX5_CAP_GEN(dev, vector_calc)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_VECTOR_CALC); if (err) return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 1545a944c309..f2354bc0ec19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -341,8 +341,9 @@ static u16 to_fw_pkey_sz(u32 size) } } -int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type, - enum mlx5_cap_mode cap_mode) +static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, + enum mlx5_cap_type cap_type, + enum mlx5_cap_mode cap_mode) { u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); @@ -392,6 +393,16 @@ query_ex: return err; } +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) +{ + int ret; + + ret = mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_CUR); + if (ret) + return ret; + return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX); +} + static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) { u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; @@ -419,8 +430,7 @@ static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) int err; if (MLX5_CAP_GEN(dev, atomic)) { - err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC, - HCA_CAP_OPMOD_GET_CUR); + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); if (err) return err; } else { @@ -462,11 +472,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) if (!set_ctx) goto query_ex; - err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_MAX); - if (err) - goto query_ex; - - err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_CUR); + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); if (err) goto query_ex; @@ -1117,7 +1123,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_cq_table(dev); mlx5_init_qp_table(dev); mlx5_init_srq_table(dev); - mlx5_init_mr_table(dev); + mlx5_init_mkey_table(dev); err = mlx5_init_fs(dev); if (err) { @@ -1164,7 +1170,7 @@ err_sriov: err_reg_dev: mlx5_cleanup_fs(dev); err_fs: - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); @@ -1237,7 +1243,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) #endif mlx5_cleanup_fs(dev); - mlx5_cleanup_mr_table(dev); + mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 6fa22b51e460..77a7293921d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,25 +36,26 @@ #include <linux/mlx5/cmd.h> #include "mlx5_core.h" -void mlx5_init_mr_table(struct mlx5_core_dev *dev) +void mlx5_init_mkey_table(struct mlx5_core_dev *dev) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; memset(table, 0, sizeof(*table)); rwlock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); } -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev) +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) { } -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_create_mkey_mbox_out lout; int err; u8 key; @@ -83,34 +84,35 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, return mlx5_cmd_status_to_err(&lout.hdr); } - mr->iova = be64_to_cpu(in->seg.start_addr); - mr->size = be64_to_cpu(in->seg.len); - mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; - mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; + mkey->iova = be64_to_cpu(in->seg.start_addr); + mkey->size = be64_to_cpu(in->seg.len); + mkey->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mkey->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff; mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", - be32_to_cpu(lout.mkey), key, mr->key); + be32_to_cpu(lout.mkey), key, mkey->key); - /* connect to MR tree */ + /* connect to mkey tree */ write_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->key), mr); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey); write_unlock_irq(&table->lock); if (err) { - mlx5_core_warn(dev, "failed radix tree insert of mr 0x%x, %d\n", - mlx5_base_mkey(mr->key), err); - mlx5_core_destroy_mkey(dev, mr); + mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", + mlx5_base_mkey(mkey->key), err); + mlx5_core_destroy_mkey(dev, mkey); } return err; } EXPORT_SYMBOL(mlx5_core_create_mkey); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey) { - struct mlx5_mr_table *table = &dev->priv.mr_table; + struct mlx5_mkey_table *table = &dev->priv.mkey_table; struct mlx5_destroy_mkey_mbox_in in; struct mlx5_destroy_mkey_mbox_out out; - struct mlx5_core_mr *deleted_mr; + struct mlx5_core_mkey *deleted_mkey; unsigned long flags; int err; @@ -118,16 +120,16 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) memset(&out, 0, sizeof(out)); write_lock_irqsave(&table->lock, flags); - deleted_mr = radix_tree_delete(&table->tree, mlx5_base_mkey(mr->key)); + deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key)); write_unlock_irqrestore(&table->lock, flags); - if (!deleted_mr) { - mlx5_core_warn(dev, "failed radix tree delete of mr 0x%x\n", - mlx5_base_mkey(mr->key)); + if (!deleted_mkey) { + mlx5_core_warn(dev, "failed radix tree delete of mkey 0x%x\n", + mlx5_base_mkey(mkey->key)); return -ENOENT; } in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); if (err) return err; @@ -139,7 +141,7 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr) } EXPORT_SYMBOL(mlx5_core_destroy_mkey); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen) { struct mlx5_query_mkey_mbox_in in; @@ -149,7 +151,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, memset(out, 0, outlen); in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY); - in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key)); + in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mkey->key)); err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); if (err) return err; @@ -161,7 +163,7 @@ int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, } EXPORT_SYMBOL(mlx5_core_query_mkey); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey) { struct mlx5_query_special_ctxs_mbox_in in; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index a87e773e93f3..5635ce7ad693 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -324,6 +324,29 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz) +{ + u32 *in; + int err; + + in = mlx5_vzalloc(sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(ppcnt_reg, in, local_port, port_num); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + err = mlx5_core_access_reg(dev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt); + int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) { u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index c7398b95aecd..bd518405859e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -850,3 +850,111 @@ int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); } EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); + +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + int vf, u8 port_num, void *out, + size_t out_sz) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = mlx5_vzalloc(in_sz); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + MLX5_SET(query_vport_counter_in, in, vport_number, vf + 1); + } else { + err = -EPERM; + goto free; + } + } + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_vport_counter_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto free; + err = mlx5_cmd_status_to_err_v2(out); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter); + +int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + int vf, + struct mlx5_hca_vport_context *req) +{ + int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in); + u8 out[MLX5_ST_SZ_BYTES(modify_hca_vport_context_out)]; + int is_group_manager; + void *in; + int err; + void *ctx; + + mlx5_core_dbg(dev, "vf %d\n", vf); + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + memset(out, 0, sizeof(out)); + MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(modify_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(modify_hca_vport_context_in, in, vport_number, vf); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) > 1) + MLX5_SET(modify_hca_vport_context_in, in, port_num, port_num); + + ctx = MLX5_ADDR_OF(modify_hca_vport_context_in, in, hca_vport_context); + MLX5_SET(hca_vport_context, ctx, field_select, req->field_select); + MLX5_SET(hca_vport_context, ctx, sm_virt_aware, req->sm_virt_aware); + MLX5_SET(hca_vport_context, ctx, has_smi, req->has_smi); + MLX5_SET(hca_vport_context, ctx, has_raw, req->has_raw); + MLX5_SET(hca_vport_context, ctx, vport_state_policy, req->policy); + MLX5_SET(hca_vport_context, ctx, port_physical_state, req->phys_state); + MLX5_SET(hca_vport_context, ctx, vport_state, req->vport_state); + MLX5_SET64(hca_vport_context, ctx, port_guid, req->port_guid); + MLX5_SET64(hca_vport_context, ctx, node_guid, req->node_guid); + MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1); + MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, req->cap_mask1_perm); + MLX5_SET(hca_vport_context, ctx, cap_mask2, req->cap_mask2); + MLX5_SET(hca_vport_context, ctx, cap_mask2_field_select, req->cap_mask2_perm); + MLX5_SET(hca_vport_context, ctx, lid, req->lid); + MLX5_SET(hca_vport_context, ctx, init_type_reply, req->init_type_reply); + MLX5_SET(hca_vport_context, ctx, lmc, req->lmc); + MLX5_SET(hca_vport_context, ctx, subnet_timeout, req->subnet_timeout); + MLX5_SET(hca_vport_context, ctx, sm_lid, req->sm_lid); + MLX5_SET(hca_vport_context, ctx, sm_sl, req->sm_sl); + MLX5_SET(hca_vport_context, ctx, qkey_violation_counter, req->qkey_violation_counter); + MLX5_SET(hca_vport_context, ctx, pkey_violation_counter, req->pkey_violation_counter); + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + if (err) + goto ex; + + err = mlx5_cmd_status_to_err_v2(out); + +ex: + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context); diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig index fd25078ee923..3e668d852f03 100644 --- a/drivers/staging/rdma/hfi1/Kconfig +++ b/drivers/staging/rdma/hfi1/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_HFI1 tristate "Intel OPA Gen1 support" - depends on X86_64 + depends on X86_64 && INFINIBAND_RDMAVT + select MMU_NOTIFIER default m ---help--- This is a low-level driver for Intel OPA Gen1 adapter. @@ -25,13 +26,3 @@ config SDMA_VERBOSITY ---help--- This is a configuration flag to enable verbose SDMA debug -config PRESCAN_RXQ - bool "Enable prescanning of the RX queue for ECNs" - depends on INFINIBAND_HFI1 - default n - ---help--- - This option toggles the prescanning of the receive queue for - Explicit Congestion Notifications. If an ECN is detected, it - is processed as quickly as possible, the ECN is toggled off. - After the prescanning step, the receive queue is processed as - usual. diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile index 68c5a315e557..8dc59382ee96 100644 --- a/drivers/staging/rdma/hfi1/Makefile +++ b/drivers/staging/rdma/hfi1/Makefile @@ -7,10 +7,12 @@ # obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o -hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o efivar.o eprom.o file_ops.o firmware.o \ - init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \ - qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \ - uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o +hfi1-y := affinity.o chip.o device.o diag.o driver.o efivar.o \ + eprom.o file_ops.o firmware.o \ + init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ + uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ + verbs_txreq.o hfi1-$(CONFIG_DEBUG_FS) += debugfs.o CFLAGS_trace.o = -I$(src) diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c new file mode 100644 index 000000000000..2cb8ca77f876 --- /dev/null +++ b/drivers/staging/rdma/hfi1/affinity.c @@ -0,0 +1,430 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/module.h> + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" +#include "trace.h" + +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_affinity { + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpu_mask_set proc; + /* spin lock to protect affinity struct */ + spinlock_t lock; +}; + +/* Name of IRQ types, indexed by enum irq_type */ +static const char * const irq_type_names[] = { + "SDMA", + "RCVCTXT", + "GENERAL", + "OTHER", +}; + +static inline void init_cpu_mask_set(struct cpu_mask_set *set) +{ + cpumask_clear(&set->mask); + cpumask_clear(&set->used); + set->gen = 0; +} + +/* + * Interrupt affinity. + * + * non-rcv avail gets a default mask that + * starts as possible cpus with threads reset + * and each rcv avail reset. + * + * rcv avail gets node relative 1 wrapping back + * to the node relative 1 as necessary. + * + */ +int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +{ + int node = pcibus_to_node(dd->pcidev->bus); + struct hfi1_affinity *info; + const struct cpumask *local_mask; + int curr_cpu, possible, i, ht; + + if (node < 0) + node = numa_node_id(); + dd->node = node; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + spin_lock_init(&info->lock); + + init_cpu_mask_set(&info->def_intr); + init_cpu_mask_set(&info->rcv_intr); + init_cpu_mask_set(&info->proc); + + local_mask = cpumask_of_node(dd->node); + if (cpumask_first(local_mask) >= nr_cpu_ids) + local_mask = topology_core_cpumask(0); + /* use local mask as default */ + cpumask_copy(&info->def_intr.mask, local_mask); + /* + * Remove HT cores from the default mask. Do this in two steps below. + */ + possible = cpumask_weight(&info->def_intr.mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&info->def_intr.mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&info->def_intr.mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + } + + /* fill in the receive list */ + possible = cpumask_weight(&info->def_intr.mask); + curr_cpu = cpumask_first(&info->def_intr.mask); + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); + } else { + /* + * Retain the first CPU in the default list for the control + * context. + */ + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; i < dd->n_krcv_queues - 1; i++) { + cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); + cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } + } + + cpumask_copy(&info->proc.mask, cpu_online_mask); + dd->affinity = info; + return 0; +} + +void hfi1_dev_affinity_free(struct hfi1_devdata *dd) +{ + kfree(dd->affinity); +} + +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) +{ + int ret; + cpumask_var_t diff; + struct cpu_mask_set *set; + struct sdma_engine *sde = NULL; + struct hfi1_ctxtdata *rcd = NULL; + char extra[64]; + int cpu = -1; + + extra[0] = '\0'; + cpumask_clear(&msix->mask); + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + return -ENOMEM; + + switch (msix->type) { + case IRQ_SDMA: + sde = (struct sdma_engine *)msix->arg; + scnprintf(extra, 64, "engine %u", sde->this_idx); + /* fall through */ + case IRQ_GENERAL: + set = &dd->affinity->def_intr; + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + if (rcd->ctxt == HFI1_CTRL_CTXT) { + set = &dd->affinity->def_intr; + cpu = cpumask_first(&set->mask); + } else { + set = &dd->affinity->rcv_intr; + } + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; + default: + dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); + return -EINVAL; + } + + /* + * The control receive context is placed on a particular CPU, which + * is set above. Skip accounting for it. Everything else finds its + * CPU here. + */ + if (cpu == -1) { + spin_lock(&dd->affinity->lock); + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } + cpumask_andnot(diff, &set->mask, &set->used); + cpu = cpumask_first(diff); + cpumask_set_cpu(cpu, &set->used); + spin_unlock(&dd->affinity->lock); + } + + switch (msix->type) { + case IRQ_SDMA: + sde->cpu = cpu; + break; + case IRQ_GENERAL: + case IRQ_RCVCTXT: + case IRQ_OTHER: + break; + } + + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n", + msix->msix.vector, irq_type_names[msix->type], + extra, cpu); + irq_set_affinity_hint(msix->msix.vector, &msix->mask); + + free_cpumask_var(diff); + return 0; +} + +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + struct cpu_mask_set *set = NULL; + struct hfi1_ctxtdata *rcd; + + switch (msix->type) { + case IRQ_SDMA: + case IRQ_GENERAL: + set = &dd->affinity->def_intr; + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + /* only do accounting for non control contexts */ + if (rcd->ctxt != HFI1_CTRL_CTXT) + set = &dd->affinity->rcv_intr; + break; + default: + return; + } + + if (set) { + spin_lock(&dd->affinity->lock); + cpumask_andnot(&set->used, &set->used, &msix->mask); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + spin_unlock(&dd->affinity->lock); + } + + irq_set_affinity_hint(msix->msix.vector, NULL); + cpumask_clear(&msix->mask); +} + +int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) +{ + int cpu = -1, ret; + cpumask_var_t diff, mask, intrs; + const struct cpumask *node_mask, + *proc_mask = tsk_cpus_allowed(current); + struct cpu_mask_set *set = &dd->affinity->proc; + char buf[1024]; + + /* + * check whether process/context affinity has already + * been set + */ + if (cpumask_weight(proc_mask) == 1) { + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask)); + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %s", + current->pid, current->comm, buf); + /* + * Mark the pre-set CPU as used. This is atomic so we don't + * need the lock + */ + cpu = cpumask_first(proc_mask); + cpumask_set_cpu(cpu, &set->used); + goto done; + } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(proc_mask)); + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %s", + current->pid, current->comm, buf); + goto done; + } + + /* + * The process does not have a preset CPU affinity so find one to + * recommend. We prefer CPUs on the same NUMA as the device. + */ + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + goto done; + ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + if (!ret) + goto free_diff; + ret = zalloc_cpumask_var(&intrs, GFP_KERNEL); + if (!ret) + goto free_mask; + + spin_lock(&dd->affinity->lock); + /* + * If we've used all available CPUs, clear the mask and start + * overloading. + */ + if (cpumask_equal(&set->mask, &set->used)) { + set->gen++; + cpumask_clear(&set->used); + } + + /* CPUs used by interrupt handlers */ + cpumask_copy(intrs, (dd->affinity->def_intr.gen ? + &dd->affinity->def_intr.mask : + &dd->affinity->def_intr.used)); + cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ? + &dd->affinity->rcv_intr.mask : + &dd->affinity->rcv_intr.used)); + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(intrs)); + hfi1_cdbg(PROC, "CPUs used by interrupts: %s", buf); + + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node + */ + if (node == -1) + node = dd->node; + node_mask = cpumask_of_node(node); + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(node_mask)); + hfi1_cdbg(PROC, "device on NUMA %u, CPUs %s", node, buf); + + /* diff will hold all unused cpus */ + cpumask_andnot(diff, &set->mask, &set->used); + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(diff)); + hfi1_cdbg(PROC, "unused CPUs (all) %s", buf); + + /* get cpumask of available CPUs on preferred NUMA */ + cpumask_and(mask, diff, node_mask); + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask)); + hfi1_cdbg(PROC, "available cpus on NUMA %s", buf); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. + */ + cpumask_andnot(diff, mask, intrs); + if (!cpumask_empty(diff)) + cpumask_copy(mask, diff); + + /* + * if we don't have a cpu on the preferred NUMA, get + * the list of the remaining available CPUs + */ + if (cpumask_empty(mask)) { + cpumask_andnot(diff, &set->mask, &set->used); + cpumask_andnot(mask, diff, node_mask); + } + scnprintf(buf, 1024, "%*pbl", cpumask_pr_args(mask)); + hfi1_cdbg(PROC, "possible CPUs for process %s", buf); + + cpu = cpumask_first(mask); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -1; + else + cpumask_set_cpu(cpu, &set->used); + spin_unlock(&dd->affinity->lock); + + free_cpumask_var(intrs); +free_mask: + free_cpumask_var(mask); +free_diff: + free_cpumask_var(diff); +done: + return cpu; +} + +void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = &dd->affinity->proc; + + if (cpu < 0) + return; + spin_lock(&dd->affinity->lock); + cpumask_clear_cpu(cpu, &set->used); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + spin_unlock(&dd->affinity->lock); +} + diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h new file mode 100644 index 000000000000..b287e4963024 --- /dev/null +++ b/drivers/staging/rdma/hfi1/affinity.h @@ -0,0 +1,91 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_AFFINITY_H +#define _HFI1_AFFINITY_H + +#include "hfi.h" + +enum irq_type { + IRQ_SDMA, + IRQ_RCVCTXT, + IRQ_GENERAL, + IRQ_OTHER +}; + +/* Can be used for both memory and cpu */ +enum affinity_flags { + AFF_AUTO, + AFF_NUMA_LOCAL, + AFF_DEV_LOCAL, + AFF_IRQ_LOCAL +}; + +struct hfi1_msix_entry; + +/* Initialize driver affinity data */ +int hfi1_dev_affinity_init(struct hfi1_devdata *); +/* Free driver affinity data */ +void hfi1_dev_affinity_free(struct hfi1_devdata *); +/* + * Set IRQ affinity to a CPU. The function will determine the + * CPU and set the affinity to it. + */ +int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +/* + * Remove the IRQ's CPU affinity. This function also updates + * any internal CPU tracking data + */ +void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +/* + * Determine a CPU affinity for a user process, if the process does not + * have an affinity set yet. + */ +int hfi1_get_proc_affinity(struct hfi1_devdata *, int); +/* Release a CPU used by a user process. */ +void hfi1_put_proc_affinity(struct hfi1_devdata *, int); + +#endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/staging/rdma/hfi1/aspm.h b/drivers/staging/rdma/hfi1/aspm.h new file mode 100644 index 000000000000..0d58fe3b49b5 --- /dev/null +++ b/drivers/staging/rdma/hfi1/aspm.h @@ -0,0 +1,309 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _ASPM_H +#define _ASPM_H + +#include "hfi.h" + +extern uint aspm_mode; + +enum aspm_mode { + ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */ + ASPM_MODE_ENABLED = 1, /* ASPM always enabled, power saving mode */ + ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */ +}; + +/* Time after which the timer interrupt will re-enable ASPM */ +#define ASPM_TIMER_MS 1000 +/* Time for which interrupts are ignored after a timer has been scheduled */ +#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) +/* Two interrupts within this time trigger ASPM disable */ +#define ASPM_TRIGGER_MS 1 +#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) +#define ASPM_L1_SUPPORTED(reg) \ + (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) + +static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u32 up, dn; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return false; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); + dn = ASPM_L1_SUPPORTED(dn); + + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); + up = ASPM_L1_SUPPORTED(up); + + /* ASPM works on A-step but is reported as not supported */ + return (!!dn || is_ax(dd)) && !!up; +} + +/* Set L1 entrance latency for slower entry to L1 */ +static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) +{ + u32 l1_ent_lat = 0x4u; + u32 reg32; + + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32); + reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; + reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); +} + +static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return; + + /* Enable ASPM L1 first in upstream component and then downstream */ + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); +} + +static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* Disable ASPM L1 first in downstream component and then upstream */ + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); + if (parent) + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); +} + +static inline void aspm_enable(struct hfi1_devdata *dd) +{ + if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || + !dd->aspm_supported) + return; + + aspm_hw_enable_l1(dd); + dd->aspm_enabled = true; +} + +static inline void aspm_disable(struct hfi1_devdata *dd) +{ + if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) + return; + + aspm_hw_disable_l1(dd); + dd->aspm_enabled = false; +} + +static inline void aspm_disable_inc(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + aspm_disable(dd); + atomic_inc(&dd->aspm_disabled_cnt); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +static inline void aspm_enable_dec(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) + aspm_enable(dd); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +/* ASPM processing for each receive context interrupt */ +static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + bool restart_timer; + bool close_interrupts; + unsigned long flags; + ktime_t now, prev; + + /* Quickest exit for minimum impact */ + if (!rcd->aspm_intr_supported) + return; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + /* PSM contexts are open */ + if (!rcd->aspm_intr_enable) + goto unlock; + + prev = rcd->aspm_ts_last_intr; + now = ktime_get(); + rcd->aspm_ts_last_intr = now; + + /* An interrupt pair close together in time */ + close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; + + /* Don't push out our timer till this much time has elapsed */ + restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > + ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; + restart_timer = restart_timer && close_interrupts; + + /* Disable ASPM and schedule timer */ + if (rcd->aspm_enabled && close_interrupts) { + aspm_disable_inc(rcd->dd); + rcd->aspm_enabled = false; + restart_timer = true; + } + + if (restart_timer) { + mod_timer(&rcd->aspm_timer, + jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); + rcd->aspm_ts_timer_sched = now; + } +unlock: + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Timer function for re-enabling ASPM in the absence of interrupt activity */ +static inline void aspm_ctx_timer_function(unsigned long data) +{ + struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)data; + unsigned long flags; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + aspm_enable_dec(rcd->dd); + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Disable interrupt processing for verbs contexts when PSM contexts are open */ +static inline void aspm_disable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + unsigned i; + + for (i = 0; i < dd->first_user_ctxt; i++) { + rcd = dd->rcd[i]; + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + } + + aspm_disable(dd); + atomic_set(&dd->aspm_disabled_cnt, 0); +} + +/* Re-enable interrupt processing for verbs contexts */ +static inline void aspm_enable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + unsigned i; + + aspm_enable(dd); + + if (aspm_mode != ASPM_MODE_DYNAMIC) + return; + + for (i = 0; i < dd->first_user_ctxt; i++) { + rcd = dd->rcd[i]; + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + } +} + +static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd) +{ + spin_lock_init(&rcd->aspm_lock); + setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function, + (unsigned long)rcd); + rcd->aspm_intr_supported = rcd->dd->aspm_supported && + aspm_mode == ASPM_MODE_DYNAMIC && + rcd->ctxt < rcd->dd->first_user_ctxt; +} + +static inline void aspm_init(struct hfi1_devdata *dd) +{ + unsigned i; + + spin_lock_init(&dd->aspm_lock); + dd->aspm_supported = aspm_hw_l1_supported(dd); + + for (i = 0; i < dd->first_user_ctxt; i++) + aspm_ctx_init(dd->rcd[i]); + + /* Start with ASPM disabled */ + aspm_hw_set_l1_ent_latency(dd); + dd->aspm_enabled = false; + aspm_hw_disable_l1(dd); + + /* Now turn on ASPM if configured */ + aspm_enable_all(dd); +} + +static inline void aspm_exit(struct hfi1_devdata *dd) +{ + aspm_disable_all(dd); + + /* Turn on ASPM on exit to conserve power */ + aspm_enable(dd); +} + +#endif /* _ASPM_H */ diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index bbe5ad85cec0..c29860c05ed4 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -64,6 +61,8 @@ #include "sdma.h" #include "eprom.h" #include "efivar.h" +#include "platform.h" +#include "aspm.h" #define NUM_IB_PORTS 1 @@ -420,10 +419,10 @@ static struct flag_table pio_err_status_flags[] = { SEC_SPC_FREEZE, SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK), /*23*/ FLAG_ENTRY("PioWriteQwValidParity", - SEC_WRITE_DROPPED|SEC_SPC_FREEZE, + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK), /*24*/ FLAG_ENTRY("PioBlockQwCountParity", - SEC_WRITE_DROPPED|SEC_SPC_FREEZE, + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK), /*25*/ FLAG_ENTRY("PioVlfVlLenParity", SEC_SPC_FREEZE, @@ -509,6 +508,12 @@ static struct flag_table sdma_err_status_flags[] = { | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \ | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK) +/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */ +#define PORT_DISCARD_EGRESS_ERRS \ + (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK) + /* * TXE Egress Error flags */ @@ -936,7 +941,7 @@ static struct flag_table dc8051_err_flags[] = { FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)), FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)), FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES", - D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), + D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)), }; @@ -950,7 +955,7 @@ static struct flag_table dc8051_info_err_flags[] = { FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), FLAG_ENTRY0("Serdes internal loopback failure", - FAILED_SERDES_INTERNAL_LOOPBACK), + FAILED_SERDES_INTERNAL_LOOPBACK), FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT), FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING), FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE), @@ -958,7 +963,8 @@ static struct flag_table dc8051_info_err_flags[] = { FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ), FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1), FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2), - FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT) + FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT), + FLAG_ENTRY0("Host Handshake Timeout", HOST_HANDSHAKE_TIMEOUT) }; /* @@ -978,7 +984,6 @@ static struct flag_table dc8051_info_host_msg_flags[] = { FLAG_ENTRY0("Link going down", 0x0100), }; - static u32 encoded_size(u32 size); static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate); static int set_physical_link_state(struct hfi1_devdata *dd, u64 state); @@ -1140,11 +1145,8 @@ struct cntr_entry { /* * accessor for stat element, context either dd or ppd */ - u64 (*rw_cntr)(const struct cntr_entry *, - void *context, - int vl, - int mode, - u64 data); + u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl, + int mode, u64 data); }; #define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0 @@ -1188,7 +1190,7 @@ CNTR_ELEM(#name, \ #define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx #define OVR_ELM(ctx) \ CNTR_ELEM("RcvHdrOvr" #ctx, \ - (RCV_HDR_OVFL_CNT + ctx*0x100), \ + (RCV_HDR_OVFL_CNT + ctx * 0x100), \ 0, CNTR_NORMAL, port_access_u64_csr) /* 32bit TXE */ @@ -1277,7 +1279,6 @@ static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, { u64 ret; - if (mode == CNTR_MODE_R) { ret = read_csr(dd, csr); } else if (mode == CNTR_MODE_W) { @@ -1294,17 +1295,65 @@ static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, /* Dev Access */ static u64 dev_access_u32_csr(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { struct hfi1_devdata *dd = context; + u64 csr = entry->csr; - if (vl != CNTR_INVALID_VL) - return 0; - return read_write_csr(dd, entry->csr, mode, data); + if (entry->flags & CNTR_SDMA) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 0x100 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + return read_write_csr(dd, csr, mode, data); +} + +static u64 access_sde_err_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].err_cnt; + return 0; +} + +static u64 access_sde_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].sdma_int_cnt; + return 0; +} + +static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].idle_int_cnt; + return 0; +} + +static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].progress_int_cnt; + return 0; } static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, - int vl, int mode, u64 data) + int vl, int mode, u64 data) { struct hfi1_devdata *dd = context; @@ -1325,7 +1374,7 @@ static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, } static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, - int vl, int mode, u64 data) + int vl, int mode, u64 data) { struct hfi1_devdata *dd = context; u32 csr = entry->csr; @@ -1349,7 +1398,7 @@ static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, /* Port Access */ static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, - int vl, int mode, u64 data) + int vl, int mode, u64 data) { struct hfi1_pportdata *ppd = context; @@ -1359,7 +1408,7 @@ static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, } static u64 port_access_u64_csr(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { struct hfi1_pportdata *ppd = context; u64 val; @@ -1399,7 +1448,7 @@ static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode, } static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, - int vl, int mode, u64 data) + int vl, int mode, u64 data) { struct hfi1_pportdata *ppd = context; @@ -1409,7 +1458,7 @@ static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, } static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context, - int vl, int mode, u64 data) + int vl, int mode, u64 data) { struct hfi1_pportdata *ppd = context; @@ -1430,18 +1479,25 @@ static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry, } static u64 access_sw_xmit_discards(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { - struct hfi1_pportdata *ppd = context; + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + u64 zero = 0; + u64 *counter; - if (vl != CNTR_INVALID_VL) - return 0; + if (vl == CNTR_INVALID_VL) + counter = &ppd->port_xmit_discards; + else if (vl >= 0 && vl < C_VL_COUNT) + counter = &ppd->port_xmit_discards_vl[vl]; + else + counter = &zero; - return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data); + return read_write_sw(ppd->dd, counter, mode, data); } static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, + u64 data) { struct hfi1_pportdata *ppd = context; @@ -1453,7 +1509,7 @@ static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, } static u64 access_rcv_constraint_errs(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { struct hfi1_pportdata *ppd = context; @@ -1478,7 +1534,6 @@ static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val, u64 __percpu *cntr, int vl, int mode, u64 data) { - u64 ret = 0; if (vl != CNTR_INVALID_VL) @@ -1510,7 +1565,7 @@ static u64 access_sw_cpu_intr(const struct cntr_entry *entry, } static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { struct hfi1_devdata *dd = context; @@ -1526,6 +1581,14 @@ static u64 access_sw_pio_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_piowait; } +static u64 access_sw_pio_drain(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_piodrain; +} + static u64 access_sw_vtx_wait(const struct cntr_entry *entry, void *context, int vl, int mode, u64 data) { @@ -1543,11 +1606,12 @@ static u64 access_sw_kmem_wait(const struct cntr_entry *entry, } static u64 access_sw_send_schedule(const struct cntr_entry *entry, - void *context, int vl, int mode, u64 data) + void *context, int vl, int mode, u64 data) { struct hfi1_devdata *dd = (struct hfi1_devdata *)context; - return dd->verbs_dev.n_send_schedule; + return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl, + mode, data); } /* Software counters for the error status bits within MISC_ERR_STATUS */ @@ -3885,8 +3949,8 @@ static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ void *context, int vl, int mode, u64 data) \ { \ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ - return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr, \ - ppd->ibport_data.cntr, vl, \ + return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr, \ + ppd->ibport_data.rvp.cntr, vl, \ mode, data); \ } @@ -3903,7 +3967,7 @@ static u64 access_ibp_##cntr(const struct cntr_entry *entry, \ if (vl != CNTR_INVALID_VL) \ return 0; \ \ - return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr, \ + return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr, \ mode, data); \ } @@ -4066,10 +4130,28 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { access_sw_vtx_wait), [C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL, access_sw_pio_wait), +[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL, + access_sw_pio_drain), [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, access_sw_kmem_wait), [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, access_sw_send_schedule), +[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", + SEND_DMA_DESC_FETCHED_CNT, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + dev_access_u32_csr), +[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_int_cnt), +[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_err_cnt), +[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_idle_int_cnt), +[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_progress_int_cnt), /* MISC_ERR_STATUS */ [C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0, CNTR_NORMAL, @@ -4879,28 +4961,28 @@ static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { [C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL), [C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH), [C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT, - CNTR_SYNTH | CNTR_VL), + CNTR_SYNTH | CNTR_VL), [C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT, - CNTR_SYNTH | CNTR_VL), + CNTR_SYNTH | CNTR_VL), [C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT, - CNTR_SYNTH | CNTR_VL), + CNTR_SYNTH | CNTR_VL), [C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL), [C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL), [C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT, - access_sw_link_dn_cnt), + access_sw_link_dn_cnt), [C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT, - access_sw_link_up_cnt), + access_sw_link_up_cnt), [C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL, access_sw_unknown_frame_cnt), [C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT, - access_sw_xmit_discards), + access_sw_xmit_discards), [C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0, - CNTR_SYNTH | CNTR_32BIT | CNTR_VL, - access_sw_xmit_discards), + CNTR_SYNTH | CNTR_32BIT | CNTR_VL, + access_sw_xmit_discards), [C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH, - access_xmit_constraint_errs), + access_xmit_constraint_errs), [C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH, - access_rcv_constraint_errs), + access_rcv_constraint_errs), [C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts), [C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends), [C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks), @@ -4916,9 +4998,9 @@ static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { [C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL, access_sw_cpu_rc_acks), [C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL, - access_sw_cpu_rc_qacks), + access_sw_cpu_rc_qacks), [C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL, - access_sw_cpu_rc_delayed_comp), + access_sw_cpu_rc_delayed_comp), [OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1), [OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3), [OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5), @@ -5067,7 +5149,7 @@ done: * the buffer. End in '*' if the buffer is too short. */ static char *flag_string(char *buf, int buf_len, u64 flags, - struct flag_table *table, int table_size) + struct flag_table *table, int table_size) { char extra[32]; char *p = buf; @@ -5128,10 +5210,8 @@ static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source) if (source < ARRAY_SIZE(cce_misc_names)) strncpy(buf, cce_misc_names[source], bsize); else - snprintf(buf, - bsize, - "Reserved%u", - source + IS_GENERAL_ERR_START); + snprintf(buf, bsize, "Reserved%u", + source + IS_GENERAL_ERR_START); return buf; } @@ -5170,7 +5250,7 @@ static char *is_various_name(char *buf, size_t bsize, unsigned int source) if (source < ARRAY_SIZE(various_names)) strncpy(buf, various_names[source], bsize); else - snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START); + snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START); return buf; } @@ -5255,51 +5335,56 @@ static char *is_reserved_name(char *buf, size_t bsize, unsigned int source) static char *cce_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags)); + cce_err_status_flags, + ARRAY_SIZE(cce_err_status_flags)); } static char *rxe_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags)); + rxe_err_status_flags, + ARRAY_SIZE(rxe_err_status_flags)); } static char *misc_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, misc_err_status_flags, - ARRAY_SIZE(misc_err_status_flags)); + ARRAY_SIZE(misc_err_status_flags)); } static char *pio_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags)); + pio_err_status_flags, + ARRAY_SIZE(pio_err_status_flags)); } static char *sdma_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - sdma_err_status_flags, - ARRAY_SIZE(sdma_err_status_flags)); + sdma_err_status_flags, + ARRAY_SIZE(sdma_err_status_flags)); } static char *egress_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags)); + egress_err_status_flags, + ARRAY_SIZE(egress_err_status_flags)); } static char *egress_err_info_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags)); + egress_err_info_flags, + ARRAY_SIZE(egress_err_info_flags)); } static char *send_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - send_err_status_flags, - ARRAY_SIZE(send_err_status_flags)); + send_err_status_flags, + ARRAY_SIZE(send_err_status_flags)); } static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) @@ -5312,7 +5397,7 @@ static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) * report or record it. */ dd_dev_info(dd, "CCE Error: %s\n", - cce_err_status_string(buf, sizeof(buf), reg)); + cce_err_status_string(buf, sizeof(buf), reg)); if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) && is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) { @@ -5342,14 +5427,14 @@ static void update_rcverr_timer(unsigned long opaque) u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); if (dd->rcv_ovfl_cnt < cur_ovfl_cnt && - ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { + ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); - set_link_down_reason(ppd, - OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, - OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); + set_link_down_reason( + ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); } - dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt; + dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt; mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); } @@ -5375,7 +5460,7 @@ static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) int i = 0; dd_dev_info(dd, "Receive Error: %s\n", - rxe_err_status_string(buf, sizeof(buf), reg)); + rxe_err_status_string(buf, sizeof(buf), reg)); if (reg & ALL_RXE_FREEZE_ERR) { int flags = 0; @@ -5402,7 +5487,7 @@ static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) int i = 0; dd_dev_info(dd, "Misc Error: %s", - misc_err_status_string(buf, sizeof(buf), reg)); + misc_err_status_string(buf, sizeof(buf), reg)); for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) { if (reg & (1ull << i)) incr_cntr64(&dd->misc_err_status_cnt[i]); @@ -5415,7 +5500,7 @@ static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg) int i = 0; dd_dev_info(dd, "PIO Error: %s\n", - pio_err_status_string(buf, sizeof(buf), reg)); + pio_err_status_string(buf, sizeof(buf), reg)); if (reg & ALL_PIO_FREEZE_ERR) start_freeze_handling(dd->pport, 0); @@ -5432,7 +5517,7 @@ static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) int i = 0; dd_dev_info(dd, "SDMA Error: %s\n", - sdma_err_status_string(buf, sizeof(buf), reg)); + sdma_err_status_string(buf, sizeof(buf), reg)); if (reg & ALL_SDMA_FREEZE_ERR) start_freeze_handling(dd->pport, 0); @@ -5443,12 +5528,14 @@ static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) } } -static void count_port_inactive(struct hfi1_devdata *dd) +static inline void __count_port_discards(struct hfi1_pportdata *ppd) { - struct hfi1_pportdata *ppd = dd->pport; + incr_cntr64(&ppd->port_xmit_discards); +} - if (ppd->port_xmit_discards < ~(u64)0) - ppd->port_xmit_discards++; +static void count_port_inactive(struct hfi1_devdata *dd) +{ + __count_port_discards(dd->pport); } /* @@ -5460,7 +5547,8 @@ static void count_port_inactive(struct hfi1_devdata *dd) * egress error if more than one packet fails the same integrity check * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO. */ -static void handle_send_egress_err_info(struct hfi1_devdata *dd) +static void handle_send_egress_err_info(struct hfi1_devdata *dd, + int vl) { struct hfi1_pportdata *ppd = dd->pport; u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */ @@ -5471,14 +5559,44 @@ static void handle_send_egress_err_info(struct hfi1_devdata *dd) write_csr(dd, SEND_EGRESS_ERR_INFO, info); dd_dev_info(dd, - "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", - info, egress_err_info_string(buf, sizeof(buf), info), src); + "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", + info, egress_err_info_string(buf, sizeof(buf), info), src); /* Eventually add other counters for each bit */ + if (info & PORT_DISCARD_EGRESS_ERRS) { + int weight, i; - if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) { - if (ppd->port_xmit_discards < ~(u64)0) - ppd->port_xmit_discards++; + /* + * Count all applicable bits as individual errors and + * attribute them to the packet that triggered this handler. + * This may not be completely accurate due to limitations + * on the available hardware error information. There is + * a single information register and any number of error + * packets may have occurred and contributed to it before + * this routine is called. This means that: + * a) If multiple packets with the same error occur before + * this routine is called, earlier packets are missed. + * There is only a single bit for each error type. + * b) Errors may not be attributed to the correct VL. + * The driver is attributing all bits in the info register + * to the packet that triggered this call, but bits + * could be an accumulation of different packets with + * different VLs. + * c) A single error packet may have multiple counts attached + * to it. There is no way for the driver to know if + * multiple bits set in the info register are due to a + * single packet or multiple packets. The driver assumes + * multiple packets. + */ + weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS); + for (i = 0; i < weight; i++) { + __count_port_discards(ppd); + if (vl >= 0 && vl < TXE_NUM_DATA_VL) + incr_cntr64(&ppd->port_xmit_discards_vl[vl]); + else if (vl == 15) + incr_cntr64(&ppd->port_xmit_discards_vl + [C_VL_15]); + } } } @@ -5496,12 +5614,71 @@ static inline int port_inactive_err(u64 posn) * Input value is a bit position within the SEND_EGRESS_ERR_STATUS * register. Does it represent a 'disallowed packet' error? */ -static inline int disallowed_pkt_err(u64 posn) +static inline int disallowed_pkt_err(int posn) { return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) && posn <= SEES(TX_SDMA15_DISALLOWED_PACKET)); } +/* + * Input value is a bit position of one of the SDMA engine disallowed + * packet errors. Return which engine. Use of this must be guarded by + * disallowed_pkt_err(). + */ +static inline int disallowed_pkt_engine(int posn) +{ + return posn - SEES(TX_SDMA0_DISALLOWED_PACKET); +} + +/* + * Translate an SDMA engine to a VL. Return -1 if the tranlation cannot + * be done. + */ +static int engine_to_vl(struct hfi1_devdata *dd, int engine) +{ + struct sdma_vl_map *m; + int vl; + + /* range check */ + if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES) + return -1; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + vl = m->engine_to_vl[engine]; + rcu_read_unlock(); + + return vl; +} + +/* + * Translate the send context (sofware index) into a VL. Return -1 if the + * translation cannot be done. + */ +static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) +{ + struct send_context_info *sci; + struct send_context *sc; + int i; + + sci = &dd->send_contexts[sw_index]; + + /* there is no information for user (PSM) and ack contexts */ + if (sci->type != SC_KERNEL) + return -1; + + sc = sci->sc; + if (!sc) + return -1; + if (dd->vld[15].sc == sc) + return 15; + for (i = 0; i < num_vls; i++) + if (dd->vld[i].sc == sc) + return i; + + return -1; +} + static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) { u64 reg_copy = reg, handled = 0; @@ -5510,34 +5687,34 @@ static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) if (reg & ALL_TXE_EGRESS_FREEZE_ERR) start_freeze_handling(dd->pport, 0); - if (is_ax(dd) && (reg & - SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) - && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) + else if (is_ax(dd) && + (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) && + (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) start_freeze_handling(dd->pport, 0); while (reg_copy) { int posn = fls64(reg_copy); - /* - * fls64() returns a 1-based offset, but we generally - * want 0-based offsets. - */ + /* fls64() returns a 1-based offset, we want it zero based */ int shift = posn - 1; + u64 mask = 1ULL << shift; if (port_inactive_err(shift)) { count_port_inactive(dd); - handled |= (1ULL << shift); + handled |= mask; } else if (disallowed_pkt_err(shift)) { - handle_send_egress_err_info(dd); - handled |= (1ULL << shift); + int vl = engine_to_vl(dd, disallowed_pkt_engine(shift)); + + handle_send_egress_err_info(dd, vl); + handled |= mask; } - clear_bit(shift, (unsigned long *)®_copy); + reg_copy &= ~mask; } reg &= ~handled; if (reg) dd_dev_info(dd, "Egress Error: %s\n", - egress_err_status_string(buf, sizeof(buf), reg)); + egress_err_status_string(buf, sizeof(buf), reg)); for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) { if (reg & (1ull << i)) @@ -5551,7 +5728,7 @@ static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) int i = 0; dd_dev_info(dd, "Send Error: %s\n", - send_err_status_string(buf, sizeof(buf), reg)); + send_err_status_string(buf, sizeof(buf), reg)); for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) { if (reg & (1ull << i)) @@ -5597,7 +5774,7 @@ static void interrupt_clear_down(struct hfi1_devdata *dd, u64 mask; dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n", - eri->desc, reg); + eri->desc, reg); /* * Read-modify-write so any other masked bits * remain masked. @@ -5621,14 +5798,15 @@ static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source) interrupt_clear_down(dd, 0, eri); } else { dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n", - source); + source); } } static char *send_context_err_status_string(char *buf, int buf_len, u64 flags) { return flag_string(buf, buf_len, flags, - sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags)); + sc_err_status_flags, + ARRAY_SIZE(sc_err_status_flags)); } /* @@ -5653,15 +5831,15 @@ static void is_sendctxt_err_int(struct hfi1_devdata *dd, sw_index = dd->hw_to_sw[hw_context]; if (sw_index >= dd->num_send_contexts) { dd_dev_err(dd, - "out of range sw index %u for send context %u\n", - sw_index, hw_context); + "out of range sw index %u for send context %u\n", + sw_index, hw_context); return; } sci = &dd->send_contexts[sw_index]; sc = sci->sc; if (!sc) { dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__, - sw_index, hw_context); + sw_index, hw_context); return; } @@ -5671,10 +5849,11 @@ static void is_sendctxt_err_int(struct hfi1_devdata *dd, status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS); dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context, - send_context_err_status_string(flags, sizeof(flags), status)); + send_context_err_status_string(flags, sizeof(flags), + status)); if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK) - handle_send_egress_err_info(dd); + handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index)); /* * Automatically restart halted kernel contexts out of interrupt @@ -5707,6 +5886,7 @@ static void handle_sdma_eng_err(struct hfi1_devdata *dd, dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n", sde->this_idx, source, (unsigned long long)status); #endif + sde->err_cnt++; sdma_engine_error(sde, status); /* @@ -5755,23 +5935,22 @@ static void is_various_int(struct hfi1_devdata *dd, unsigned int source) interrupt_clear_down(dd, 0, eri); else dd_dev_info(dd, - "%s: Unimplemented/reserved interrupt %d\n", - __func__, source); + "%s: Unimplemented/reserved interrupt %d\n", + __func__, source); } static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) { - /* source is always zero */ + /* src_ctx is always zero */ struct hfi1_pportdata *ppd = dd->pport; unsigned long flags; u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); if (reg & QSFP_HFI0_MODPRST_N) { - - dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n", - __func__); - if (!qsfp_mod_present(ppd)) { + dd_dev_info(dd, "%s: QSFP module removed\n", + __func__); + ppd->driver_link_ready = 0; /* * Cable removed, reset all our information about the @@ -5784,14 +5963,23 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) * an interrupt when a cable is inserted */ ppd->qsfp_info.cache_valid = 0; - ppd->qsfp_info.qsfp_interrupt_functional = 0; + ppd->qsfp_info.reset_needed = 0; + ppd->qsfp_info.limiting_active = 0; spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, - flags); - write_csr(dd, - dd->hfi1_id ? - ASIC_QSFP2_INVERT : - ASIC_QSFP1_INVERT, - qsfp_int_mgmt); + flags); + /* Invert the ModPresent pin now to detect plug-in */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + if ((ppd->offline_disabled_reason > + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) || + (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + if (ppd->host_link_state == HLS_DN_POLL) { /* * The link is still in POLL. This means @@ -5802,28 +5990,33 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) queue_work(ppd->hfi1_wq, &ppd->link_down_work); } } else { + dd_dev_info(dd, "%s: QSFP module inserted\n", + __func__); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.cache_valid = 0; ppd->qsfp_info.cache_refresh_required = 1; spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, - flags); + flags); + /* + * Stop inversion of ModPresent pin to detect + * removal of the cable + */ qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N; - write_csr(dd, - dd->hfi1_id ? - ASIC_QSFP2_INVERT : - ASIC_QSFP1_INVERT, - qsfp_int_mgmt); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); } } if (reg & QSFP_HFI0_INT_N) { - - dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n", - __func__); + dd_dev_info(dd, "%s: Interrupt received from QSFP module\n", + __func__); spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.check_interrupt_flags = 1; - ppd->qsfp_info.qsfp_interrupt_functional = 1; spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); } @@ -5837,11 +6030,11 @@ static int request_host_lcb_access(struct hfi1_devdata *dd) int ret; ret = do_8051_command(dd, HCMD_MISC, - (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT, - NULL); + (u64)HCMD_MISC_REQUEST_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "%s: command failed with error %d\n", - __func__, ret); + __func__, ret); } return ret == HCMD_SUCCESS ? 0 : -EBUSY; } @@ -5851,11 +6044,11 @@ static int request_8051_lcb_access(struct hfi1_devdata *dd) int ret; ret = do_8051_command(dd, HCMD_MISC, - (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT, - NULL); + (u64)HCMD_MISC_GRANT_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "%s: command failed with error %d\n", - __func__, ret); + __func__, ret); } return ret == HCMD_SUCCESS ? 0 : -EBUSY; } @@ -5867,8 +6060,8 @@ static int request_8051_lcb_access(struct hfi1_devdata *dd) static inline void set_host_lcb_access(struct hfi1_devdata *dd) { write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, - DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK - | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK | + DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); } /* @@ -5878,7 +6071,7 @@ static inline void set_host_lcb_access(struct hfi1_devdata *dd) static inline void set_8051_lcb_access(struct hfi1_devdata *dd) { write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, - DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); } /* @@ -5912,7 +6105,7 @@ int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) /* this access is valid only when the link is up */ if ((ppd->host_link_state & HLS_UP) == 0) { dd_dev_info(dd, "%s: link state %s not up\n", - __func__, link_state_name(ppd->host_link_state)); + __func__, link_state_name(ppd->host_link_state)); ret = -EBUSY; goto done; } @@ -5921,8 +6114,8 @@ int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) ret = request_host_lcb_access(dd); if (ret) { dd_dev_err(dd, - "%s: unable to acquire LCB access, err %d\n", - __func__, ret); + "%s: unable to acquire LCB access, err %d\n", + __func__, ret); goto done; } set_host_lcb_access(dd); @@ -5959,7 +6152,7 @@ int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) if (dd->lcb_access_count == 0) { dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n", - __func__); + __func__); goto done; } @@ -5968,8 +6161,8 @@ int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) ret = request_8051_lcb_access(dd); if (ret) { dd_dev_err(dd, - "%s: unable to release LCB access, err %d\n", - __func__, ret); + "%s: unable to release LCB access, err %d\n", + __func__, ret); /* restore host access if the grant didn't work */ set_host_lcb_access(dd); goto done; @@ -6001,19 +6194,26 @@ static void init_lcb_access(struct hfi1_devdata *dd) static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) { write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, - DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK - | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT - | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); + DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK | + (u64)return_code << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT | + (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); } /* - * Handle requests from the 8051. + * Handle host requests from the 8051. + * + * This is a work-queue function outside of the interrupt. */ -static void handle_8051_request(struct hfi1_devdata *dd) +void handle_8051_request(struct work_struct *work) { + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + dc_host_req_work); + struct hfi1_devdata *dd = ppd->dd; u64 reg; - u16 data; - u8 type; + u16 data = 0; + u8 type, i, lanes, *cache = ppd->qsfp_info.cache; + u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) @@ -6034,12 +6234,46 @@ static void handle_8051_request(struct hfi1_devdata *dd) case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: - case HREQ_ENABLE: dd_dev_info(dd, "8051 request: request 0x%x not supported\n", - type); + type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; + case HREQ_ENABLE: + lanes = data & 0xF; + for (i = 0; lanes; lanes >>= 1, i++) { + if (!(lanes & 1)) + continue; + if (data & 0x200) { + /* enable TX CDR */ + if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && + cache[QSFP_CDR_INFO_OFFS] & 0x80) + cdr_ctrl_byte |= (1 << (i + 4)); + } else { + /* disable TX CDR */ + if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && + cache[QSFP_CDR_INFO_OFFS] & 0x80) + cdr_ctrl_byte &= ~(1 << (i + 4)); + } + + if (data & 0x800) { + /* enable RX CDR */ + if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && + cache[QSFP_CDR_INFO_OFFS] & 0x40) + cdr_ctrl_byte |= (1 << i); + } else { + /* disable RX CDR */ + if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && + cache[QSFP_CDR_INFO_OFFS] & 0x40) + cdr_ctrl_byte &= ~(1 << i); + } + } + one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, + &cdr_ctrl_byte, 1); + hreq_response(dd, HREQ_SUCCESS, data); + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + break; + case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); break; @@ -6059,11 +6293,11 @@ static void write_global_credit(struct hfi1_devdata *dd, u8 vau, u16 total, u16 shared) { write_csr(dd, SEND_CM_GLOBAL_CREDIT, - ((u64)total - << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) - | ((u64)shared - << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) - | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT)); + ((u64)total << + SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) | + ((u64)shared << + SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) | + ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT)); } /* @@ -6100,7 +6334,7 @@ void reset_link_credits(struct hfi1_devdata *dd) /* remove all previous VL credit limits */ for (i = 0; i < TXE_NUM_DATA_VL; i++) - write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0); + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); write_csr(dd, SEND_CM_CREDIT_VL15, 0); write_global_credit(dd, 0, 0, 0); /* reset the CM block */ @@ -6142,15 +6376,14 @@ static void lcb_shutdown(struct hfi1_devdata *dd, int abort) write_csr(dd, DC_LCB_CFG_RUN, 0); /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, - 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); + 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */ dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); reg = read_csr(dd, DCC_CFG_RESET); - write_csr(dd, DCC_CFG_RESET, - reg - | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) - | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); - (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ + write_csr(dd, DCC_CFG_RESET, reg | + (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) | + (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ if (!abort) { udelay(1); /* must hold for the longer of 16cclks or 20ns */ write_csr(dd, DCC_CFG_RESET, reg); @@ -6179,14 +6412,18 @@ static void dc_shutdown(struct hfi1_devdata *dd) spin_unlock_irqrestore(&dd->dc8051_lock, flags); /* Shutdown the LCB */ lcb_shutdown(dd, 1); - /* Going to OFFLINE would have causes the 8051 to put the + /* + * Going to OFFLINE would have causes the 8051 to put the * SerDes into reset already. Just need to shut down the 8051, - * itself. */ + * itself. + */ write_csr(dd, DC_DC8051_CFG_RST, 0x1); } -/* Calling this after the DC has been brought out of reset should not - * do any damage. */ +/* + * Calling this after the DC has been brought out of reset should not + * do any damage. + */ static void dc_start(struct hfi1_devdata *dd) { unsigned long flags; @@ -6202,7 +6439,7 @@ static void dc_start(struct hfi1_devdata *dd) ret = wait_fm_ready(dd, TIMEOUT_8051_START); if (ret) { dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", - __func__); + __func__); } /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ write_csr(dd, DCC_CFG_RESET, 0x10); @@ -6295,7 +6532,7 @@ static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd) write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr); /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */ write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, - DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr); } @@ -6312,8 +6549,10 @@ void handle_sma_message(struct work_struct *work) u64 msg; int ret; - /* msg is bytes 1-4 of the 40-bit idle message - the command code - is stripped off */ + /* + * msg is bytes 1-4 of the 40-bit idle message - the command code + * is stripped off + */ ret = read_idle_sma(dd, &msg); if (ret) return; @@ -6339,8 +6578,8 @@ void handle_sma_message(struct work_struct *work) * * Can activate the node. Discard otherwise. */ - if (ppd->host_link_state == HLS_UP_ARMED - && ppd->is_active_optimize_enabled) { + if (ppd->host_link_state == HLS_UP_ARMED && + ppd->is_active_optimize_enabled) { ppd->neighbor_normal = 1; ret = set_link_state(ppd, HLS_UP_ACTIVE); if (ret) @@ -6352,8 +6591,8 @@ void handle_sma_message(struct work_struct *work) break; default: dd_dev_err(dd, - "%s: received unexpected SMA idle message 0x%llx\n", - __func__, msg); + "%s: received unexpected SMA idle message 0x%llx\n", + __func__, msg); break; } } @@ -6445,10 +6684,9 @@ static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) if (time_after(jiffies, timeout)) { dd_dev_err(dd, - "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", - freeze ? "" : "un", - reg & ALL_FROZE, - freeze ? ALL_FROZE : 0ull); + "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", + freeze ? "" : "un", reg & ALL_FROZE, + freeze ? ALL_FROZE : 0ull); return; } usleep_range(80, 120); @@ -6478,11 +6716,17 @@ static void rxe_freeze(struct hfi1_devdata *dd) */ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) { + u32 rcvmask; int i; /* enable all kernel contexts */ - for (i = 0; i < dd->n_krcv_queues; i++) - hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i); + for (i = 0; i < dd->n_krcv_queues; i++) { + rcvmask = HFI1_RCVCTRL_CTXT_ENB; + /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ + rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(dd, rcvmask, i); + } /* enable port */ add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); @@ -6567,7 +6811,7 @@ void handle_freeze(struct work_struct *work) void handle_link_up(struct work_struct *work) { struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - link_up_work); + link_up_work); set_link_state(ppd, HLS_UP_INIT); /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ @@ -6586,17 +6830,20 @@ void handle_link_up(struct work_struct *work) if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) { /* oops - current speed is not enabled, bounce */ dd_dev_err(ppd->dd, - "Link speed active 0x%x is outside enabled 0x%x, downing link\n", - ppd->link_speed_active, ppd->link_speed_enabled); + "Link speed active 0x%x is outside enabled 0x%x, downing link\n", + ppd->link_speed_active, ppd->link_speed_enabled); set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0, - OPA_LINKDOWN_REASON_SPEED_POLICY); + OPA_LINKDOWN_REASON_SPEED_POLICY); set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); start_link(ppd); } } -/* Several pieces of LNI information were cached for SMA in ppd. - * Reset these on link down */ +/* + * Several pieces of LNI information were cached for SMA in ppd. + * Reset these on link down + */ static void reset_neighbor_info(struct hfi1_pportdata *ppd) { ppd->neighbor_guid = 0; @@ -6616,7 +6863,13 @@ void handle_link_down(struct work_struct *work) struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, link_down_work); - /* go offline first, then deal with reasons */ + if ((ppd->host_link_state & + (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && + ppd->port_type == PORT_TYPE_FIXED) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); + + /* Go offline first, then deal with reading/writing through 8051 */ set_link_state(ppd, HLS_DN_OFFLINE); lcl_reason = 0; @@ -6636,12 +6889,16 @@ void handle_link_down(struct work_struct *work) /* disable the port */ clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); - /* If there is no cable attached, turn the DC off. Otherwise, - * start the link bring up. */ - if (!qsfp_mod_present(ppd)) + /* + * If there is no cable attached, turn the DC off. Otherwise, + * start the link bring up. + */ + if (!qsfp_mod_present(ppd)) { dc_shutdown(ppd->dd); - else + } else { + tune_serdes(ppd); start_link(ppd); + } } void handle_link_bounce(struct work_struct *work) @@ -6654,10 +6911,11 @@ void handle_link_bounce(struct work_struct *work) */ if (ppd->host_link_state & HLS_UP) { set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); start_link(ppd); } else { dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n", - __func__, link_state_name(ppd->host_link_state)); + __func__, link_state_name(ppd->host_link_state)); } } @@ -6754,7 +7012,7 @@ static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) case 3: return OPA_LINK_WIDTH_3X; default: dd_dev_info(dd, "%s: invalid width %d, using 4\n", - __func__, width); + __func__, width); /* fall through */ case 4: return OPA_LINK_WIDTH_4X; } @@ -6766,6 +7024,7 @@ static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) static const u8 bit_counts[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + static inline u8 nibble_to_count(u8 nibble) { return bit_counts[nibble & 0xf]; @@ -6791,7 +7050,7 @@ static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, /* read the active lanes */ read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, - &rx_polarity_inversion, &max_rate); + &rx_polarity_inversion, &max_rate); read_local_lni(dd, &enable_lane_rx); /* convert to counts */ @@ -6803,8 +7062,8 @@ static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, * handle_verify_cap(). The ASIC 8051 firmware does not correctly * set the max_rate field in handle_verify_cap until v0.19. */ - if ((dd->icode == ICODE_RTL_SILICON) - && (dd->dc8051_ver < dc8051_ver(0, 19))) { + if ((dd->icode == ICODE_RTL_SILICON) && + (dd->dc8051_ver < dc8051_ver(0, 19))) { /* max_rate: 0 = 12.5G, 1 = 25G */ switch (max_rate) { case 0: @@ -6812,8 +7071,8 @@ static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, break; default: dd_dev_err(dd, - "%s: unexpected max rate %d, using 25Gb\n", - __func__, (int)max_rate); + "%s: unexpected max rate %d, using 25Gb\n", + __func__, (int)max_rate); /* fall through */ case 1: dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; @@ -6822,8 +7081,8 @@ static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, } dd_dev_info(dd, - "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", - enable_lane_tx, tx, enable_lane_rx, rx); + "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", + enable_lane_tx, tx, enable_lane_rx, rx); *tx_width = link_width_to_bits(dd, tx); *rx_width = link_width_to_bits(dd, rx); } @@ -6926,13 +7185,8 @@ void handle_verify_cap(struct work_struct *work) */ read_vc_remote_phy(dd, &power_management, &continious); - read_vc_remote_fabric( - dd, - &vau, - &z, - &vcu, - &vl15buf, - &partner_supported_crc); + read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf, + &partner_supported_crc); read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); read_remote_device_id(dd, &device_id, &device_rev); /* @@ -6943,19 +7197,16 @@ void handle_verify_cap(struct work_struct *work) /* print the active widths */ get_link_widths(dd, &active_tx, &active_rx); dd_dev_info(dd, - "Peer PHY: power management 0x%x, continuous updates 0x%x\n", - (int)power_management, (int)continious); + "Peer PHY: power management 0x%x, continuous updates 0x%x\n", + (int)power_management, (int)continious); dd_dev_info(dd, - "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", - (int)vau, - (int)z, - (int)vcu, - (int)vl15buf, - (int)partner_supported_crc); + "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", + (int)vau, (int)z, (int)vcu, (int)vl15buf, + (int)partner_supported_crc); dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n", - (u32)remote_tx_rate, (u32)link_widths); + (u32)remote_tx_rate, (u32)link_widths); dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n", - (u32)device_id, (u32)device_rev); + (u32)device_id, (u32)device_rev); /* * The peer vAU value just read is the peer receiver value. HFI does * not support a transmit vAU of 0 (AU == 8). We advertised that @@ -6990,10 +7241,10 @@ void handle_verify_cap(struct work_struct *work) reg = read_csr(dd, SEND_CM_CTRL); if (crc_val == LCB_CRC_14B && crc_14b_sideband) { write_csr(dd, SEND_CM_CTRL, - reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); } else { write_csr(dd, SEND_CM_CTRL, - reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); } ppd->link_speed_active = 0; /* invalid value */ @@ -7018,7 +7269,7 @@ void handle_verify_cap(struct work_struct *work) } if (ppd->link_speed_active == 0) { dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n", - __func__, (int)remote_tx_rate); + __func__, (int)remote_tx_rate); ppd->link_speed_active = OPA_LINK_SPEED_25G; } @@ -7074,9 +7325,9 @@ void handle_verify_cap(struct work_struct *work) read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) & DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK; dd_dev_info(dd, - "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n", - ppd->neighbor_guid, ppd->neighbor_type, - ppd->mgmt_allowed, ppd->neighbor_fm_security); + "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n", + ppd->neighbor_guid, ppd->neighbor_type, + ppd->mgmt_allowed, ppd->neighbor_fm_security); if (ppd->mgmt_allowed) add_full_mgmt_pkey(ppd); @@ -7130,28 +7381,27 @@ retry: /* bounce if not at starting active width */ if ((ppd->link_width_active != - ppd->link_width_downgrade_tx_active) - || (ppd->link_width_active != - ppd->link_width_downgrade_rx_active)) { + ppd->link_width_downgrade_tx_active) || + (ppd->link_width_active != + ppd->link_width_downgrade_rx_active)) { dd_dev_err(ppd->dd, - "Link downgrade is disabled and link has downgraded, downing link\n"); + "Link downgrade is disabled and link has downgraded, downing link\n"); dd_dev_err(ppd->dd, - " original 0x%x, tx active 0x%x, rx active 0x%x\n", - ppd->link_width_active, - ppd->link_width_downgrade_tx_active, - ppd->link_width_downgrade_rx_active); + " original 0x%x, tx active 0x%x, rx active 0x%x\n", + ppd->link_width_active, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); do_bounce = 1; } - } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 - || (lwde & ppd->link_width_downgrade_rx_active) == 0) { + } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 || + (lwde & ppd->link_width_downgrade_rx_active) == 0) { /* Tx or Rx is outside the enabled policy */ dd_dev_err(ppd->dd, - "Link is outside of downgrade allowed, downing link\n"); + "Link is outside of downgrade allowed, downing link\n"); dd_dev_err(ppd->dd, - " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", - lwde, - ppd->link_width_downgrade_tx_active, - ppd->link_width_downgrade_rx_active); + " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", + lwde, ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); do_bounce = 1; } @@ -7160,8 +7410,9 @@ done: if (do_bounce) { set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0, - OPA_LINKDOWN_REASON_WIDTH_POLICY); + OPA_LINKDOWN_REASON_WIDTH_POLICY); set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); start_link(ppd); } } @@ -7242,9 +7493,10 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { queue_link_down = 1; dd_dev_info(dd, "Link error: %s\n", - dc8051_info_err_string(buf, - sizeof(buf), - err & FAILED_LNI)); + dc8051_info_err_string(buf, + sizeof(buf), + err & + FAILED_LNI)); } err &= ~(u64)FAILED_LNI; } @@ -7256,7 +7508,8 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) if (err) { /* report remaining errors, but do not do anything */ dd_dev_err(dd, "8051 info error: %s\n", - dc8051_info_err_string(buf, sizeof(buf), err)); + dc8051_info_err_string(buf, sizeof(buf), + err)); } /* @@ -7284,7 +7537,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { - handle_8051_request(dd); + queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work); host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { @@ -7309,8 +7562,9 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) if (host_msg) { /* report remaining messages, but do not do anything */ dd_dev_info(dd, "8051 info host message: %s\n", - dc8051_info_host_msg_string(buf, sizeof(buf), - host_msg)); + dc8051_info_host_msg_string(buf, + sizeof(buf), + host_msg)); } reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK; @@ -7323,25 +7577,27 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) */ dd_dev_err(dd, "Lost 8051 heartbeat\n"); write_csr(dd, DC_DC8051_ERR_EN, - read_csr(dd, DC_DC8051_ERR_EN) - & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); + read_csr(dd, DC_DC8051_ERR_EN) & + ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK; } if (reg) { /* report the error, but do not do anything */ dd_dev_err(dd, "8051 error: %s\n", - dc8051_err_string(buf, sizeof(buf), reg)); + dc8051_err_string(buf, sizeof(buf), reg)); } if (queue_link_down) { - /* if the link is already going down or disabled, do not - * queue another */ - if ((ppd->host_link_state - & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN)) - || ppd->link_enabled == 0) { + /* + * if the link is already going down or disabled, do not + * queue another + */ + if ((ppd->host_link_state & + (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) || + ppd->link_enabled == 0) { dd_dev_info(dd, "%s: not queuing link down\n", - __func__); + __func__); } else { queue_work(ppd->hfi1_wq, &ppd->link_down_work); } @@ -7483,8 +7739,10 @@ static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) /* set status bit */ dd->err_info_rcvport.status_and_code |= OPA_EI_STATUS_SMASK; - /* save first 2 flits in the packet that caused - * the error */ + /* + * save first 2 flits in the packet that caused + * the error + */ dd->err_info_rcvport.packet_flit1 = hdr0; dd->err_info_rcvport.packet_flit2 = hdr1; } @@ -7517,7 +7775,7 @@ static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) /* just report this */ dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra); dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n", - hdr0, hdr1); + hdr0, hdr1); reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK; } @@ -7536,7 +7794,7 @@ static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) /* report any remaining errors */ if (reg) dd_dev_info(dd, "DCC Error: %s\n", - dcc_err_string(buf, sizeof(buf), reg)); + dcc_err_string(buf, sizeof(buf), reg)); if (lcl_reason == 0) lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN; @@ -7553,7 +7811,7 @@ static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg) char buf[96]; dd_dev_info(dd, "LCB Error: %s\n", - lcb_err_string(buf, sizeof(buf), reg)); + lcb_err_string(buf, sizeof(buf), reg)); } /* @@ -7643,7 +7901,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) err_detail = "out of range"; } dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n", - err_detail, source); + err_detail, source); } /* @@ -7669,7 +7927,7 @@ static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) err_detail = "out of range"; } dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n", - err_detail, source); + err_detail, source); } /* @@ -7680,12 +7938,14 @@ static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source) char name[64]; dd_dev_err(dd, "unexpected %s interrupt\n", - is_reserved_name(name, sizeof(name), source)); + is_reserved_name(name, sizeof(name), source)); } static const struct is_table is_table[] = { -/* start end - name func interrupt func */ +/* + * start end + * name func interrupt func + */ { IS_GENERAL_ERR_START, IS_GENERAL_ERR_END, is_misc_err_name, is_misc_err_int }, { IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END, @@ -7756,7 +8016,7 @@ static irqreturn_t general_interrupt(int irq, void *data) /* phase 2: call the appropriate handler */ for_each_set_bit(bit, (unsigned long *)®s[0], - CCE_NUM_INT_CSRS*64) { + CCE_NUM_INT_CSRS * 64) { is_interrupt(dd, bit); } @@ -7779,27 +8039,27 @@ static irqreturn_t sdma_interrupt(int irq, void *data) /* This read_csr is really bad in the hot path */ status = read_csr(dd, - CCE_INT_STATUS + (8*(IS_SDMA_START/64))) - & sde->imask; + CCE_INT_STATUS + (8 * (IS_SDMA_START / 64))) + & sde->imask; if (likely(status)) { /* clear the interrupt(s) */ write_csr(dd, - CCE_INT_CLEAR + (8*(IS_SDMA_START/64)), - status); + CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)), + status); /* handle the interrupt(s) */ sdma_engine_interrupt(sde, status); } else dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n", - sde->this_idx); + sde->this_idx); return IRQ_HANDLED; } /* - * Clear the receive interrupt, forcing the write and making sure - * we have data from the chip, pushing everything in front of it - * back to the host. + * Clear the receive interrupt. Use a read of the interrupt clear CSR + * to insure that the write completed. This does NOT guarantee that + * queued DMA writes to memory from the chip are pushed. */ static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd) { @@ -7813,27 +8073,45 @@ static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd) } /* force the receive interrupt */ -static inline void force_recv_intr(struct hfi1_ctxtdata *rcd) +void force_recv_intr(struct hfi1_ctxtdata *rcd) { write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask); } -/* return non-zero if a packet is present */ +/* + * Return non-zero if a packet is present. + * + * This routine is called when rechecking for packets after the RcvAvail + * interrupt has been cleared down. First, do a quick check of memory for + * a packet present. If not found, use an expensive CSR read of the context + * tail to determine the actual tail. The CSR read is necessary because there + * is no method to push pending DMAs to memory other than an interrupt and we + * are trying to determine if we need to force an interrupt. + */ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) { + u32 tail; + int present; + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) - return (rcd->seq_cnt == + present = (rcd->seq_cnt == rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))); + else /* is RDMA rtail */ + present = (rcd->head != get_rcvhdrtail(rcd)); + + if (present) + return 1; - /* else is RDMA rtail */ - return (rcd->head != get_rcvhdrtail(rcd)); + /* fall back to a CSR read, correct indpendent of DMA_RTAIL */ + tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + return rcd->head != tail; } /* * Receive packet IRQ handler. This routine expects to be on its own IRQ. * This routine will try to handle packets immediately (latency), but if * it finds too many, it will invoke the thread handler (bandwitdh). The - * chip receive interupt is *not* cleared down until this or the thread (if + * chip receive interrupt is *not* cleared down until this or the thread (if * invoked) is finished. The intent is to avoid extra interrupts while we * are processing packets anyway. */ @@ -7846,6 +8124,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data) trace_hfi1_receive_interrupt(dd, rcd->ctxt); this_cpu_inc(*dd->int_counter); + aspm_ctx_disable(rcd); /* receive interrupt remains blocked while processing packets */ disposition = rcd->do_interrupt(rcd, 0); @@ -7912,7 +8191,7 @@ u32 read_physical_state(struct hfi1_devdata *dd) & DC_DC8051_STS_CUR_STATE_PORT_MASK; } -static u32 read_logical_state(struct hfi1_devdata *dd) +u32 read_logical_state(struct hfi1_devdata *dd) { u64 reg; @@ -8160,8 +8439,8 @@ static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); } -static int load_8051_config(struct hfi1_devdata *dd, u8 field_id, - u8 lane_id, u32 config_data) +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) { u64 data; int ret; @@ -8172,8 +8451,8 @@ static int load_8051_config(struct hfi1_devdata *dd, u8 field_id, ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, - "load 8051 config: field id %d, lane %d, err %d\n", - (int)field_id, (int)lane_id, ret); + "load 8051 config: field id %d, lane %d, err %d\n", + (int)field_id, (int)lane_id, ret); } return ret; } @@ -8183,8 +8462,8 @@ static int load_8051_config(struct hfi1_devdata *dd, u8 field_id, * set the result, even on error. * Return 0 on success, -errno on failure */ -static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, - u32 *result) +int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, + u32 *result) { u64 big_data; u32 addr; @@ -8210,7 +8489,7 @@ static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, } else { *result = 0; dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n", - __func__, lane_id, field_id); + __func__, lane_id, field_id); } return ret; @@ -8247,7 +8526,7 @@ static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, u32 frame; read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, - &frame); + &frame); *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; @@ -8329,7 +8608,7 @@ static void read_vc_remote_link_width(struct hfi1_devdata *dd, u32 frame; read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG, - &frame); + &frame); *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT) & REMOTE_TX_RATE_MASK; *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; @@ -8369,7 +8648,7 @@ void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality) *link_quality = 0; if (dd->pport->host_link_state & HLS_UP) { ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, - &frame); + &frame); if (ret == 0) *link_quality = (frame >> LINK_QUALITY_SHIFT) & LINK_QUALITY_MASK; @@ -8429,10 +8708,9 @@ static void check_fabric_firmware_versions(struct hfi1_devdata *dd) for (lane = 0; lane < 4; lane++) { ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame); if (ret) { - dd_dev_err( - dd, - "Unable to read lane %d firmware details\n", - lane); + dd_dev_err(dd, + "Unable to read lane %d firmware details\n", + lane); continue; } version = (frame >> SPICO_ROM_VERSION_SHIFT) @@ -8440,8 +8718,8 @@ static void check_fabric_firmware_versions(struct hfi1_devdata *dd) prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT) & SPICO_ROM_PROD_ID_MASK; dd_dev_info(dd, - "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", - lane, version, prod_id); + "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", + lane, version, prod_id); } } @@ -8454,11 +8732,10 @@ static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) { int ret; - ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, - type, data_out); + ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "read idle message: type %d, err %d\n", - (u32)type, ret); + (u32)type, ret); return -EINVAL; } dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out); @@ -8475,8 +8752,8 @@ static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) */ static int read_idle_sma(struct hfi1_devdata *dd, u64 *data) { - return read_idle_message(dd, - (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data); + return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, + data); } /* @@ -8492,7 +8769,7 @@ static int send_idle_message(struct hfi1_devdata *dd, u64 data) ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n", - data, ret); + data, ret); return -EINVAL; } return 0; @@ -8507,8 +8784,8 @@ int send_idle_sma(struct hfi1_devdata *dd, u64 message) { u64 data; - data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) - | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); + data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) | + ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); return send_idle_message(dd, data); } @@ -8530,7 +8807,7 @@ static int do_quick_linkup(struct hfi1_devdata *dd) /* LCB_CFG_LOOPBACK.VAL = 2 */ /* LCB_CFG_LANE_WIDTH.VAL = 0 */ write_csr(dd, DC_LCB_CFG_LOOPBACK, - IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); + IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); } @@ -8542,25 +8819,24 @@ static int do_quick_linkup(struct hfi1_devdata *dd) if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { /* LCB_CFG_RUN.EN = 1 */ write_csr(dd, DC_LCB_CFG_RUN, - 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + 1ull << DC_LCB_CFG_RUN_EN_SHIFT); /* watch LCB_STS_LINK_TRANSFER_ACTIVE */ timeout = jiffies + msecs_to_jiffies(10); while (1) { - reg = read_csr(dd, - DC_LCB_STS_LINK_TRANSFER_ACTIVE); + reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE); if (reg) break; if (time_after(jiffies, timeout)) { dd_dev_err(dd, - "timeout waiting for LINK_TRANSFER_ACTIVE\n"); + "timeout waiting for LINK_TRANSFER_ACTIVE\n"); return -ETIMEDOUT; } udelay(2); } write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, - 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); + 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); } if (!loopback) { @@ -8572,10 +8848,9 @@ static int do_quick_linkup(struct hfi1_devdata *dd) * done with LCB set up before resuming. */ dd_dev_err(dd, - "Pausing for peer to be finished with LCB set up\n"); + "Pausing for peer to be finished with LCB set up\n"); msleep(5000); - dd_dev_err(dd, - "Continuing with quick linkup\n"); + dd_dev_err(dd, "Continuing with quick linkup\n"); } write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ @@ -8589,8 +8864,8 @@ static int do_quick_linkup(struct hfi1_devdata *dd) ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, - "%s: set physical link state to quick LinkUp failed with return %d\n", - __func__, ret); + "%s: set physical link state to quick LinkUp failed with return %d\n", + __func__, ret); set_host_lcb_access(dd); write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ @@ -8615,8 +8890,8 @@ static int set_serdes_loopback_mode(struct hfi1_devdata *dd) if (ret == HCMD_SUCCESS) return 0; dd_dev_err(dd, - "Set physical link state to SerDes Loopback failed with return %d\n", - ret); + "Set physical link state to SerDes Loopback failed with return %d\n", + ret); if (ret >= 0) ret = -EINVAL; return ret; @@ -8631,7 +8906,7 @@ static int init_loopback(struct hfi1_devdata *dd) /* all loopbacks should disable self GUID check */ write_csr(dd, DC_DC8051_CFG_MODE, - (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); + (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); /* * The simulator has only one loopback option - LCB. Switch @@ -8639,10 +8914,9 @@ static int init_loopback(struct hfi1_devdata *dd) * * Accept all valid loopback values. */ - if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) - && (loopback == LOOPBACK_SERDES - || loopback == LOOPBACK_LCB - || loopback == LOOPBACK_CABLE)) { + if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) && + (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || + loopback == LOOPBACK_CABLE)) { loopback = LOOPBACK_LCB; quick_linkup = 1; return 0; @@ -8663,7 +8937,7 @@ static int init_loopback(struct hfi1_devdata *dd) /* not supported in emulation due to emulation RTL changes */ if (dd->icode == ICODE_FPGA_EMULATION) { dd_dev_err(dd, - "LCB loopback not supported in emulation\n"); + "LCB loopback not supported in emulation\n"); return -EINVAL; } return 0; @@ -8690,10 +8964,10 @@ static u16 opa_to_vc_link_widths(u16 opa_widths) u16 from; u16 to; } opa_link_xlate[] = { - { OPA_LINK_WIDTH_1X, 1 << (1-1) }, - { OPA_LINK_WIDTH_2X, 1 << (2-1) }, - { OPA_LINK_WIDTH_3X, 1 << (3-1) }, - { OPA_LINK_WIDTH_4X, 1 << (4-1) }, + { OPA_LINK_WIDTH_1X, 1 << (1 - 1) }, + { OPA_LINK_WIDTH_2X, 1 << (2 - 1) }, + { OPA_LINK_WIDTH_3X, 1 << (3 - 1) }, + { OPA_LINK_WIDTH_4X, 1 << (4 - 1) }, }; for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) { @@ -8719,7 +8993,7 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) /* set the local tx rate - need to read-modify-write */ ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, - &rx_polarity_inversion, &ppd->local_tx_rate); + &rx_polarity_inversion, &ppd->local_tx_rate); if (ret) goto set_local_link_attributes_fail; @@ -8740,15 +9014,16 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) enable_lane_tx = 0xF; /* enable all four lanes */ ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion, - rx_polarity_inversion, ppd->local_tx_rate); + rx_polarity_inversion, ppd->local_tx_rate); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; /* * DC supports continuous updates. */ - ret = write_vc_local_phy(dd, 0 /* no power management */, - 1 /* continuous updates */); + ret = write_vc_local_phy(dd, + 0 /* no power management */, + 1 /* continuous updates */); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; @@ -8759,7 +9034,8 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) goto set_local_link_attributes_fail; ret = write_vc_local_link_width(dd, 0, 0, - opa_to_vc_link_widths(ppd->link_width_enabled)); + opa_to_vc_link_widths( + ppd->link_width_enabled)); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; @@ -8770,8 +9046,8 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) set_local_link_attributes_fail: dd_dev_err(dd, - "Failed to set local link attributes, return 0x%x\n", - ret); + "Failed to set local link attributes, return 0x%x\n", + ret); return ret; } @@ -8784,54 +9060,101 @@ int start_link(struct hfi1_pportdata *ppd) { if (!ppd->link_enabled) { dd_dev_info(ppd->dd, - "%s: stopping link start because link is disabled\n", - __func__); + "%s: stopping link start because link is disabled\n", + __func__); return 0; } if (!ppd->driver_link_ready) { dd_dev_info(ppd->dd, - "%s: stopping link start because driver is not ready\n", - __func__); + "%s: stopping link start because driver is not ready\n", + __func__); return 0; } if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || - loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + loopback == LOOPBACK_LCB || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) return set_link_state(ppd, HLS_DN_POLL); dd_dev_info(ppd->dd, - "%s: stopping link start because no cable is present\n", - __func__); + "%s: stopping link start because no cable is present\n", + __func__); return -EAGAIN; } -static void reset_qsfp(struct hfi1_pportdata *ppd) +static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + unsigned long timeout; + + /* + * Check for QSFP interrupt for t_init (SFF 8679) + */ + timeout = jiffies + msecs_to_jiffies(2000); + while (1) { + mask = read_csr(dd, dd->hfi1_id ? + ASIC_QSFP2_IN : ASIC_QSFP1_IN); + if (!(mask & QSFP_HFI0_INT_N)) { + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : + ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N); + break; + } + if (time_after(jiffies, timeout)) { + dd_dev_info(dd, "%s: No IntN detected, reset complete\n", + __func__); + break; + } + udelay(2); + } +} + +static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + + mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); + if (enable) + mask |= (u64)QSFP_HFI0_INT_N; + else + mask &= ~(u64)QSFP_HFI0_INT_N; + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); +} + +void reset_qsfp(struct hfi1_pportdata *ppd) { struct hfi1_devdata *dd = ppd->dd; u64 mask, qsfp_mask; + /* Disable INT_N from triggering QSFP interrupts */ + set_qsfp_int_n(ppd, 0); + + /* Reset the QSFP */ mask = (u64)QSFP_HFI0_RESET_N; - qsfp_mask = read_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE); + qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE); qsfp_mask |= mask; - write_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, - qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask); qsfp_mask = read_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); qsfp_mask &= ~mask; write_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, - qsfp_mask); + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); udelay(10); qsfp_mask |= mask; write_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, - qsfp_mask); + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); } static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, @@ -8840,102 +9163,86 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, struct hfi1_devdata *dd = ppd->dd; if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || - (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) - dd_dev_info(dd, - "%s: QSFP cable on fire\n", - __func__); + (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) + dd_dev_info(dd, "%s: QSFP cable on fire\n", + __func__); if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || - (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) - dd_dev_info(dd, - "%s: QSFP cable temperature too low\n", - __func__); + (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) + dd_dev_info(dd, "%s: QSFP cable temperature too low\n", + __func__); if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || - (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) - dd_dev_info(dd, - "%s: QSFP supply voltage too high\n", - __func__); + (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) + dd_dev_info(dd, "%s: QSFP supply voltage too high\n", + __func__); if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || - (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) - dd_dev_info(dd, - "%s: QSFP supply voltage too low\n", - __func__); + (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) + dd_dev_info(dd, "%s: QSFP supply voltage too low\n", + __func__); /* Byte 2 is vendor specific */ if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || - (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable RX channel 1/2 power too high\n", - __func__); + (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n", + __func__); if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || - (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable RX channel 1/2 power too low\n", - __func__); + (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n", + __func__); if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || - (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable RX channel 3/4 power too high\n", - __func__); + (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n", + __func__); if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || - (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable RX channel 3/4 power too low\n", - __func__); + (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n", + __func__); if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || - (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 1/2 bias too high\n", - __func__); + (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n", + __func__); if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || - (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 1/2 bias too low\n", - __func__); + (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n", + __func__); if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || - (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 3/4 bias too high\n", - __func__); + (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n", + __func__); if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || - (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 3/4 bias too low\n", - __func__); + (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n", + __func__); if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || - (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 1/2 power too high\n", - __func__); + (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n", + __func__); if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || - (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 1/2 power too low\n", - __func__); + (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n", + __func__); if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || - (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 3/4 power too high\n", - __func__); + (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n", + __func__); if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || - (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) - dd_dev_info(dd, - "%s: Cable TX channel 3/4 power too low\n", - __func__); + (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n", + __func__); /* Bytes 9-10 and 11-12 are reserved */ /* Bytes 13-15 are vendor specific */ @@ -8943,35 +9250,8 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, return 0; } -static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd) -{ - refresh_qsfp_cache(ppd, &ppd->qsfp_info); - - return 0; -} - -static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd) -{ - struct hfi1_devdata *dd = ppd->dd; - u8 qsfp_interrupt_status = 0; - - if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1) - != 1) { - dd_dev_info(dd, - "%s: Failed to read status of QSFP module\n", - __func__); - return -EIO; - } - - /* We don't care about alarms & warnings with a non-functional INT_N */ - if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY)) - do_pre_lni_host_behaviors(ppd); - - return 0; -} - /* This routine will only be scheduled if the QSFP module is present */ -static void qsfp_event(struct work_struct *work) +void qsfp_event(struct work_struct *work) { struct qsfp_data *qd; struct hfi1_pportdata *ppd; @@ -8993,76 +9273,75 @@ static void qsfp_event(struct work_struct *work) dc_start(dd); if (qd->cache_refresh_required) { - msleep(3000); - reset_qsfp(ppd); + set_qsfp_int_n(ppd, 0); - /* Check for QSFP interrupt after t_init (SFF 8679) - * + extra + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings */ - msleep(3000); - if (!qd->qsfp_interrupt_functional) { - if (do_qsfp_intr_fallback(ppd) < 0) - dd_dev_info(dd, "%s: QSFP fallback failed\n", - __func__); - ppd->driver_link_ready = 1; - start_link(ppd); - } + set_qsfp_int_n(ppd, 1); + + tune_serdes(ppd); + + start_link(ppd); } if (qd->check_interrupt_flags) { u8 qsfp_interrupt_status[16] = {0,}; - if (qsfp_read(ppd, dd->hfi1_id, 6, - &qsfp_interrupt_status[0], 16) != 16) { + if (one_qsfp_read(ppd, dd->hfi1_id, 6, + &qsfp_interrupt_status[0], 16) != 16) { dd_dev_info(dd, - "%s: Failed to read status of QSFP module\n", - __func__); + "%s: Failed to read status of QSFP module\n", + __func__); } else { unsigned long flags; - u8 data_status; + handle_qsfp_error_conditions( + ppd, qsfp_interrupt_status); spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.check_interrupt_flags = 0; spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, - flags); - - if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1) - != 1) { - dd_dev_info(dd, - "%s: Failed to read status of QSFP module\n", - __func__); - } - if (!(data_status & QSFP_DATA_NOT_READY)) { - do_pre_lni_host_behaviors(ppd); - start_link(ppd); - } else - handle_qsfp_error_conditions(ppd, - qsfp_interrupt_status); + flags); } } } -void init_qsfp(struct hfi1_pportdata *ppd) +static void init_qsfp_int(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd = ppd->dd; - u64 qsfp_mask; + struct hfi1_pportdata *ppd = dd->pport; + u64 qsfp_mask, cce_int_mask; + const int qsfp1_int_smask = QSFP1_INT % 64; + const int qsfp2_int_smask = QSFP2_INT % 64; - if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { - ppd->driver_link_ready = 1; - return; + /* + * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 + * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, + * therefore just one of QSFP1_INT/QSFP2_INT can be used to find + * the index of the appropriate CSR in the CCEIntMask CSR array + */ + cce_int_mask = read_csr(dd, CCE_INT_MASK + + (8 * (QSFP1_INT / 64))); + if (dd->hfi1_id) { + cce_int_mask &= ~((u64)1 << qsfp1_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), + cce_int_mask); + } else { + cce_int_mask &= ~((u64)1 << qsfp2_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), + cce_int_mask); } - ppd->qsfp_info.ppd = ppd; - INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); - qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); /* Clear current status to avoid spurious interrupts */ - write_csr(dd, - dd->hfi1_id ? - ASIC_QSFP2_CLEAR : - ASIC_QSFP1_CLEAR, - qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, + qsfp_mask); + + set_qsfp_int_n(ppd, 0); /* Handle active low nature of INT_N and MODPRST_N pins */ if (qsfp_mod_present(ppd)) @@ -9070,29 +9349,6 @@ void init_qsfp(struct hfi1_pportdata *ppd) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, qsfp_mask); - - /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */ - qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N; - write_csr(dd, - dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, - qsfp_mask); - - if (qsfp_mod_present(ppd)) { - msleep(3000); - reset_qsfp(ppd); - - /* Check for QSFP interrupt after t_init (SFF 8679) - * + extra - */ - msleep(3000); - if (!ppd->qsfp_info.qsfp_interrupt_functional) { - if (do_qsfp_intr_fallback(ppd) < 0) - dd_dev_info(dd, - "%s: QSFP fallback failed\n", - __func__); - ppd->driver_link_ready = 1; - } - } } /* @@ -9100,6 +9356,10 @@ void init_qsfp(struct hfi1_pportdata *ppd) */ static void init_lcb(struct hfi1_devdata *dd) { + /* simulator does not correctly handle LCB cclk loopback, skip */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return; + /* the DC has been reset earlier in the driver load */ /* set LCB for cclk loopback on the port */ @@ -9128,8 +9388,6 @@ int bringup_serdes(struct hfi1_pportdata *ppd) ppd->guid = guid; } - /* the link defaults to enabled */ - ppd->link_enabled = 1; /* Set linkinit_reason on power up per OPA spec */ ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP; @@ -9142,6 +9400,12 @@ int bringup_serdes(struct hfi1_pportdata *ppd) return ret; } + /* tune the SERDES to a ballpark setting for + * optimal signal and bit error rate + * Needs to be done before starting the link + */ + tune_serdes(ppd); + return start_link(ppd); } @@ -9159,8 +9423,10 @@ void hfi1_quiet_serdes(struct hfi1_pportdata *ppd) ppd->driver_link_ready = 0; ppd->link_enabled = 0; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0, - OPA_LINKDOWN_REASON_SMA_DISABLED); + OPA_LINKDOWN_REASON_SMA_DISABLED); set_link_state(ppd, HLS_DN_OFFLINE); /* disable the port */ @@ -9174,14 +9440,14 @@ static inline int init_cpu_counters(struct hfi1_devdata *dd) ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) { - ppd->ibport_data.rc_acks = NULL; - ppd->ibport_data.rc_qacks = NULL; - ppd->ibport_data.rc_acks = alloc_percpu(u64); - ppd->ibport_data.rc_qacks = alloc_percpu(u64); - ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64); - if ((ppd->ibport_data.rc_acks == NULL) || - (ppd->ibport_data.rc_delayed_comp == NULL) || - (ppd->ibport_data.rc_qacks == NULL)) + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!ppd->ibport_data.rvp.rc_acks || + !ppd->ibport_data.rvp.rc_delayed_comp || + !ppd->ibport_data.rvp.rc_qacks) return -ENOMEM; } @@ -9216,8 +9482,8 @@ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, pa = 0; } else if (type > PT_INVALID) { dd_dev_err(dd, - "unexpected receive array type %u for index %u, not handled\n", - type, index); + "unexpected receive array type %u for index %u, not handled\n", + type, index); goto done; } @@ -9432,12 +9698,15 @@ static void set_send_length(struct hfi1_pportdata *ppd) /* all kernel receive contexts have the same hdrqentsize */ for (i = 0; i < ppd->vls_supported; i++) { sc_set_cr_threshold(dd->vld[i].sc, - sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu, - dd->rcd[0]->rcvhdrqentsize)); + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, + dd->rcd[0]-> + rcvhdrqentsize)); } sc_set_cr_threshold(dd->vld[15].sc, - sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu, - dd->rcd[0]->rcvhdrqentsize)); + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); /* Adjust maximum MTU for the port in DC */ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : @@ -9463,7 +9732,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) - << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)| + << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1); @@ -9498,8 +9767,8 @@ static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) break; if (time_after(jiffies, timeout)) { dd_dev_err(dd, - "timeout waiting for phy link state 0x%x, current state is 0x%x\n", - state, curr_state); + "timeout waiting for phy link state 0x%x, current state is 0x%x\n", + state, curr_state); return -ETIMEDOUT; } usleep_range(1950, 2050); /* sleep 2ms-ish */ @@ -9542,17 +9811,18 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) if (do_transition) { ret = set_physical_link_state(dd, - PLS_OFFLINE | (rem_reason << 8)); + (rem_reason << 8) | PLS_OFFLINE); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, - "Failed to transition to Offline link state, return %d\n", - ret); + "Failed to transition to Offline link state, return %d\n", + ret); return -EINVAL; } - if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE) + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) ppd->offline_disabled_reason = - OPA_LINKDOWN_REASON_TRANSIENT; + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); } if (do_wait) { @@ -9573,6 +9843,22 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + if (ppd->port_type == PORT_TYPE_QSFP && + ppd->qsfp_info.limiting_active && + qsfp_mod_present(ppd)) { + int ret; + + ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT); + if (ret == 0) { + set_qsfp_tx(ppd, 0); + release_chip_resource(dd, qsfp_resource(dd)); + } else { + /* not fatal, but should warn */ + dd_dev_err(dd, + "Unable to acquire lock to turn off QSFP TX\n"); + } + } + /* * The LNI has a mandatory wait time after the physical state * moves to Offline.Quiet. The wait time may be different @@ -9585,7 +9871,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) ret = wait_fm_ready(dd, 7000); if (ret) { dd_dev_err(dd, - "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); + "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); /* state is really offline, so make it so */ ppd->host_link_state = HLS_DN_OFFLINE; return ret; @@ -9608,8 +9894,8 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) read_last_local_state(dd, &last_local_state); read_last_remote_state(dd, &last_remote_state); dd_dev_err(dd, - "LNI failure last states: local 0x%08x, remote 0x%08x\n", - last_local_state, last_remote_state); + "LNI failure last states: local 0x%08x, remote 0x%08x\n", + last_local_state, last_remote_state); } /* the active link width (downgrade) is 0 on link down */ @@ -9757,14 +10043,14 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) state = dd->link_default; /* interpret poll -> poll as a link bounce */ - poll_bounce = ppd->host_link_state == HLS_DN_POLL - && state == HLS_DN_POLL; + poll_bounce = ppd->host_link_state == HLS_DN_POLL && + state == HLS_DN_POLL; dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__, - link_state_name(ppd->host_link_state), - link_state_name(orig_new_state), - poll_bounce ? "(bounce) " : "", - link_state_reason_name(ppd, state)); + link_state_name(ppd->host_link_state), + link_state_name(orig_new_state), + poll_bounce ? "(bounce) " : "", + link_state_reason_name(ppd, state)); was_up = !!(ppd->host_link_state & HLS_UP); @@ -9785,8 +10071,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) switch (state) { case HLS_UP_INIT: - if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup - || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { + if (ppd->host_link_state == HLS_DN_POLL && + (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { /* * Quick link up jumps from polling to here. * @@ -9794,7 +10080,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) * simulator jumps from polling to link up. * Accept that here. */ - /* OK */; + /* OK */ } else if (ppd->host_link_state != HLS_GOING_UP) { goto unexpected; } @@ -9805,8 +10091,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) /* logical state didn't change, stay at going_up */ ppd->host_link_state = HLS_GOING_UP; dd_dev_err(dd, - "%s: logical state did not change to INIT\n", - __func__); + "%s: logical state did not change to INIT\n", + __func__); } else { /* clear old transient LINKINIT_REASON code */ if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) @@ -9830,8 +10116,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) /* logical state didn't change, stay at init */ ppd->host_link_state = HLS_UP_INIT; dd_dev_err(dd, - "%s: logical state did not change to ARMED\n", - __func__); + "%s: logical state did not change to ARMED\n", + __func__); } /* * The simulator does not currently implement SMA messages, @@ -9852,15 +10138,14 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) /* logical state didn't change, stay at armed */ ppd->host_link_state = HLS_UP_ARMED; dd_dev_err(dd, - "%s: logical state did not change to ACTIVE\n", - __func__); + "%s: logical state did not change to ACTIVE\n", + __func__); } else { - /* tell all engines to go running */ sdma_all_running(dd); /* Signal the IB layer that the port has went active */ - event.device = &dd->verbs_dev.ibdev; + event.device = &dd->verbs_dev.rdi.ibdev; event.element.port_num = ppd->port; event.event = IB_EVENT_PORT_ACTIVE; } @@ -9887,6 +10172,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) ppd->link_enabled = 1; } + set_all_slowpath(ppd->dd); ret = set_local_link_attributes(ppd); if (ret) break; @@ -9901,12 +10187,13 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) ret1 = set_physical_link_state(dd, PLS_POLLING); if (ret1 != HCMD_SUCCESS) { dd_dev_err(dd, - "Failed to transition to Polling link state, return 0x%x\n", - ret1); + "Failed to transition to Polling link state, return 0x%x\n", + ret1); ret = -EINVAL; } } - ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); /* * If an error occurred above, go back to offline. The * caller may reschedule another attempt. @@ -9931,8 +10218,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) ret1 = set_physical_link_state(dd, PLS_DISABLED); if (ret1 != HCMD_SUCCESS) { dd_dev_err(dd, - "Failed to transition to Disabled link state, return 0x%x\n", - ret1); + "Failed to transition to Disabled link state, return 0x%x\n", + ret1); ret = -EINVAL; break; } @@ -9960,8 +10247,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) ret1 = set_physical_link_state(dd, PLS_LINKUP); if (ret1 != HCMD_SUCCESS) { dd_dev_err(dd, - "Failed to transition to link up state, return 0x%x\n", - ret1); + "Failed to transition to link up state, return 0x%x\n", + ret1); ret = -EINVAL; break; } @@ -9972,7 +10259,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ default: dd_dev_info(dd, "%s: state 0x%x: not supported\n", - __func__, state); + __func__, state); ret = -EINVAL; break; } @@ -9992,8 +10279,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) unexpected: dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n", - __func__, link_state_name(ppd->host_link_state), - link_state_name(state)); + __func__, link_state_name(ppd->host_link_state), + link_state_name(state)); ret = -EINVAL; done: @@ -10019,7 +10306,7 @@ int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) * The VL Arbitrator high limit is sent in units of 4k * bytes, while HFI stores it in units of 64 bytes. */ - val *= 4096/64; + val *= 4096 / 64; reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK) << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT; write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg); @@ -10034,12 +10321,6 @@ int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) ppd->vls_operational = val; if (!ppd->port) ret = -EINVAL; - else - ret = sdma_map_init( - ppd->dd, - ppd->port - 1, - val, - NULL); } break; /* @@ -10087,8 +10368,8 @@ int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) default: if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) dd_dev_info(ppd->dd, - "%s: which %s, val 0x%x: not implemented\n", - __func__, ib_cfg_name(which), val); + "%s: which %s, val 0x%x: not implemented\n", + __func__, ib_cfg_name(which), val); break; } return ret; @@ -10155,6 +10436,7 @@ static int vl_arb_match_cache(struct vl_arb_cache *cache, { return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); } + /* end functions related to vl arbitration table caching */ static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target, @@ -10242,7 +10524,7 @@ static int get_buffer_control(struct hfi1_devdata *dd, /* OPA and HFI have a 1-1 mapping */ for (i = 0; i < TXE_NUM_DATA_VL; i++) - read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]); + read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]); /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */ read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]); @@ -10296,41 +10578,41 @@ static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems, static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) { write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, - DC_SC_VL_VAL(15_0, - 0, dp->vlnt[0] & 0xf, - 1, dp->vlnt[1] & 0xf, - 2, dp->vlnt[2] & 0xf, - 3, dp->vlnt[3] & 0xf, - 4, dp->vlnt[4] & 0xf, - 5, dp->vlnt[5] & 0xf, - 6, dp->vlnt[6] & 0xf, - 7, dp->vlnt[7] & 0xf, - 8, dp->vlnt[8] & 0xf, - 9, dp->vlnt[9] & 0xf, - 10, dp->vlnt[10] & 0xf, - 11, dp->vlnt[11] & 0xf, - 12, dp->vlnt[12] & 0xf, - 13, dp->vlnt[13] & 0xf, - 14, dp->vlnt[14] & 0xf, - 15, dp->vlnt[15] & 0xf)); + DC_SC_VL_VAL(15_0, + 0, dp->vlnt[0] & 0xf, + 1, dp->vlnt[1] & 0xf, + 2, dp->vlnt[2] & 0xf, + 3, dp->vlnt[3] & 0xf, + 4, dp->vlnt[4] & 0xf, + 5, dp->vlnt[5] & 0xf, + 6, dp->vlnt[6] & 0xf, + 7, dp->vlnt[7] & 0xf, + 8, dp->vlnt[8] & 0xf, + 9, dp->vlnt[9] & 0xf, + 10, dp->vlnt[10] & 0xf, + 11, dp->vlnt[11] & 0xf, + 12, dp->vlnt[12] & 0xf, + 13, dp->vlnt[13] & 0xf, + 14, dp->vlnt[14] & 0xf, + 15, dp->vlnt[15] & 0xf)); write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, - DC_SC_VL_VAL(31_16, - 16, dp->vlnt[16] & 0xf, - 17, dp->vlnt[17] & 0xf, - 18, dp->vlnt[18] & 0xf, - 19, dp->vlnt[19] & 0xf, - 20, dp->vlnt[20] & 0xf, - 21, dp->vlnt[21] & 0xf, - 22, dp->vlnt[22] & 0xf, - 23, dp->vlnt[23] & 0xf, - 24, dp->vlnt[24] & 0xf, - 25, dp->vlnt[25] & 0xf, - 26, dp->vlnt[26] & 0xf, - 27, dp->vlnt[27] & 0xf, - 28, dp->vlnt[28] & 0xf, - 29, dp->vlnt[29] & 0xf, - 30, dp->vlnt[30] & 0xf, - 31, dp->vlnt[31] & 0xf)); + DC_SC_VL_VAL(31_16, + 16, dp->vlnt[16] & 0xf, + 17, dp->vlnt[17] & 0xf, + 18, dp->vlnt[18] & 0xf, + 19, dp->vlnt[19] & 0xf, + 20, dp->vlnt[20] & 0xf, + 21, dp->vlnt[21] & 0xf, + 22, dp->vlnt[22] & 0xf, + 23, dp->vlnt[23] & 0xf, + 24, dp->vlnt[24] & 0xf, + 25, dp->vlnt[25] & 0xf, + 26, dp->vlnt[26] & 0xf, + 27, dp->vlnt[27] & 0xf, + 28, dp->vlnt[28] & 0xf, + 29, dp->vlnt[29] & 0xf, + 30, dp->vlnt[30] & 0xf, + 31, dp->vlnt[31] & 0xf)); } static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, @@ -10338,7 +10620,7 @@ static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, { if (limit != 0) dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n", - what, (int)limit, idx); + what, (int)limit, idx); } /* change only the shared limit portion of SendCmGLobalCredit */ @@ -10416,14 +10698,14 @@ static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, } dd_dev_err(dd, - "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", - which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); + "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", + which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); /* * If this occurs, it is likely there was a credit loss on the link. * The only recovery from that is a link bounce. */ dd_dev_err(dd, - "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); + "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); } /* @@ -10450,13 +10732,15 @@ static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, * raise = if the new limit is higher than the current value (may be changed * earlier in the algorithm), set the new limit to the new value */ -static int set_buffer_control(struct hfi1_devdata *dd, - struct buffer_control *new_bc) +int set_buffer_control(struct hfi1_pportdata *ppd, + struct buffer_control *new_bc) { + struct hfi1_devdata *dd = ppd->dd; u64 changing_mask, ld_mask, stat_mask; int change_count; int i, use_all_mask; int this_shared_changing; + int vl_count = 0, ret; /* * A0: add the variable any_shared_limit_changing below and in the * algorithm above. If removing A0 support, it can be removed. @@ -10481,7 +10765,6 @@ static int set_buffer_control(struct hfi1_devdata *dd, #define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) #define NUM_USABLE_VLS 16 /* look at VL15 and less */ - /* find the new total credits, do sanity check on unused VLs */ for (i = 0; i < OPA_MAX_VLS; i++) { if (valid_vl(i)) { @@ -10489,9 +10772,9 @@ static int set_buffer_control(struct hfi1_devdata *dd, continue; } nonzero_msg(dd, i, "dedicated", - be16_to_cpu(new_bc->vl[i].dedicated)); + be16_to_cpu(new_bc->vl[i].dedicated)); nonzero_msg(dd, i, "shared", - be16_to_cpu(new_bc->vl[i].shared)); + be16_to_cpu(new_bc->vl[i].shared)); new_bc->vl[i].dedicated = 0; new_bc->vl[i].shared = 0; } @@ -10505,8 +10788,10 @@ static int set_buffer_control(struct hfi1_devdata *dd, */ memset(changing, 0, sizeof(changing)); memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); - /* NOTE: Assumes that the individual VL bits are adjacent and in - increasing order */ + /* + * NOTE: Assumes that the individual VL bits are adjacent and in + * increasing order + */ stat_mask = SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK; changing_mask = 0; @@ -10520,8 +10805,8 @@ static int set_buffer_control(struct hfi1_devdata *dd, != cur_bc.vl[i].shared; if (this_shared_changing) any_shared_limit_changing = 1; - if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated - || this_shared_changing) { + if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated || + this_shared_changing) { changing[i] = 1; changing_mask |= stat_mask; change_count++; @@ -10560,7 +10845,7 @@ static int set_buffer_control(struct hfi1_devdata *dd, } wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask, - "shared"); + "shared"); if (change_count > 0) { for (i = 0; i < NUM_USABLE_VLS; i++) { @@ -10569,7 +10854,8 @@ static int set_buffer_control(struct hfi1_devdata *dd, if (lowering_dedicated[i]) { set_vl_dedicated(dd, i, - be16_to_cpu(new_bc->vl[i].dedicated)); + be16_to_cpu(new_bc-> + vl[i].dedicated)); cur_bc.vl[i].dedicated = new_bc->vl[i].dedicated; } @@ -10585,7 +10871,8 @@ static int set_buffer_control(struct hfi1_devdata *dd, if (be16_to_cpu(new_bc->vl[i].dedicated) > be16_to_cpu(cur_bc.vl[i].dedicated)) set_vl_dedicated(dd, i, - be16_to_cpu(new_bc->vl[i].dedicated)); + be16_to_cpu(new_bc-> + vl[i].dedicated)); } } @@ -10601,13 +10888,35 @@ static int set_buffer_control(struct hfi1_devdata *dd, /* finally raise the global shared */ if (be16_to_cpu(new_bc->overall_shared_limit) > - be16_to_cpu(cur_bc.overall_shared_limit)) + be16_to_cpu(cur_bc.overall_shared_limit)) set_global_shared(dd, - be16_to_cpu(new_bc->overall_shared_limit)); + be16_to_cpu(new_bc->overall_shared_limit)); /* bracket the credit change with a total adjustment */ if (new_total < cur_total) set_global_limit(dd, new_total); + + /* + * Determine the actual number of operational VLS using the number of + * dedicated and shared credits for each VL. + */ + if (change_count > 0) { + for (i = 0; i < TXE_NUM_DATA_VL; i++) + if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 || + be16_to_cpu(new_bc->vl[i].shared) > 0) + vl_count++; + ppd->actual_vls_operational = vl_count; + ret = sdma_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, + NULL); + if (ret == 0) + ret = pio_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, NULL); + if (ret) + return ret; + } return 0; } @@ -10699,7 +11008,7 @@ int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t) VL_ARB_LOW_PRIO_TABLE_SIZE, t); break; case FM_TBL_BUFFER_CONTROL: - ret = set_buffer_control(ppd->dd, t); + ret = set_buffer_control(ppd, t); break; case FM_TBL_SC2VLNT: set_sc2vlnt(ppd->dd, t); @@ -10849,10 +11158,13 @@ static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts) } rcd->rcvavail_timeout = timeout; - /* timeout cannot be larger than rcv_intr_timeout_csr which has already - been verified to be in range */ + /* + * timeout cannot be larger than rcv_intr_timeout_csr which has already + * been verified to be in range + */ write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT, - (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); + (u64)timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); } void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, @@ -10918,16 +11230,16 @@ u32 hdrqempty(struct hfi1_ctxtdata *rcd) static u32 encoded_size(u32 size) { switch (size) { - case 4*1024: return 0x1; - case 8*1024: return 0x2; - case 16*1024: return 0x3; - case 32*1024: return 0x4; - case 64*1024: return 0x5; - case 128*1024: return 0x6; - case 256*1024: return 0x7; - case 512*1024: return 0x8; - case 1*1024*1024: return 0x9; - case 2*1024*1024: return 0xa; + case 4 * 1024: return 0x1; + case 8 * 1024: return 0x2; + case 16 * 1024: return 0x3; + case 32 * 1024: return 0x4; + case 64 * 1024: return 0x5; + case 128 * 1024: return 0x6; + case 256 * 1024: return 0x7; + case 512 * 1024: return 0x8; + case 1 * 1024 * 1024: return 0x9; + case 2 * 1024 * 1024: return 0xa; } return 0x1; /* if invalid, go with the minimum size */ } @@ -10946,8 +11258,8 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); /* if the context already enabled, don't do the extra steps */ - if ((op & HFI1_RCVCTRL_CTXT_ENB) - && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { + if ((op & HFI1_RCVCTRL_CTXT_ENB) && + !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { /* reset the tail and hdr addresses, and sequence count */ write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, rcd->rcvhdrq_phys); @@ -11021,6 +11333,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) if (dd->rcvhdrtail_dummy_physaddr) { write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, dd->rcvhdrtail_dummy_physaddr); + /* Enabling RcvCtxtCtrl.TailUpd is intentional. */ rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; } @@ -11032,15 +11345,20 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; - if (op & HFI1_RCVCTRL_TAILUPD_DIS) - rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_DIS) { + /* See comment on RcvCtxtCtrl.TailUpd above */ + if (!(op & HFI1_RCVCTRL_CTXT_DIS)) + rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } if (op & HFI1_RCVCTRL_TIDFLOW_ENB) rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; if (op & HFI1_RCVCTRL_TIDFLOW_DIS) rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { - /* In one-packet-per-eager mode, the size comes from - the RcvArray entry. */ + /* + * In one-packet-per-eager mode, the size comes from + * the RcvArray entry. + */ rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; } @@ -11059,19 +11377,19 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl); /* work around sticky RcvCtxtStatus.BlockedRHQFull */ - if (did_enable - && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { + if (did_enable && + (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); if (reg != 0) { dd_dev_info(dd, "ctxt %d status %lld (blocked)\n", - ctxt, reg); + ctxt, reg); read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10); write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00); read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n", - ctxt, reg, reg == 0 ? "not" : "still"); + ctxt, reg, reg == 0 ? "not" : "still"); } } @@ -11082,7 +11400,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) */ /* set interrupt timeout */ write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT, - (u64)rcd->rcvavail_timeout << + (u64)rcd->rcvavail_timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */ @@ -11100,28 +11418,19 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) dd->rcvhdrtail_dummy_physaddr); } -u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, - u64 **cntrp) +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp) { int ret; u64 val = 0; if (namep) { ret = dd->cntrnameslen; - if (pos != 0) { - dd_dev_err(dd, "read_cntrs does not support indexing"); - return 0; - } *namep = dd->cntrnames; } else { const struct cntr_entry *entry; int i, j; ret = (dd->ndevcntrs) * sizeof(u64); - if (pos != 0) { - dd_dev_err(dd, "read_cntrs does not support indexing"); - return 0; - } /* Get the start of the block of counters */ *cntrp = dd->cntrs; @@ -11150,6 +11459,20 @@ u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, dd->cntrs[entry->offset + j] = val; } + } else if (entry->flags & CNTR_SDMA) { + hfi1_cdbg(CNTR, + "\t Per SDMA Engine\n"); + for (j = 0; j < dd->chip_sdma_engines; + j++) { + val = + entry->rw_cntr(entry, dd, j, + CNTR_MODE_R, 0); + hfi1_cdbg(CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } } else { val = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, @@ -11166,30 +11489,19 @@ u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, /* * Used by sysfs to create files for hfi stats to read */ -u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port, - char **namep, u64 **cntrp) +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp) { int ret; u64 val = 0; if (namep) { - ret = dd->portcntrnameslen; - if (pos != 0) { - dd_dev_err(dd, "index not supported"); - return 0; - } - *namep = dd->portcntrnames; + ret = ppd->dd->portcntrnameslen; + *namep = ppd->dd->portcntrnames; } else { const struct cntr_entry *entry; - struct hfi1_pportdata *ppd; int i, j; - ret = (dd->nportcntrs) * sizeof(u64); - if (pos != 0) { - dd_dev_err(dd, "indexing not supported"); - return 0; - } - ppd = (struct hfi1_pportdata *)(dd + 1 + port); + ret = ppd->dd->nportcntrs * sizeof(u64); *cntrp = ppd->cntrs; for (i = 0; i < PORT_CNTR_LAST; i++) { @@ -11238,14 +11550,14 @@ static void free_cntrs(struct hfi1_devdata *dd) for (i = 0; i < dd->num_pports; i++, ppd++) { kfree(ppd->cntrs); kfree(ppd->scntrs); - free_percpu(ppd->ibport_data.rc_acks); - free_percpu(ppd->ibport_data.rc_qacks); - free_percpu(ppd->ibport_data.rc_delayed_comp); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); ppd->cntrs = NULL; ppd->scntrs = NULL; - ppd->ibport_data.rc_acks = NULL; - ppd->ibport_data.rc_qacks = NULL; - ppd->ibport_data.rc_delayed_comp = NULL; + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_delayed_comp = NULL; } kfree(dd->portcntrnames); dd->portcntrnames = NULL; @@ -11513,11 +11825,13 @@ mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); #define C_MAX_NAME 13 /* 12 chars + one for /0 */ static int init_cntrs(struct hfi1_devdata *dd) { - int i, rcv_ctxts, index, j; + int i, rcv_ctxts, j; size_t sz; char *p; char name[C_MAX_NAME]; struct hfi1_pportdata *ppd; + const char *bit_type_32 = ",32"; + const int bit_type_32_sz = strlen(bit_type_32); /* set up the stats timer; the add_timer is done at the end */ setup_timer(&dd->synth_stats_timer, update_synth_timer, @@ -11530,49 +11844,57 @@ static int init_cntrs(struct hfi1_devdata *dd) /* size names and determine how many we have*/ dd->ndevcntrs = 0; sz = 0; - index = 0; for (i = 0; i < DEV_CNTR_LAST; i++) { - hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name); if (dev_cntrs[i].flags & CNTR_DISABLED) { hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name); continue; } if (dev_cntrs[i].flags & CNTR_VL) { - hfi1_dbg_early("\tProcessing VL cntr\n"); - dev_cntrs[i].offset = index; + dev_cntrs[i].offset = dd->ndevcntrs; for (j = 0; j < C_VL_COUNT; j++) { - memset(name, '\0', C_MAX_NAME); snprintf(name, C_MAX_NAME, "%s%d", - dev_cntrs[i].name, - vl_from_idx(j)); + dev_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; sz++; - hfi1_dbg_early("\t\t%s\n", name); dd->ndevcntrs++; - index++; } } else { - /* +1 for newline */ + /* +1 for newline. */ sz += strlen(dev_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + dev_cntrs[i].offset = dd->ndevcntrs; dd->ndevcntrs++; - dev_cntrs[i].offset = index; - index++; - hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name); } } /* allocate space for the counter values */ - dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL); + dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); if (!dd->cntrs) goto bail; - dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL); + dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); if (!dd->scntrs) goto bail; - /* allocate space for the counter names */ dd->cntrnameslen = sz; dd->cntrnames = kmalloc(sz, GFP_KERNEL); @@ -11580,27 +11902,51 @@ static int init_cntrs(struct hfi1_devdata *dd) goto bail; /* fill in the names */ - for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) { + for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) { if (dev_cntrs[i].flags & CNTR_DISABLED) { /* Nothing */ - } else { - if (dev_cntrs[i].flags & CNTR_VL) { - for (j = 0; j < C_VL_COUNT; j++) { - memset(name, '\0', C_MAX_NAME); - snprintf(name, C_MAX_NAME, "%s%d", - dev_cntrs[i].name, - vl_from_idx(j)); - memcpy(p, name, strlen(name)); - p += strlen(name); - *p++ = '\n'; + } else if (dev_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; } - } else { - memcpy(p, dev_cntrs[i].name, - strlen(dev_cntrs[i].name)); - p += strlen(dev_cntrs[i].name); + + *p++ = '\n'; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + *p++ = '\n'; } - index++; + } else { + memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name)); + p += strlen(dev_cntrs[i].name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; } } @@ -11623,31 +11969,31 @@ static int init_cntrs(struct hfi1_devdata *dd) sz = 0; dd->nportcntrs = 0; for (i = 0; i < PORT_CNTR_LAST; i++) { - hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name); if (port_cntrs[i].flags & CNTR_DISABLED) { hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); continue; } if (port_cntrs[i].flags & CNTR_VL) { - hfi1_dbg_early("\tProcessing VL cntr\n"); port_cntrs[i].offset = dd->nportcntrs; for (j = 0; j < C_VL_COUNT; j++) { - memset(name, '\0', C_MAX_NAME); snprintf(name, C_MAX_NAME, "%s%d", - port_cntrs[i].name, - vl_from_idx(j)); + port_cntrs[i].name, vl_from_idx(j)); sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; sz++; - hfi1_dbg_early("\t\t%s\n", name); dd->nportcntrs++; } } else { - /* +1 for newline */ + /* +1 for newline */ sz += strlen(port_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; port_cntrs[i].offset = dd->nportcntrs; dd->nportcntrs++; - hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name); } } @@ -11664,18 +12010,30 @@ static int init_cntrs(struct hfi1_devdata *dd) if (port_cntrs[i].flags & CNTR_VL) { for (j = 0; j < C_VL_COUNT; j++) { - memset(name, '\0', C_MAX_NAME); snprintf(name, C_MAX_NAME, "%s%d", - port_cntrs[i].name, - vl_from_idx(j)); + port_cntrs[i].name, vl_from_idx(j)); memcpy(p, name, strlen(name)); p += strlen(name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + *p++ = '\n'; } } else { memcpy(p, port_cntrs[i].name, strlen(port_cntrs[i].name)); p += strlen(port_cntrs[i].name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + *p++ = '\n'; } } @@ -11703,14 +12061,13 @@ bail: return -ENOMEM; } - static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate) { switch (chip_lstate) { default: dd_dev_err(dd, - "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", - chip_lstate); + "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", + chip_lstate); /* fall through */ case LSTATE_DOWN: return IB_PORT_DOWN; @@ -11729,7 +12086,7 @@ u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) switch (chip_pstate & 0xf0) { default: dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n", - chip_pstate); + chip_pstate); /* fall through */ case PLS_DISABLED: return IB_PORTPHYSSTATE_DISABLED; @@ -11795,7 +12152,7 @@ u32 get_logical_state(struct hfi1_pportdata *ppd) new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd)); if (new_state != ppd->lstate) { dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", - opa_lstate_name(new_state), new_state); + opa_lstate_name(new_state), new_state); ppd->lstate = new_state; } /* @@ -11854,18 +12211,17 @@ static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) { - static u32 remembered_state = 0xff; u32 pstate; u32 ib_pstate; pstate = read_physical_state(ppd->dd); ib_pstate = chip_to_opa_pstate(ppd->dd, pstate); - if (remembered_state != ib_pstate) { + if (ppd->last_pstate != ib_pstate) { dd_dev_info(ppd->dd, - "%s: physical state changed to %s (0x%x), phy 0x%x\n", - __func__, opa_pstate_name(ib_pstate), ib_pstate, - pstate); - remembered_state = ib_pstate; + "%s: physical state changed to %s (0x%x), phy 0x%x\n", + __func__, opa_pstate_name(ib_pstate), ib_pstate, + pstate); + ppd->last_pstate = ib_pstate; } return ib_pstate; } @@ -11909,7 +12265,7 @@ u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, int hfi1_init_ctxt(struct send_context *sc) { - if (sc != NULL) { + if (sc) { struct hfi1_devdata *dd = sc->dd; u64 reg; u8 set = (sc->type == SC_USER ? @@ -11966,34 +12322,14 @@ void set_intr_state(struct hfi1_devdata *dd, u32 enable) * In HFI, the mask needs to be 1 to allow interrupts. */ if (enable) { - u64 cce_int_mask; - const int qsfp1_int_smask = QSFP1_INT % 64; - const int qsfp2_int_smask = QSFP2_INT % 64; - /* enable all interrupts */ for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0); + write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0); - /* - * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 - * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, - * therefore just one of QSFP1_INT/QSFP2_INT can be used to find - * the index of the appropriate CSR in the CCEIntMask CSR array - */ - cce_int_mask = read_csr(dd, CCE_INT_MASK + - (8*(QSFP1_INT/64))); - if (dd->hfi1_id) { - cce_int_mask &= ~((u64)1 << qsfp1_int_smask); - write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)), - cce_int_mask); - } else { - cce_int_mask &= ~((u64)1 << qsfp2_int_smask); - write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)), - cce_int_mask); - } + init_qsfp_int(dd); } else { for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8*i), 0ull); + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); } } @@ -12005,7 +12341,7 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) int i; for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0); + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0); write_csr(dd, CCE_ERR_CLEAR, ~(u64)0); write_csr(dd, MISC_ERR_CLEAR, ~(u64)0); @@ -12040,10 +12376,9 @@ static void clean_up_interrupts(struct hfi1_devdata *dd) struct hfi1_msix_entry *me = dd->msix_entries; for (i = 0; i < dd->num_msix_entries; i++, me++) { - if (me->arg == NULL) /* => no irq, no affinity */ - break; - irq_set_affinity_hint(dd->msix_entries[i].msix.vector, - NULL); + if (!me->arg) /* => no irq, no affinity */ + continue; + hfi1_put_irq_affinity(dd, &dd->msix_entries[i]); free_irq(me->msix.vector, me->arg); } } else { @@ -12064,8 +12399,6 @@ static void clean_up_interrupts(struct hfi1_devdata *dd) } /* clean structures */ - for (i = 0; i < dd->num_msix_entries; i++) - free_cpumask_var(dd->msix_entries[i].mask); kfree(dd->msix_entries); dd->msix_entries = NULL; dd->num_msix_entries = 0; @@ -12088,10 +12421,10 @@ static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) /* direct the chip source to the given MSI-X interrupt */ m = isrc / 8; n = isrc % 8; - reg = read_csr(dd, CCE_INT_MAP + (8*m)); - reg &= ~((u64)0xff << (8*n)); - reg |= ((u64)msix_intr & 0xff) << (8*n); - write_csr(dd, CCE_INT_MAP + (8*m), reg); + reg = read_csr(dd, CCE_INT_MAP + (8 * m)); + reg &= ~((u64)0xff << (8 * n)); + reg |= ((u64)msix_intr & 0xff) << (8 * n); + write_csr(dd, CCE_INT_MAP + (8 * m), reg); } static void remap_sdma_interrupts(struct hfi1_devdata *dd, @@ -12104,12 +12437,12 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd, * SDMAProgress * SDMAIdle */ - remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine, - msix_intr); + remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); } static int request_intx_irq(struct hfi1_devdata *dd) @@ -12119,10 +12452,10 @@ static int request_intx_irq(struct hfi1_devdata *dd) snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d", dd->unit); ret = request_irq(dd->pcidev->irq, general_interrupt, - IRQF_SHARED, dd->intx_name, dd); + IRQF_SHARED, dd->intx_name, dd); if (ret) dd_dev_err(dd, "unable to request INTx interrupt, err %d\n", - ret); + ret); else dd->requested_intx_irq = 1; return ret; @@ -12130,70 +12463,20 @@ static int request_intx_irq(struct hfi1_devdata *dd) static int request_msix_irqs(struct hfi1_devdata *dd) { - const struct cpumask *local_mask; - cpumask_var_t def, rcv; - bool def_ret, rcv_ret; int first_general, last_general; int first_sdma, last_sdma; int first_rx, last_rx; - int first_cpu, curr_cpu; - int rcv_cpu, sdma_cpu; - int i, ret = 0, possible; - int ht; + int i, ret = 0; /* calculate the ranges we are going to use */ first_general = 0; - first_sdma = last_general = first_general + 1; - first_rx = last_sdma = first_sdma + dd->num_sdma; + last_general = first_general + 1; + first_sdma = last_general; + last_sdma = first_sdma + dd->num_sdma; + first_rx = last_sdma; last_rx = first_rx + dd->n_krcv_queues; /* - * Interrupt affinity. - * - * non-rcv avail gets a default mask that - * starts as possible cpus with threads reset - * and each rcv avail reset. - * - * rcv avail gets node relative 1 wrapping back - * to the node relative 1 as necessary. - * - */ - local_mask = cpumask_of_pcibus(dd->pcidev->bus); - /* if first cpu is invalid, use NUMA 0 */ - if (cpumask_first(local_mask) >= nr_cpu_ids) - local_mask = topology_core_cpumask(0); - - def_ret = zalloc_cpumask_var(&def, GFP_KERNEL); - rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL); - if (!def_ret || !rcv_ret) - goto bail; - /* use local mask as default */ - cpumask_copy(def, local_mask); - possible = cpumask_weight(def); - /* disarm threads from default */ - ht = cpumask_weight( - topology_sibling_cpumask(cpumask_first(local_mask))); - for (i = possible/ht; i < possible; i++) - cpumask_clear_cpu(i, def); - /* def now has full cores on chosen node*/ - first_cpu = cpumask_first(def); - if (nr_cpu_ids >= first_cpu) - first_cpu++; - curr_cpu = first_cpu; - - /* One context is reserved as control context */ - for (i = first_cpu; i < dd->n_krcv_queues + first_cpu - 1; i++) { - cpumask_clear_cpu(curr_cpu, def); - cpumask_set_cpu(curr_cpu, rcv); - curr_cpu = cpumask_next(curr_cpu, def); - if (curr_cpu >= nr_cpu_ids) - break; - } - /* def mask has non-rcv, rcv has recv mask */ - rcv_cpu = cpumask_first(rcv); - sdma_cpu = cpumask_first(def); - - /* * Sanity check - the code expects all SDMA chip source * interrupts to be in the same CSR, starting at bit 0. Verify * that this is true by checking the bit location of the start. @@ -12218,6 +12501,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd) snprintf(me->name, sizeof(me->name), DRIVER_NAME "_%d", dd->unit); err_info = "general"; + me->type = IRQ_GENERAL; } else if (first_sdma <= i && i < last_sdma) { idx = i - first_sdma; sde = &dd->per_sdma[idx]; @@ -12227,6 +12511,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd) DRIVER_NAME "_%d sdma%d", dd->unit, idx); err_info = "sdma"; remap_sdma_interrupts(dd, idx, i); + me->type = IRQ_SDMA; } else if (first_rx <= i && i < last_rx) { idx = i - first_rx; rcd = dd->rcd[idx]; @@ -12237,9 +12522,9 @@ static int request_msix_irqs(struct hfi1_devdata *dd) * Set the interrupt register and mask for this * context's interrupt. */ - rcd->ireg = (IS_RCVAVAIL_START+idx) / 64; + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START+idx) % 64); + ((IS_RCVAVAIL_START + idx) % 64); handler = receive_context_interrupt; thread = receive_context_thread; arg = rcd; @@ -12247,25 +12532,27 @@ static int request_msix_irqs(struct hfi1_devdata *dd) DRIVER_NAME "_%d kctxt%d", dd->unit, idx); err_info = "receive context"; remap_intr(dd, IS_RCVAVAIL_START + idx, i); + me->type = IRQ_RCVCTXT; } else { /* not in our expected range - complain, then - ignore it */ + * ignore it + */ dd_dev_err(dd, - "Unexpected extra MSI-X interrupt %d\n", i); + "Unexpected extra MSI-X interrupt %d\n", i); continue; } /* no argument, no interrupt */ - if (arg == NULL) + if (!arg) continue; /* make sure the name is terminated */ - me->name[sizeof(me->name)-1] = 0; + me->name[sizeof(me->name) - 1] = 0; ret = request_threaded_irq(me->msix.vector, handler, thread, 0, - me->name, arg); + me->name, arg); if (ret) { dd_dev_err(dd, - "unable to allocate %s interrupt, vector %d, index %d, err %d\n", - err_info, me->msix.vector, idx, ret); + "unable to allocate %s interrupt, vector %d, index %d, err %d\n", + err_info, me->msix.vector, idx, ret); return ret; } /* @@ -12274,52 +12561,13 @@ static int request_msix_irqs(struct hfi1_devdata *dd) */ me->arg = arg; - if (!zalloc_cpumask_var( - &dd->msix_entries[i].mask, - GFP_KERNEL)) - goto bail; - if (handler == sdma_interrupt) { - dd_dev_info(dd, "sdma engine %d cpu %d\n", - sde->this_idx, sdma_cpu); - sde->cpu = sdma_cpu; - cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask); - sdma_cpu = cpumask_next(sdma_cpu, def); - if (sdma_cpu >= nr_cpu_ids) - sdma_cpu = cpumask_first(def); - } else if (handler == receive_context_interrupt) { - dd_dev_info(dd, "rcv ctxt %d cpu %d\n", rcd->ctxt, - (rcd->ctxt == HFI1_CTRL_CTXT) ? - cpumask_first(def) : rcv_cpu); - if (rcd->ctxt == HFI1_CTRL_CTXT) { - /* map to first default */ - cpumask_set_cpu(cpumask_first(def), - dd->msix_entries[i].mask); - } else { - cpumask_set_cpu(rcv_cpu, - dd->msix_entries[i].mask); - rcv_cpu = cpumask_next(rcv_cpu, rcv); - if (rcv_cpu >= nr_cpu_ids) - rcv_cpu = cpumask_first(rcv); - } - } else { - /* otherwise first def */ - dd_dev_info(dd, "%s cpu %d\n", - err_info, cpumask_first(def)); - cpumask_set_cpu( - cpumask_first(def), dd->msix_entries[i].mask); - } - irq_set_affinity_hint( - dd->msix_entries[i].msix.vector, - dd->msix_entries[i].mask); + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, + "unable to pin IRQ %d\n", ret); } -out: - free_cpumask_var(def); - free_cpumask_var(rcv); return ret; -bail: - ret = -ENOMEM; - goto out; } /* @@ -12336,7 +12584,7 @@ static void reset_interrupts(struct hfi1_devdata *dd) /* all chip interrupts map to MSI-X 0 */ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) - write_csr(dd, CCE_INT_MAP + (8*i), 0); + write_csr(dd, CCE_INT_MAP + (8 * i), 0); } static int set_up_interrupts(struct hfi1_devdata *dd) @@ -12445,7 +12693,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) */ num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1; else - num_kernel_contexts = num_online_nodes(); + num_kernel_contexts = num_online_nodes() + 1; num_kernel_contexts = max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts); /* @@ -12486,13 +12734,14 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; dd->first_user_ctxt = num_kernel_contexts; + dd->num_user_contexts = num_user_contexts; dd->freectxts = num_user_contexts; dd_dev_info(dd, - "rcv contexts: chip %d, used %d (kernel %d, user %d)\n", - (int)dd->chip_rcv_contexts, - (int)dd->num_rcv_contexts, - (int)dd->n_krcv_queues, - (int)dd->num_rcv_contexts - dd->n_krcv_queues); + "rcv contexts: chip %d, used %d (kernel %d, user %d)\n", + (int)dd->chip_rcv_contexts, + (int)dd->num_rcv_contexts, + (int)dd->n_krcv_queues, + (int)dd->num_rcv_contexts - dd->n_krcv_queues); /* * Receive array allocation: @@ -12518,8 +12767,8 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) / dd->rcv_entries.group_size; dd_dev_info(dd, - "RcvArray group count too high, change to %u\n", - dd->rcv_entries.ngroups); + "RcvArray group count too high, change to %u\n", + dd->rcv_entries.ngroups); dd->rcv_entries.nctxt_extra = 0; } /* @@ -12585,7 +12834,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) /* CceIntMap */ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) - write_csr(dd, CCE_INT_MAP+(8*i), 0); + write_csr(dd, CCE_INT_MAP + (8 * i), 0); /* SendCtxtCreditReturnAddr */ for (i = 0; i < dd->chip_send_contexts; i++) @@ -12593,8 +12842,10 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) /* PIO Send buffers */ /* SDMA Send buffers */ - /* These are not normally read, and (presently) have no method - to be read, so are not pre-initialized */ + /* + * These are not normally read, and (presently) have no method + * to be read, so are not pre-initialized + */ /* RcvHdrAddr */ /* RcvHdrTailAddr */ @@ -12603,13 +12854,13 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); for (j = 0; j < RXE_NUM_TID_FLOWS; j++) - write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0); + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0); } /* RcvArray */ for (i = 0; i < dd->chip_rcv_array_count; i++) - write_csr(dd, RCV_ARRAY + (8*i), - RCV_ARRAY_RT_WRITE_ENABLE_SMASK); + write_csr(dd, RCV_ARRAY + (8 * i), + RCV_ARRAY_RT_WRITE_ENABLE_SMASK); /* RcvQPMapTable */ for (i = 0; i < 32; i++) @@ -12641,8 +12892,8 @@ static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits, return; if (time_after(jiffies, timeout)) { dd_dev_err(dd, - "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", - status_bits, reg & status_bits); + "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", + status_bits, reg & status_bits); return; } udelay(1); @@ -12674,7 +12925,7 @@ static void reset_cce_csrs(struct hfi1_devdata *dd) for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) { write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0); write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i), - CCE_MSIX_TABLE_UPPER_RESETCSR); + CCE_MSIX_TABLE_UPPER_RESETCSR); } for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) { /* CCE_MSIX_PBA read-only */ @@ -12694,91 +12945,6 @@ static void reset_cce_csrs(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0); } -/* set ASIC CSRs to chip reset defaults */ -static void reset_asic_csrs(struct hfi1_devdata *dd) -{ - int i; - - /* - * If the HFIs are shared between separate nodes or VMs, - * then more will need to be done here. One idea is a module - * parameter that returns early, letting the first power-on or - * a known first load do the reset and blocking all others. - */ - - if (!(dd->flags & HFI1_DO_INIT_ASIC)) - return; - - if (dd->icode != ICODE_FPGA_EMULATION) { - /* emulation does not have an SBus - leave these alone */ - /* - * All writes to ASIC_CFG_SBUS_REQUEST do something. - * Notes: - * o The reset is not zero if aimed at the core. See the - * SBus documentation for details. - * o If the SBus firmware has been updated (e.g. by the BIOS), - * will the reset revert that? - */ - /* ASIC_CFG_SBUS_REQUEST leave alone */ - write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); - } - /* ASIC_SBUS_RESULT read-only */ - write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0); - for (i = 0; i < ASIC_NUM_SCRATCH; i++) - write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0); - write_csr(dd, ASIC_CFG_MUTEX, 0); /* this will clear it */ - - /* We might want to retain this state across FLR if we ever use it */ - write_csr(dd, ASIC_CFG_DRV_STR, 0); - - /* ASIC_CFG_THERM_POLL_EN leave alone */ - /* ASIC_STS_THERM read-only */ - /* ASIC_CFG_RESET leave alone */ - - write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0); - /* ASIC_PCIE_SD_HOST_STATUS read-only */ - write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0); - write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0); - /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */ - write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */ - /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */ - /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */ - for (i = 0; i < 16; i++) - write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0); - - /* ASIC_GPIO_IN read-only */ - write_csr(dd, ASIC_GPIO_OE, 0); - write_csr(dd, ASIC_GPIO_INVERT, 0); - write_csr(dd, ASIC_GPIO_OUT, 0); - write_csr(dd, ASIC_GPIO_MASK, 0); - /* ASIC_GPIO_STATUS read-only */ - write_csr(dd, ASIC_GPIO_CLEAR, ~0ull); - /* ASIC_GPIO_FORCE leave alone */ - - /* ASIC_QSFP1_IN read-only */ - write_csr(dd, ASIC_QSFP1_OE, 0); - write_csr(dd, ASIC_QSFP1_INVERT, 0); - write_csr(dd, ASIC_QSFP1_OUT, 0); - write_csr(dd, ASIC_QSFP1_MASK, 0); - /* ASIC_QSFP1_STATUS read-only */ - write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull); - /* ASIC_QSFP1_FORCE leave alone */ - - /* ASIC_QSFP2_IN read-only */ - write_csr(dd, ASIC_QSFP2_OE, 0); - write_csr(dd, ASIC_QSFP2_INVERT, 0); - write_csr(dd, ASIC_QSFP2_OUT, 0); - write_csr(dd, ASIC_QSFP2_MASK, 0); - /* ASIC_QSFP2_STATUS read-only */ - write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull); - /* ASIC_QSFP2_FORCE leave alone */ - - write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR); - /* this also writes a NOP command, clearing paging mode */ - write_csr(dd, ASIC_EEP_ADDR_CMD, 0); - write_csr(dd, ASIC_EEP_DATA, 0); -} - /* set MISC CSRs to chip reset defaults */ static void reset_misc_csrs(struct hfi1_devdata *dd) { @@ -12789,8 +12955,10 @@ static void reset_misc_csrs(struct hfi1_devdata *dd) write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0); write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0); } - /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can - only be written 128-byte chunks */ + /* + * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can + * only be written 128-byte chunks + */ /* init RSA engine to clear lingering errors */ write_csr(dd, MISC_CFG_RSA_CMD, 1); write_csr(dd, MISC_CFG_RSA_MU, 0); @@ -12846,18 +13014,17 @@ static void reset_txe_csrs(struct hfi1_devdata *dd) write_csr(dd, SEND_ERR_CLEAR, ~0ull); /* SEND_ERR_FORCE read-only */ for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++) - write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0); + write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0); for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) - write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0); - for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++) - write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0); + write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++) + write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0); for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) - write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0); + write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0); for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++) - write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0); + write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0); write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR); - write_csr(dd, SEND_CM_GLOBAL_CREDIT, - SEND_CM_GLOBAL_CREDIT_RESETCSR); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR); /* SEND_CM_CREDIT_USED_STATUS read-only */ write_csr(dd, SEND_CM_TIMER_CTRL, 0); write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0); @@ -12865,7 +13032,7 @@ static void reset_txe_csrs(struct hfi1_devdata *dd) write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0); write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0); for (i = 0; i < TXE_NUM_DATA_VL; i++) - write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0); + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); write_csr(dd, SEND_CM_CREDIT_VL15, 0); /* SEND_CM_CREDIT_USED_VL read-only */ /* SEND_CM_CREDIT_USED_VL15 read-only */ @@ -12951,8 +13118,8 @@ static void init_rbufs(struct hfi1_devdata *dd) */ if (count++ > 500) { dd_dev_err(dd, - "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", - __func__, reg); + "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", + __func__, reg); break; } udelay(2); /* do not busy-wait the CSR */ @@ -12981,8 +13148,8 @@ static void init_rbufs(struct hfi1_devdata *dd) /* give up after 100us - slowest possible at 33MHz is 73us */ if (count++ > 50) { dd_dev_err(dd, - "%s: RcvStatus.RxRbufInit not set, continuing\n", - __func__); + "%s: RcvStatus.RxRbufInit not set, continuing\n", + __func__); break; } } @@ -13008,7 +13175,7 @@ static void reset_rxe_csrs(struct hfi1_devdata *dd) write_csr(dd, RCV_VL15, 0); /* this is a clear-down */ write_csr(dd, RCV_ERR_INFO, - RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); /* RCV_ERR_STATUS read-only */ write_csr(dd, RCV_ERR_MASK, 0); write_csr(dd, RCV_ERR_CLEAR, ~0ull); @@ -13054,8 +13221,8 @@ static void reset_rxe_csrs(struct hfi1_devdata *dd) write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); /* RCV_EGR_OFFSET_TAIL read-only */ for (j = 0; j < RXE_NUM_TID_FLOWS; j++) { - write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), - 0); + write_uctxt_csr(dd, i, + RCV_TID_FLOW_TABLE + (8 * j), 0); } } } @@ -13157,7 +13324,7 @@ static void init_chip(struct hfi1_devdata *dd) write_csr(dd, RCV_CTXT_CTRL, 0); /* mask all interrupt sources */ for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8*i), 0ull); + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); /* * DC Reset: do a full DC reset before the register clear. @@ -13166,7 +13333,7 @@ static void init_chip(struct hfi1_devdata *dd) * across the clear. */ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); - (void) read_csr(dd, CCE_DC_CTRL); + (void)read_csr(dd, CCE_DC_CTRL); if (use_flr) { /* @@ -13187,22 +13354,19 @@ static void init_chip(struct hfi1_devdata *dd) hfi1_pcie_flr(dd); restore_pci_variables(dd); } - - reset_asic_csrs(dd); } else { dd_dev_info(dd, "Resetting CSRs with writes\n"); reset_cce_csrs(dd); reset_txe_csrs(dd); reset_rxe_csrs(dd); - reset_asic_csrs(dd); reset_misc_csrs(dd); } /* clear the DC reset */ write_csr(dd, CCE_DC_CTRL, 0); /* Set the LED off */ - if (is_ax(dd)) - setextled(dd, 0); + setextled(dd, 0); + /* * Clear the QSFP reset. * An FLR enforces a 0 on all out pins. The driver does not touch @@ -13215,6 +13379,7 @@ static void init_chip(struct hfi1_devdata *dd) */ write_csr(dd, ASIC_QSFP1_OUT, 0x1f); write_csr(dd, ASIC_QSFP2_OUT, 0x1f); + init_chip_resources(dd); } static void init_early_variables(struct hfi1_devdata *dd) @@ -13255,12 +13420,12 @@ static void init_kdeth_qp(struct hfi1_devdata *dd) kdeth_qp = DEFAULT_KDETH_QP; write_csr(dd, SEND_BTH_QP, - (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) - << SEND_BTH_QP_KDETH_QP_SHIFT); + (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) << + SEND_BTH_QP_KDETH_QP_SHIFT); write_csr(dd, RCV_BTH_QP, - (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) - << RCV_BTH_QP_KDETH_QP_SHIFT); + (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) << + RCV_BTH_QP_KDETH_QP_SHIFT); } /** @@ -13385,22 +13550,21 @@ static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]); /* add rule0 */ write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK - << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | - 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << + RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | + 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, - LRH_BTH_MATCH_OFFSET - << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | - LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | - LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | - ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | - QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | - ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */, - LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | - LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | - LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | - LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); + LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | + LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | + LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | + LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); /* Enable RSM */ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); kfree(rsmmap); @@ -13418,9 +13582,8 @@ static void init_rxe(struct hfi1_devdata *dd) /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); /* setup QPN map table - start where VL15 context leaves off */ - init_qos( - dd, - dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0); + init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? + MIN_KERNEL_KCTXTS : 0); /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config @@ -13457,36 +13620,33 @@ static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu, u32 csr0to3, u32 csr4to7) { write_csr(dd, csr0to3, - 0ull << - SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT - | 1ull << - SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT - | 2ull * cu << - SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT - | 4ull * cu << - SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); + 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT | + 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT | + 2ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT | + 4ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); write_csr(dd, csr4to7, - 8ull * cu << - SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT - | 16ull * cu << - SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT - | 32ull * cu << - SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT - | 64ull * cu << - SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); - + 8ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT | + 16ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT | + 32ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT | + 64ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); } static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu) { assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3, - SEND_CM_LOCAL_AU_TABLE4_TO7); + SEND_CM_LOCAL_AU_TABLE4_TO7); } void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu) { assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3, - SEND_CM_REMOTE_AU_TABLE4_TO7); + SEND_CM_REMOTE_AU_TABLE4_TO7); } static void init_txe(struct hfi1_devdata *dd) @@ -13537,7 +13697,6 @@ int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey) write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg); /* * Enable send-side J_KEY integrity check, unless this is A0 h/w - * (due to A0 erratum). */ if (!is_ax(dd)) { reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); @@ -13590,9 +13749,9 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) int ret = 0; u64 reg; - if (ctxt < dd->num_rcv_contexts) + if (ctxt < dd->num_rcv_contexts) { rcd = dd->rcd[ctxt]; - else { + } else { ret = -EINVAL; goto done; } @@ -13618,9 +13777,9 @@ int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt) int ret = 0; u64 reg; - if (ctxt < dd->num_rcv_contexts) + if (ctxt < dd->num_rcv_contexts) { rcd = dd->rcd[ctxt]; - else { + } else { ret = -EINVAL; goto done; } @@ -13643,24 +13802,26 @@ done: */ void hfi1_start_cleanup(struct hfi1_devdata *dd) { + aspm_exit(dd); free_cntrs(dd); free_rcverr(dd); clean_up_interrupts(dd); + finish_chip_resources(dd); } #define HFI_BASE_GUID(dev) \ ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT)) /* - * Certain chip functions need to be initialized only once per asic - * instead of per-device. This function finds the peer device and - * checks whether that chip initialization needs to be done by this - * device. + * Information can be shared between the two HFIs on the same ASIC + * in the same OS. This function finds the peer device and sets + * up a shared structure. */ -static void asic_should_init(struct hfi1_devdata *dd) +static int init_asic_data(struct hfi1_devdata *dd) { unsigned long flags; struct hfi1_devdata *tmp, *peer = NULL; + int ret = 0; spin_lock_irqsave(&hfi1_devs_lock, flags); /* Find our peer device */ @@ -13672,13 +13833,21 @@ static void asic_should_init(struct hfi1_devdata *dd) } } - /* - * "Claim" the ASIC for initialization if it hasn't been - " "claimed" yet. - */ - if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC)) - dd->flags |= HFI1_DO_INIT_ASIC; + if (peer) { + dd->asic_data = peer->asic_data; + } else { + dd->asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL); + if (!dd->asic_data) { + ret = -ENOMEM; + goto done; + } + mutex_init(&dd->asic_data->asic_resource_mutex); + } + dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ + +done: spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return ret; } /* @@ -13698,7 +13867,7 @@ static int obtain_boardname(struct hfi1_devdata *dd) ret = read_hfi1_efi_var(dd, "description", &size, (void **)&dd->boardname); if (ret) { - dd_dev_err(dd, "Board description not found\n"); + dd_dev_info(dd, "Board description not found\n"); /* use generic description */ dd->boardname = kstrdup(generic, GFP_KERNEL); if (!dd->boardname) @@ -13707,6 +13876,50 @@ static int obtain_boardname(struct hfi1_devdata *dd) return 0; } +/* + * Check the interrupt registers to make sure that they are mapped correctly. + * It is intended to help user identify any mismapping by VMM when the driver + * is running in a VM. This function should only be called before interrupt + * is set up properly. + * + * Return 0 on success, -EINVAL on failure. + */ +static int check_int_registers(struct hfi1_devdata *dd) +{ + u64 reg; + u64 all_bits = ~(u64)0; + u64 mask; + + /* Clear CceIntMask[0] to avoid raising any interrupts */ + mask = read_csr(dd, CCE_INT_MASK); + write_csr(dd, CCE_INT_MASK, 0ull); + reg = read_csr(dd, CCE_INT_MASK); + if (reg) + goto err_exit; + + /* Clear all interrupt status bits */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg) + goto err_exit; + + /* Set all interrupt status bits */ + write_csr(dd, CCE_INT_FORCE, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg != all_bits) + goto err_exit; + + /* Restore the interrupt mask */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + write_csr(dd, CCE_INT_MASK, mask); + + return 0; +err_exit: + write_csr(dd, CCE_INT_MASK, mask); + dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n"); + return -EINVAL; +} + /** * Allocate and initialize the device structure for the hfi. * @dev: the pci_dev for hfi1_ib device @@ -13731,9 +13944,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, "RTL FPGA emulation", "Functional simulator" }; + struct pci_dev *parent = pdev->bus->self; - dd = hfi1_alloc_devdata(pdev, - NUM_IB_PORTS * sizeof(struct hfi1_pportdata)); + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); if (IS_ERR(dd)) goto bail; ppd = dd->pport; @@ -13754,8 +13968,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* link width active is 0 when link is down */ /* link width downgrade active is 0 when link is down */ - if (num_vls < HFI1_MIN_VLS_SUPPORTED - || num_vls > HFI1_MAX_VLS_SUPPORTED) { + if (num_vls < HFI1_MIN_VLS_SUPPORTED || + num_vls > HFI1_MAX_VLS_SUPPORTED) { hfi1_early_err(&pdev->dev, "Invalid num_vls %u, using %u VLs\n", num_vls, HFI1_MAX_VLS_SUPPORTED); @@ -13763,6 +13977,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, } ppd->vls_supported = num_vls; ppd->vls_operational = ppd->vls_supported; + ppd->actual_vls_operational = ppd->vls_supported; /* Set the default MTU. */ for (vl = 0; vl < num_vls; vl++) dd->vld[vl].mtu = hfi1_max_mtu; @@ -13782,6 +13997,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* start in offline */ ppd->host_link_state = HLS_DN_OFFLINE; init_vl_arb_caches(ppd); + ppd->last_pstate = 0xff; /* invalid value */ } dd->link_default = HLS_DN_POLL; @@ -13807,8 +14023,21 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) & CCE_REVISION_CHIP_REV_MINOR_MASK; - /* obtain the hardware ID - NOT related to unit, which is a - software enumeration */ + /* + * Check interrupt registers mapping if the driver has no access to + * the upstream component. In this case, it is likely that the driver + * is running in a VM. + */ + if (!parent) { + ret = check_int_registers(dd); + if (ret) + goto bail_cleanup; + } + + /* + * obtain the hardware ID - NOT related to unit, which is a + * software enumeration + */ reg = read_csr(dd, CCE_REVISION2); dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT) & CCE_REVISION2_HFI_ID_MASK; @@ -13816,8 +14045,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT; dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT; dd_dev_info(dd, "Implementation: %s, revision 0x%x\n", - dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown", - (int)dd->irev); + dd->icode < ARRAY_SIZE(inames) ? + inames[dd->icode] : "unknown", (int)dd->irev); /* speeds the hardware can support */ dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; @@ -13846,6 +14075,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, num_vls, dd->chip_sdma_engines); num_vls = dd->chip_sdma_engines; ppd->vls_supported = dd->chip_sdma_engines; + ppd->vls_operational = ppd->vls_supported; } /* @@ -13867,8 +14097,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* needs to be done before we look for the peer device */ read_guid(dd); - /* should this device init the ASIC block? */ - asic_should_init(dd); + /* set up shared ASIC data with peer device */ + ret = init_asic_data(dd); + if (ret) + goto bail_cleanup; /* obtain chip sizes, reset chip CSRs */ init_chip(dd); @@ -13878,6 +14110,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + /* Needs to be called before hfi1_firmware_init */ + get_platform_config(dd); + /* read in firmware */ ret = hfi1_firmware_init(dd); if (ret) @@ -13929,6 +14164,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); + ret = hfi1_dev_affinity_init(dd); + if (ret) + goto bail_cleanup; + /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); if (ret) @@ -14026,7 +14265,6 @@ static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, return (u16)delta_cycles; } - /** * create_pbc - build a pbc for transmission * @flags: special case flags or-ed in built pbc @@ -14082,10 +14320,15 @@ static int thermal_init(struct hfi1_devdata *dd) int ret = 0; if (dd->icode != ICODE_RTL_SILICON || - !(dd->flags & HFI1_DO_INIT_ASIC)) + check_chip_resource(dd, CR_THERM_INIT, NULL)) return ret; - acquire_hw_mutex(dd); + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + THERM_FAILURE(dd, ret, "Acquire SBus"); + return ret; + } + dd_dev_info(dd, "Initializing thermal sensor\n"); /* Disable polling of thermal readings */ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); @@ -14132,8 +14375,14 @@ static int thermal_init(struct hfi1_devdata *dd) /* Enable polling of thermal readings */ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + + /* Set initialized flag */ + ret = acquire_chip_resource(dd, CR_THERM_INIT, 0); + if (ret) + THERM_FAILURE(dd, ret, "Unable to set thermal init flag"); + done: - release_hw_mutex(dd); + release_chip_resource(dd, CR_SBUS); return ret; } @@ -14148,7 +14397,7 @@ static void handle_temp_err(struct hfi1_devdata *dd) dd_dev_emerg(dd, "Critical temperature reached! Forcing device into freeze mode!\n"); dd->flags |= HFI1_FORCED_FREEZE; - start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT); + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT); /* * Shut DC down as much and as quickly as possible. * @@ -14162,8 +14411,8 @@ static void handle_temp_err(struct hfi1_devdata *dd) */ ppd->driver_link_ready = 0; ppd->link_enabled = 0; - set_physical_link_state(dd, PLS_OFFLINE | - (OPA_LINKDOWN_REASON_SMA_DISABLED << 8)); + set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) | + PLS_OFFLINE); /* * Step 2: Shutdown LCB and 8051 * After shutdown, do not restore DC_CFG_RESET value. diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h index 5b375ddc345d..4f3b878e43eb 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/staging/rdma/hfi1/chip.h @@ -1,14 +1,13 @@ #ifndef _CHIP_H #define _CHIP_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -79,8 +76,10 @@ #define PIO_CMASK 0x7ff /* counter mask for free and fill counters */ #define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */ #define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */ -/* Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed - at 64 bytes for all generation one devices */ +/* + * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed + * at 64 bytes for all generation one devices + */ #define CM_VAU 3 /* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */ #define CM_GLOBAL_CREDITS 0x940 @@ -93,15 +92,15 @@ #define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET) /* PBC flags */ -#define PBC_INTR (1ull << 31) +#define PBC_INTR BIT_ULL(31) #define PBC_DC_INFO_SHIFT (30) -#define PBC_DC_INFO (1ull << PBC_DC_INFO_SHIFT) -#define PBC_TEST_EBP (1ull << 29) -#define PBC_PACKET_BYPASS (1ull << 28) -#define PBC_CREDIT_RETURN (1ull << 25) -#define PBC_INSERT_BYPASS_ICRC (1ull << 24) -#define PBC_TEST_BAD_ICRC (1ull << 23) -#define PBC_FECN (1ull << 22) +#define PBC_DC_INFO BIT_ULL(PBC_DC_INFO_SHIFT) +#define PBC_TEST_EBP BIT_ULL(29) +#define PBC_PACKET_BYPASS BIT_ULL(28) +#define PBC_CREDIT_RETURN BIT_ULL(25) +#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24) +#define PBC_TEST_BAD_ICRC BIT_ULL(23) +#define PBC_FECN BIT_ULL(22) /* PbcInsertHcrc field settings */ #define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */ @@ -212,7 +211,7 @@ #define PLS_CONFIGPHY_DEBOUCE 0x40 #define PLS_CONFIGPHY_ESTCOMM 0x41 #define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42 -#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE 0x43 +#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE 0x43 #define PLS_CONFIGPHY_OPTEQ 0x44 #define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44 #define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45 @@ -242,36 +241,37 @@ #define HCMD_SUCCESS 2 /* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */ -#define SPICO_ROM_FAILED (1 << 0) -#define UNKNOWN_FRAME (1 << 1) -#define TARGET_BER_NOT_MET (1 << 2) -#define FAILED_SERDES_INTERNAL_LOOPBACK (1 << 3) -#define FAILED_SERDES_INIT (1 << 4) -#define FAILED_LNI_POLLING (1 << 5) -#define FAILED_LNI_DEBOUNCE (1 << 6) -#define FAILED_LNI_ESTBCOMM (1 << 7) -#define FAILED_LNI_OPTEQ (1 << 8) -#define FAILED_LNI_VERIFY_CAP1 (1 << 9) -#define FAILED_LNI_VERIFY_CAP2 (1 << 10) -#define FAILED_LNI_CONFIGLT (1 << 11) +#define SPICO_ROM_FAILED BIT(0) +#define UNKNOWN_FRAME BIT(1) +#define TARGET_BER_NOT_MET BIT(2) +#define FAILED_SERDES_INTERNAL_LOOPBACK BIT(3) +#define FAILED_SERDES_INIT BIT(4) +#define FAILED_LNI_POLLING BIT(5) +#define FAILED_LNI_DEBOUNCE BIT(6) +#define FAILED_LNI_ESTBCOMM BIT(7) +#define FAILED_LNI_OPTEQ BIT(8) +#define FAILED_LNI_VERIFY_CAP1 BIT(9) +#define FAILED_LNI_VERIFY_CAP2 BIT(10) +#define FAILED_LNI_CONFIGLT BIT(11) +#define HOST_HANDSHAKE_TIMEOUT BIT(12) #define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \ | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \ | FAILED_LNI_VERIFY_CAP1 \ | FAILED_LNI_VERIFY_CAP2 \ - | FAILED_LNI_CONFIGLT) + | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT) /* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */ -#define HOST_REQ_DONE (1 << 0) -#define BC_PWR_MGM_MSG (1 << 1) -#define BC_SMA_MSG (1 << 2) -#define BC_BCC_UNKOWN_MSG (1 << 3) -#define BC_IDLE_UNKNOWN_MSG (1 << 4) -#define EXT_DEVICE_CFG_REQ (1 << 5) -#define VERIFY_CAP_FRAME (1 << 6) -#define LINKUP_ACHIEVED (1 << 7) -#define LINK_GOING_DOWN (1 << 8) -#define LINK_WIDTH_DOWNGRADED (1 << 9) +#define HOST_REQ_DONE BIT(0) +#define BC_PWR_MGM_MSG BIT(1) +#define BC_SMA_MSG BIT(2) +#define BC_BCC_UNKNOWN_MSG BIT(3) +#define BC_IDLE_UNKNOWN_MSG BIT(4) +#define EXT_DEVICE_CFG_REQ BIT(5) +#define VERIFY_CAP_FRAME BIT(6) +#define LINKUP_ACHIEVED BIT(7) +#define LINK_GOING_DOWN BIT(8) +#define LINK_WIDTH_DOWNGRADED BIT(9) /* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */ #define HREQ_LOAD_CONFIG 0x01 @@ -335,14 +335,14 @@ * the CSR fields hold multiples of this value. */ #define RCV_SHIFT 3 -#define RCV_INCREMENT (1 << RCV_SHIFT) +#define RCV_INCREMENT BIT(RCV_SHIFT) /* * Receive header queue entry increment - the CSR holds multiples of * this value. */ #define HDRQ_SIZE_SHIFT 5 -#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT) +#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT) /* * Freeze handling flags @@ -371,6 +371,9 @@ #define NUM_LANE_FIELDS 0x8 /* 8051 general register Field IDs */ +#define LINK_OPTIMIZATION_SETTINGS 0x00 +#define LINK_TUNING_PARAMETERS 0x02 +#define DC_HOST_COMM_SETTINGS 0x03 #define TX_SETTINGS 0x06 #define VERIFY_CAP_LOCAL_PHY 0x07 #define VERIFY_CAP_LOCAL_FABRIC 0x08 @@ -387,6 +390,10 @@ #define LINK_QUALITY_INFO 0x14 #define REMOTE_DEVICE_ID 0x15 +/* 8051 lane specific register field IDs */ +#define TX_EQ_SETTINGS 0x00 +#define CHANNEL_LOSS_SETTINGS 0x05 + /* Lane ID for general configuration registers */ #define GENERAL_CONFIG 4 @@ -511,8 +518,10 @@ enum { #define LCB_CRC_48B 0x2 /* 48b CRC */ #define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */ -/* the following enum is (almost) a copy/paste of the definition - * in the OPA spec, section 20.2.2.6.8 (PortInfo) */ +/* + * the following enum is (almost) a copy/paste of the definition + * in the OPA spec, section 20.2.2.6.8 (PortInfo) + */ enum { PORT_LTP_CRC_MODE_NONE = 0, PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */ @@ -614,6 +623,8 @@ u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32); #define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ extern const u8 pcie_serdes_broadcast[]; extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; +extern uint platform_config_load; + /* SBus commands */ #define RESET_SBUS_RECEIVER 0x20 #define WRITE_SBUS_RECEIVER 0x21 @@ -629,6 +640,42 @@ int load_firmware(struct hfi1_devdata *dd); void dispose_firmware(void); int acquire_hw_mutex(struct hfi1_devdata *dd); void release_hw_mutex(struct hfi1_devdata *dd); + +/* + * Bitmask of dynamic access for ASIC block chip resources. Each HFI has its + * own range of bits for the resource so it can clear its own bits on + * starting and exiting. If either HFI has the resource bit set, the + * resource is in use. The separate bit ranges are: + * HFI0 bits 7:0 + * HFI1 bits 15:8 + */ +#define CR_SBUS 0x01 /* SBUS, THERM, and PCIE registers */ +#define CR_EPROM 0x02 /* EEP, GPIO registers */ +#define CR_I2C1 0x04 /* QSFP1_OE register */ +#define CR_I2C2 0x08 /* QSFP2_OE register */ +#define CR_DYN_SHIFT 8 /* dynamic flag shift */ +#define CR_DYN_MASK ((1ull << CR_DYN_SHIFT) - 1) + +/* + * Bitmask of static ASIC states these are outside of the dynamic ASIC + * block chip resources above. These are to be set once and never cleared. + * Must be holding the SBus dynamic flag when setting. + */ +#define CR_THERM_INIT 0x010000 + +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait); +void release_chip_resource(struct hfi1_devdata *dd, u32 resource); +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func); +void init_chip_resources(struct hfi1_devdata *dd); +void finish_chip_resources(struct hfi1_devdata *dd); + +/* ms wait time for access to an SBus resoure */ +#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */ + +/* ms wait time for a qsfp (i2c) chain to become available */ +#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */ + void fabric_serdes_reset(struct hfi1_devdata *dd); int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); @@ -644,13 +691,17 @@ void handle_verify_cap(struct work_struct *work); void handle_freeze(struct work_struct *work); void handle_link_up(struct work_struct *work); void handle_link_down(struct work_struct *work); +void handle_8051_request(struct work_struct *work); void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_sma_message(struct work_struct *work); +void reset_qsfp(struct hfi1_pportdata *ppd); +void qsfp_event(struct work_struct *work); void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); int send_idle_sma(struct hfi1_devdata *dd, u64 message); +int load_8051_config(struct hfi1_devdata *, u8, u8, u32); +int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *); int start_link(struct hfi1_pportdata *ppd); -void init_qsfp(struct hfi1_pportdata *ppd); int bringup_serdes(struct hfi1_pportdata *ppd); void set_intr_state(struct hfi1_devdata *dd, u32 enable); void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, @@ -690,6 +741,8 @@ u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl); u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data); u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl); u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data); +u32 read_logical_state(struct hfi1_devdata *dd); +void force_recv_intr(struct hfi1_ctxtdata *rcd); /* Per VL indexes */ enum { @@ -785,8 +838,14 @@ enum { C_SW_CPU_RCV_LIM, C_SW_VTX_WAIT, C_SW_PIO_WAIT, + C_SW_PIO_DRAIN, C_SW_KMEM_WAIT, C_SW_SEND_SCHED, + C_SDMA_DESC_FETCHED_CNT, + C_SDMA_INT_CNT, + C_SDMA_ERR_CNT, + C_SDMA_IDLE_INT_CNT, + C_SDMA_PROGRESS_INT_CNT, /* MISC_ERR_STATUS */ C_MISC_PLL_LOCK_FAIL_ERR, C_MISC_MBIST_FAIL_ERR, @@ -1275,10 +1334,8 @@ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, u32 type, unsigned long pa, u16 order); void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt); -u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, - u64 **cntrp); -u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port, - char **namep, u64 **cntrp); +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp); +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp); u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd); int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h index 701e9e1012a6..770f05c9b8de 100644 --- a/drivers/staging/rdma/hfi1/chip_registers.h +++ b/drivers/staging/rdma/hfi1/chip_registers.h @@ -2,14 +2,13 @@ #define DEF_CHIP_REG /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -21,8 +20,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -551,6 +548,17 @@ #define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008) #define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull #define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400) +#define CCE_PCIE_CTRL (CCE + 0x0000000000C0) +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0 +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2 +#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8 +#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13 #define CCE_REVISION (CCE + 0x000000000000) #define CCE_REVISION2 (CCE + 0x000000000008) #define CCE_REVISION2_HFI_ID_MASK 0x1ull @@ -1270,6 +1278,9 @@ #define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0 #define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull #define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708) +#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C) +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27 +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000 #define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898) #define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12 #define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6 @@ -1290,5 +1301,6 @@ #define CCE_INT_BLOCKED (CCE + 0x000000110C00) #define SEND_DMA_IDLE_CNT (TXE + 0x000000200040) #define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058) +#define CCE_MSIX_PBA_OFFSET 0X0110000 #endif /* DEF_CHIP_REG */ diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h index 5dd92720faae..e9b6bb322025 100644 --- a/drivers/staging/rdma/hfi1/common.h +++ b/drivers/staging/rdma/hfi1/common.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -341,19 +338,16 @@ struct hfi1_message_header { #define FULL_MGMT_P_KEY 0xFFFF #define DEFAULT_P_KEY LIM_MGMT_P_KEY -#define HFI1_PERMISSIVE_LID 0xFFFF #define HFI1_AETH_CREDIT_SHIFT 24 #define HFI1_AETH_CREDIT_MASK 0x1F #define HFI1_AETH_CREDIT_INVAL 0x1F #define HFI1_MSN_MASK 0xFFFFFF -#define HFI1_QPN_MASK 0xFFFFFF #define HFI1_FECN_SHIFT 31 #define HFI1_FECN_MASK 1 -#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT) +#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT) #define HFI1_BECN_SHIFT 30 #define HFI1_BECN_MASK 1 -#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT) -#define HFI1_MULTICAST_LID_BASE 0xC000 +#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT) static inline __u64 rhf_to_cpu(const __le32 *rbuf) { diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c index acd2269e9f14..dbab9d9cc288 100644 --- a/drivers/staging/rdma/hfi1/debugfs.c +++ b/drivers/staging/rdma/hfi1/debugfs.c @@ -1,13 +1,12 @@ #ifdef CONFIG_DEBUG_FS /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -19,8 +18,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -52,6 +49,7 @@ #include <linux/seq_file.h> #include <linux/kernel.h> #include <linux/export.h> +#include <linux/module.h> #include "hfi.h" #include "debugfs.h" @@ -71,6 +69,7 @@ static const struct seq_operations _##name##_seq_ops = { \ .stop = _##name##_seq_stop, \ .show = _##name##_seq_show \ } + #define DEBUGFS_SEQ_FILE_OPEN(name) \ static int _##name##_open(struct inode *inode, struct file *s) \ { \ @@ -102,7 +101,6 @@ do { \ pr_warn("create of %s failed\n", name); \ } while (0) - #define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) @@ -127,7 +125,6 @@ static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) return pos; } - static void _opcode_stats_seq_stop(struct seq_file *s, void *v) __releases(RCU) { @@ -151,8 +148,8 @@ static int _opcode_stats_seq_show(struct seq_file *s, void *v) if (!n_packets && !n_bytes) return SEQ_SKIP; seq_printf(s, "%02llx %llu/%llu\n", i, - (unsigned long long) n_packets, - (unsigned long long) n_bytes); + (unsigned long long)n_packets, + (unsigned long long)n_bytes); return 0; } @@ -247,7 +244,7 @@ __acquires(RCU) } static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, - loff_t *pos) + loff_t *pos) { struct qp_iter *iter = iter_ptr; @@ -308,7 +305,6 @@ static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos) return pos; } - static void _sdes_seq_stop(struct seq_file *s, void *v) __releases(RCU) { @@ -341,7 +337,7 @@ static ssize_t dev_counters_read(struct file *file, char __user *buf, rcu_read_lock(); dd = private2dd(file); - avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters); + avail = hfi1_read_cntrs(dd, NULL, &counters); rval = simple_read_from_buffer(buf, count, ppos, counters, avail); rcu_read_unlock(); return rval; @@ -358,7 +354,7 @@ static ssize_t dev_names_read(struct file *file, char __user *buf, rcu_read_lock(); dd = private2dd(file); - avail = hfi1_read_cntrs(dd, *ppos, &names, NULL); + avail = hfi1_read_cntrs(dd, &names, NULL); rval = simple_read_from_buffer(buf, count, ppos, names, avail); rcu_read_unlock(); return rval; @@ -385,8 +381,7 @@ static ssize_t portnames_read(struct file *file, char __user *buf, rcu_read_lock(); dd = private2dd(file); - /* port number n/a here since names are constant */ - avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL); + avail = hfi1_read_portcntrs(dd->pport, &names, NULL); rval = simple_read_from_buffer(buf, count, ppos, names, avail); rcu_read_unlock(); return rval; @@ -394,28 +389,150 @@ static ssize_t portnames_read(struct file *file, char __user *buf, /* read the per-port counters */ static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { u64 *counters; size_t avail; - struct hfi1_devdata *dd; struct hfi1_pportdata *ppd; ssize_t rval; rcu_read_lock(); ppd = private2ppd(file); - dd = ppd->dd; - avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters); + avail = hfi1_read_portcntrs(ppd, NULL, &counters); rval = simple_read_from_buffer(buf, count, ppos, counters, avail); rcu_read_unlock(); return rval; } +static void check_dyn_flag(u64 scratch0, char *p, int size, int *used, + int this_hfi, int hfi, u32 flag, const char *what) +{ + u32 mask; + + mask = flag << (hfi ? CR_DYN_SHIFT : 0); + if (scratch0 & mask) { + *used += scnprintf(p + *used, size - *used, + " 0x%08x - HFI%d %s in use, %s device\n", + mask, hfi, what, + this_hfi == hfi ? "this" : "other"); + } +} + +static ssize_t asic_flags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u64 scratch0; + char *tmp; + int ret = 0; + int size; + int used; + int i; + + rcu_read_lock(); + ppd = private2ppd(file); + dd = ppd->dd; + size = PAGE_SIZE; + used = 0; + tmp = kmalloc(size, GFP_KERNEL); + if (!tmp) { + rcu_read_unlock(); + return -ENOMEM; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + used += scnprintf(tmp + used, size - used, + "Resource flags: 0x%016llx\n", scratch0); + + /* check permanent flag */ + if (scratch0 & CR_THERM_INIT) { + used += scnprintf(tmp + used, size - used, + " 0x%08x - thermal monitoring initialized\n", + (u32)CR_THERM_INIT); + } + + /* check each dynamic flag on each HFI */ + for (i = 0; i < 2; i++) { + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_SBUS, "SBus"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_EPROM, "EPROM"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C1, "i2c chain 1"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C2, "i2c chain 2"); + } + used += scnprintf(tmp + used, size - used, "Write bits to clear\n"); + + ret = simple_read_from_buffer(buf, count, ppos, tmp, used); + rcu_read_unlock(); + kfree(tmp); + return ret; +} + +static ssize_t asic_flags_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + char *buff; + int ret; + unsigned long long value; + u64 scratch0; + u64 clear; + + rcu_read_lock(); + ppd = private2ppd(file); + dd = ppd->dd; + + buff = kmalloc(count + 1, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto do_return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto do_free; + } + + /* zero terminate and read the expected integer */ + buff[count] = 0; + ret = kstrtoull(buff, 0, &value); + if (ret) + goto do_free; + clear = value; + + /* obtain exclusive access */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + acquire_hw_mutex(dd); + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~clear; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + mutex_unlock(&dd->asic_data->asic_resource_mutex); + + /* return the number of bytes written */ + ret = count; + + do_free: + kfree(buff); + do_return: + rcu_read_unlock(); + return ret; +} + /* * read the per-port QSFP data for ppd */ static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { struct hfi1_pportdata *ppd; char *tmp; @@ -439,7 +556,7 @@ static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, /* Do an i2c write operation on the chain for the given HFI. */ static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, u32 target) + size_t count, loff_t *ppos, u32 target) { struct hfi1_pportdata *ppd; char *buff; @@ -451,6 +568,16 @@ static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, rcu_read_lock(); ppd = private2ppd(file); + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) { + ret = -EINVAL; + goto _return; + } + buff = kmalloc(count, GFP_KERNEL); if (!buff) { ret = -ENOMEM; @@ -463,9 +590,6 @@ static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, goto _free; } - i2c_addr = (*ppos >> 16) & 0xff; - offset = *ppos & 0xffff; - total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count); if (total_written < 0) { ret = total_written; @@ -485,21 +609,21 @@ static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, /* Do an i2c write operation on chain for HFI 0. */ static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __i2c_debugfs_write(file, buf, count, ppos, 0); } /* Do an i2c write operation on chain for HFI 1. */ static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __i2c_debugfs_write(file, buf, count, ppos, 1); } /* Do an i2c read operation on the chain for the given HFI. */ static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos, u32 target) + size_t count, loff_t *ppos, u32 target) { struct hfi1_pportdata *ppd; char *buff; @@ -511,15 +635,22 @@ static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, rcu_read_lock(); ppd = private2ppd(file); + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) { + ret = -EINVAL; + goto _return; + } + buff = kmalloc(count, GFP_KERNEL); if (!buff) { ret = -ENOMEM; goto _return; } - i2c_addr = (*ppos >> 16) & 0xff; - offset = *ppos & 0xffff; - total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count); if (total_read < 0) { ret = total_read; @@ -545,21 +676,21 @@ static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, /* Do an i2c read operation on chain for HFI 0. */ static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __i2c_debugfs_read(file, buf, count, ppos, 0); } /* Do an i2c read operation on chain for HFI 1. */ static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __i2c_debugfs_read(file, buf, count, ppos, 1); } /* Do a QSFP write operation on the i2c chain for the given HFI. */ static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, u32 target) + size_t count, loff_t *ppos, u32 target) { struct hfi1_pportdata *ppd; char *buff; @@ -605,21 +736,21 @@ static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, /* Do a QSFP write operation on i2c chain for HFI 0. */ static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __qsfp_debugfs_write(file, buf, count, ppos, 0); } /* Do a QSFP write operation on i2c chain for HFI 1. */ static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __qsfp_debugfs_write(file, buf, count, ppos, 1); } /* Do a QSFP read operation on the i2c chain for the given HFI. */ static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos, u32 target) + size_t count, loff_t *ppos, u32 target) { struct hfi1_pportdata *ppd; char *buff; @@ -665,18 +796,116 @@ static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, /* Do a QSFP read operation on i2c chain for HFI 0. */ static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __qsfp_debugfs_read(file, buf, count, ppos, 0); } /* Do a QSFP read operation on i2c chain for HFI 1. */ static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { return __qsfp_debugfs_read(file, buf, count, ppos, 1); } +static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int i2c1_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 0); +} + +static int i2c2_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 1); +} + +static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + module_put(THIS_MODULE); + + return 0; +} + +static int i2c1_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 0); +} + +static int i2c2_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 1); +} + +static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int qsfp1_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 0); +} + +static int qsfp2_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 1); +} + +static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + module_put(THIS_MODULE); + + return 0; +} + +static int qsfp1_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 0); +} + +static int qsfp2_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 1); +} + #define DEBUGFS_OPS(nm, readroutine, writeroutine) \ { \ .name = nm, \ @@ -687,6 +916,18 @@ static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, }, \ } +#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readf, \ + .write = writef, \ + .llseek = generic_file_llseek, \ + .open = openf, \ + .release = releasef \ + }, \ +} + static const struct counter_info cntr_ops[] = { DEBUGFS_OPS("counter_names", dev_names_read, NULL), DEBUGFS_OPS("counters", dev_counters_read, NULL), @@ -695,11 +936,16 @@ static const struct counter_info cntr_ops[] = { static const struct counter_info port_cntr_ops[] = { DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL), - DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write), - DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write), + DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write, + i2c1_debugfs_open, i2c1_debugfs_release), + DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write, + i2c2_debugfs_open, i2c2_debugfs_release), DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL), - DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write), - DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write), + DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write, + qsfp1_debugfs_open, qsfp1_debugfs_release), + DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write, + qsfp2_debugfs_open, qsfp2_debugfs_release), + DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write), }; void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) @@ -747,8 +993,8 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) ibd->hfi1_ibdev_dbg, ppd, &port_cntr_ops[i].ops, - port_cntr_ops[i].ops.write == NULL ? - S_IRUGO : S_IRUGO|S_IWUSR); + !port_cntr_ops[i].ops.write ? + S_IRUGO : S_IRUGO | S_IWUSR); } } diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h index 92d6fe146714..b6fb6814f1b8 100644 --- a/drivers/staging/rdma/hfi1/debugfs.h +++ b/drivers/staging/rdma/hfi1/debugfs.h @@ -1,14 +1,13 @@ #ifndef _HFI1_DEBUGFS_H #define _HFI1_DEBUGFS_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c index 58472e5ac4e5..c05c39da83b1 100644 --- a/drivers/staging/rdma/hfi1/device.c +++ b/drivers/staging/rdma/hfi1/device.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h index 2850ff739d81..5bb3e83cf2da 100644 --- a/drivers/staging/rdma/hfi1/device.h +++ b/drivers/staging/rdma/hfi1/device.h @@ -1,14 +1,13 @@ #ifndef _HFI1_DEVICE_H #define _HFI1_DEVICE_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c index 0c8831705664..6546e91f85b7 100644 --- a/drivers/staging/rdma/hfi1/diag.c +++ b/drivers/staging/rdma/hfi1/diag.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -70,6 +67,7 @@ #include "hfi.h" #include "device.h" #include "common.h" +#include "verbs_txreq.h" #include "trace.h" #undef pr_fmt @@ -80,15 +78,15 @@ /* Snoop option mask */ #define SNOOP_DROP_SEND BIT(0) #define SNOOP_USE_METADATA BIT(1) +#define SNOOP_SET_VL0TOVL15 BIT(2) static u8 snoop_flags; /* * Extract packet length from LRH header. - * Why & 0x7FF? Because len is only 11 bits in case it wasn't 0'd we throw the - * bogus bits away. This is in Dwords so multiply by 4 to get size in bytes + * This is in Dwords so multiply by 4 to get size in bytes */ -#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2) +#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0xFFF)) << 2) enum hfi1_filter_status { HFI1_FILTER_HIT, @@ -860,7 +858,7 @@ static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data, vl = sc4; } else { sl = (byte_two >> 4) & 0xf; - ibp = to_iport(&dd->verbs_dev.ibdev, 1); + ibp = to_iport(&dd->verbs_dev.rdi.ibdev, 1); sc5 = ibp->sl_to_sc[sl]; vl = sc_to_vlt(dd, sc5); if (vl != sc4) { @@ -966,6 +964,65 @@ static ssize_t hfi1_snoop_read(struct file *fp, char __user *data, return ret; } +/** + * hfi1_assign_snoop_link_credits -- Set up credits for VL15 and others + * @ppd : ptr to hfi1 port data + * @value : options from user space + * + * Assumes the rest of the CM credit registers are zero from a + * previous global or credit reset. + * Leave shared count at zero for both global and all vls. + * In snoop mode ideally we don't use shared credits + * Reserve 8.5k for VL15 + * If total credits less than 8.5kbytes return error. + * Divide the rest of the credits across VL0 to VL7 and if + * each of these levels has less than 34 credits (at least 2048 + 128 bytes) + * return with an error. + * The credit registers will be reset to zero on link negotiation or link up + * so this function should be activated from user space only if the port has + * gone past link negotiation and link up. + * + * Return -- 0 if successful else error condition + * + */ +static long hfi1_assign_snoop_link_credits(struct hfi1_pportdata *ppd, + int value) +{ +#define OPA_MIN_PER_VL_CREDITS 34 /* 2048 + 128 bytes */ + struct buffer_control t; + int i; + struct hfi1_devdata *dd = ppd->dd; + u16 total_credits = (value >> 16) & 0xffff; + u16 vl15_credits = dd->vl15_init / 2; + u16 per_vl_credits; + __be16 be_per_vl_credits; + + if (!(ppd->host_link_state & HLS_UP)) + goto err_exit; + if (total_credits < vl15_credits) + goto err_exit; + + per_vl_credits = (total_credits - vl15_credits) / TXE_NUM_DATA_VL; + + if (per_vl_credits < OPA_MIN_PER_VL_CREDITS) + goto err_exit; + + memset(&t, 0, sizeof(t)); + be_per_vl_credits = cpu_to_be16(per_vl_credits); + + for (i = 0; i < TXE_NUM_DATA_VL; i++) + t.vl[i].dedicated = be_per_vl_credits; + + t.vl[15].dedicated = cpu_to_be16(vl15_credits); + return set_buffer_control(ppd, &t); + +err_exit: + snoop_dbg("port_state = 0x%x, total_credits = %d, vl15_credits = %d", + ppd->host_link_state, total_credits, vl15_credits); + + return -EINVAL; +} + static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { struct hfi1_devdata *dd; @@ -1192,6 +1249,10 @@ static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) snoop_flags |= SNOOP_DROP_SEND; if (value & SNOOP_USE_METADATA) snoop_flags |= SNOOP_USE_METADATA; + if (value & (SNOOP_SET_VL0TOVL15)) { + ppd = &dd->pport[0]; /* first port will do */ + ret = hfi1_assign_snoop_link_credits(ppd, value); + } break; default: return -ENOTTY; @@ -1603,7 +1664,7 @@ int snoop_recv_handler(struct hfi1_packet *packet) /* * Handle snooping and capturing packets when sdma is being used. */ -int snoop_send_dma_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n"); @@ -1616,20 +1677,19 @@ int snoop_send_dma_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, * bypass packets. The only way to send a bypass packet currently is to use the * diagpkt interface. When that interface is enable snoop/capture is not. */ -int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { - struct ahg_ib_header *ahdr = qp->s_hdr; u32 hdrwords = qp->s_hdrwords; - struct hfi1_sge_state *ss = qp->s_cur_sge; + struct rvt_sge_state *ss = qp->s_cur_sge; u32 len = qp->s_cur_size; u32 dwords = (len + 3) >> 2; u32 plen = hdrwords + dwords + 2; /* includes pbc */ struct hfi1_pportdata *ppd = ps->ppd; struct snoop_packet *s_packet = NULL; - u32 *hdr = (u32 *)&ahdr->ibh; + u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr; u32 length = 0; - struct hfi1_sge_state temp_ss; + struct rvt_sge_state temp_ss; void *data = NULL; void *data_start = NULL; int ret; @@ -1638,7 +1698,7 @@ int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, struct capture_md md; u32 vl; u32 hdr_len = hdrwords << 2; - u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh); + u32 tlen = HFI1_GET_PKT_LEN(&ps->s_txreq->phdr.hdr); md.u.pbc = 0; @@ -1665,7 +1725,7 @@ int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, md.port = 1; md.dir = PKT_DIR_EGRESS; if (likely(pbc == 0)) { - vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12; + vl = be16_to_cpu(ps->s_txreq->phdr.hdr.lrh[0]) >> 12; md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen); } else { md.u.pbc = 0; @@ -1727,7 +1787,7 @@ int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, ret = HFI1_FILTER_HIT; } else { ret = ppd->dd->hfi1_snoop.filter_callback( - &ahdr->ibh, + &ps->s_txreq->phdr.hdr, NULL, ppd->dd->hfi1_snoop.filter_value); } @@ -1759,9 +1819,16 @@ int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_rc_send_complete(qp, &ahdr->ibh); + hfi1_rc_send_complete(qp, + &ps->s_txreq->phdr.hdr); spin_unlock_irqrestore(&qp->s_lock, flags); } + + /* + * If snoop is dropping the packet we need to put the + * txreq back because no one else will. + */ + hfi1_put_txreq(ps->s_txreq); return 0; } break; diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c index e03bd735173c..7e8dab892848 100644 --- a/drivers/staging/rdma/hfi1/dma.c +++ b/drivers/staging/rdma/hfi1/dma.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -52,7 +49,7 @@ #include "verbs.h" -#define BAD_DMA_ADDRESS ((u64) 0) +#define BAD_DMA_ADDRESS ((u64)0) /* * The following functions implement driver specific replacements @@ -74,7 +71,7 @@ static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr, if (WARN_ON(!valid_dma_direction(direction))) return BAD_DMA_ADDRESS; - return (u64) cpu_addr; + return (u64)cpu_addr; } static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, @@ -95,7 +92,7 @@ static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page, if (offset + size > PAGE_SIZE) return BAD_DMA_ADDRESS; - addr = (u64) page_address(page); + addr = (u64)page_address(page); if (addr) addr += offset; @@ -120,7 +117,7 @@ static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl, return BAD_DMA_ADDRESS; for_each_sg(sgl, sg, nents, i) { - addr = (u64) page_address(sg_page(sg)); + addr = (u64)page_address(sg_page(sg)); if (!addr) { ret = 0; break; @@ -161,14 +158,14 @@ static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size, if (p) addr = page_address(p); if (dma_handle) - *dma_handle = (u64) addr; + *dma_handle = (u64)addr; return addr; } static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle) { - free_pages((unsigned long) cpu_addr, get_order(size)); + free_pages((unsigned long)cpu_addr, get_order(size)); } struct ib_dma_mapping_ops hfi1_dma_mapping_ops = { diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c index 8485de1fce08..914beedb556b 100644 --- a/drivers/staging/rdma/hfi1/driver.c +++ b/drivers/staging/rdma/hfi1/driver.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -56,6 +53,7 @@ #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/prefetch.h> +#include <rdma/ib_verbs.h> #include "hfi.h" #include "trace.h" @@ -162,6 +160,22 @@ const char *get_unit_name(int unit) return iname; } +const char *get_card_name(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return get_unit_name(dd->unit); +} + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return dd->pcidev; +} + /* * Return count of units with at least one port ACTIVE. */ @@ -265,6 +279,8 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, u32 rte = rhf_rcv_type_err(packet->rhf); int lnh = be16_to_cpu(rhdr->lrh[0]) & 3; struct hfi1_ibport *ibp = &ppd->ibport_data; + struct hfi1_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) return; @@ -283,9 +299,9 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, goto drop; /* Check for GRH */ - if (lnh == HFI1_LRH_BTH) + if (lnh == HFI1_LRH_BTH) { ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) { + } else if (lnh == HFI1_LRH_GRH) { u32 vtf; ohdr = &hdr->u.l.oth; @@ -295,17 +311,17 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) goto drop; rcv_flags |= HFI1_HAS_GRH; - } else + } else { goto drop; - + } /* Get the destination QP number. */ - qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; - if (lid < HFI1_MULTICAST_LID_BASE) { - struct hfi1_qp *qp; + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct rvt_qp *qp; unsigned long flags; rcu_read_lock(); - qp = hfi1_lookup_qpn(ibp, qp_num); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); if (!qp) { rcu_read_unlock(); goto drop; @@ -318,9 +334,9 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, spin_lock_irqsave(&qp->r_lock, flags); /* Check for valid receive state. */ - if (!(ib_hfi1_state_ops[qp->state] & - HFI1_PROCESS_RECV_OK)) { - ibp->n_pkt_drops++; + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; } switch (qp->ibqp.qp_type) { @@ -352,7 +368,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, if (rhf_use_egr_bfr(packet->rhf)) ebuf = packet->ebuf; - if (ebuf == NULL) + if (!ebuf) goto drop; /* this should never happen */ if (lnh == HFI1_LRH_BTH) @@ -368,9 +384,9 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, if (opcode == IB_OPCODE_CNP) { /* * Only in pre-B0 h/w is the CNP_OPCODE handled - * via this code path (errata 291394). + * via this code path. */ - struct hfi1_qp *qp = NULL; + struct rvt_qp *qp = NULL; u32 lqpn, rqpn; u16 rlid; u8 svc_type, sl, sc5; @@ -380,10 +396,10 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, sc5 |= 0x10; sl = ibp->sc_to_sl[sc5]; - lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK; + lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK; rcu_read_lock(); - qp = hfi1_lookup_qpn(ibp, lqpn); - if (qp == NULL) { + qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn); + if (!qp) { rcu_read_unlock(); goto drop; } @@ -419,9 +435,8 @@ drop: } static inline void init_packet(struct hfi1_ctxtdata *rcd, - struct hfi1_packet *packet) + struct hfi1_packet *packet) { - packet->rsize = rcd->rcvhdrqentsize; /* words */ packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */ packet->rcd = rcd; @@ -434,12 +449,7 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd, packet->rcv_flags = 0; } -#ifndef CONFIG_PRESCAN_RXQ -static void prescan_rxq(struct hfi1_packet *packet) {} -#else /* !CONFIG_PRESCAN_RXQ */ -static int prescan_receive_queue; - -static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr, +static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, struct hfi1_other_headers *ohdr, u64 rhf, u32 bth1, struct ib_grh *grh) { @@ -453,7 +463,7 @@ static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr, case IB_QPT_GSI: case IB_QPT_UD: rlid = be16_to_cpu(hdr->lrh[3]); - rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK; + rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; svc_type = IB_CC_SVCTYPE_UD; break; case IB_QPT_UC: @@ -483,7 +493,7 @@ static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr, if (bth1 & HFI1_BECN_SMASK) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 lqpn = bth1 & HFI1_QPN_MASK; + u32 lqpn = bth1 & RVT_QPN_MASK; u8 sl = ibp->sc_to_sl[sc5]; process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); @@ -562,26 +572,31 @@ static inline void update_ps_mdata(struct ps_mdata *mdata, * containing Excplicit Congestion Notifications (FECNs, or BECNs). * When an ECN is found, process the Congestion Notification, and toggle * it off. + * This is declared as a macro to allow quick checking of the port to avoid + * the overhead of a function call if not enabled. */ -static void prescan_rxq(struct hfi1_packet *packet) +#define prescan_rxq(rcd, packet) \ + do { \ + if (rcd->ppd->cc_prescan) \ + __prescan_rxq(packet); \ + } while (0) +static void __prescan_rxq(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; struct ps_mdata mdata; - if (!prescan_receive_queue) - return; - init_ps_mdata(&mdata, packet); while (1) { struct hfi1_devdata *dd = rcd->dd; struct hfi1_ibport *ibp = &rcd->ppd->ibport_data; - __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head + + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + dd->rhf_offset; - struct hfi1_qp *qp; + struct rvt_qp *qp; struct hfi1_ib_header *hdr; struct hfi1_other_headers *ohdr; struct ib_grh *grh = NULL; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn, bth1; int is_ecn = 0; @@ -600,25 +615,25 @@ static void prescan_rxq(struct hfi1_packet *packet) hfi1_get_msgheader(dd, rhf_addr); lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_BTH) + if (lnh == HFI1_LRH_BTH) { ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) { + } else if (lnh == HFI1_LRH_GRH) { ohdr = &hdr->u.l.oth; grh = &hdr->u.l.grh; - } else + } else { goto next; /* just in case */ - + } bth1 = be32_to_cpu(ohdr->bth[1]); is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK)); if (!is_ecn) goto next; - qpn = bth1 & HFI1_QPN_MASK; + qpn = bth1 & RVT_QPN_MASK; rcu_read_lock(); - qp = hfi1_lookup_qpn(ibp, qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn); - if (qp == NULL) { + if (!qp) { rcu_read_unlock(); goto next; } @@ -633,7 +648,6 @@ next: update_ps_mdata(&mdata, rcd); } } -#endif /* CONFIG_PRESCAN_RXQ */ static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread) { @@ -683,8 +697,9 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) * The +2 is the size of the RHF. */ prefetch_range(packet->ebuf, - packet->tlen - ((packet->rcd->rcvhdrqentsize - - (rhf_hdrq_offset(packet->rhf)+2)) * 4)); + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); } /* @@ -712,7 +727,7 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) } } - packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + packet->rcd->dd->rhf_offset; packet->rhf = rhf_to_cpu(packet->rhf_addr); @@ -737,7 +752,6 @@ static inline void process_rcv_update(int last, struct hfi1_packet *packet) static inline void finish_packet(struct hfi1_packet *packet) { - /* * Nothing we need to free for the packet. * @@ -746,14 +760,12 @@ static inline void finish_packet(struct hfi1_packet *packet) */ update_usrhead(packet->rcd, packet->rcd->head, packet->updegr, packet->etail, rcv_intr_dynamic, packet->numpkt); - } static inline void process_rcv_qp_work(struct hfi1_packet *packet) { - struct hfi1_ctxtdata *rcd; - struct hfi1_qp *qp, *nqp; + struct rvt_qp *qp, *nqp; rcd = packet->rcd; rcd->head = packet->rhqoff; @@ -764,17 +776,17 @@ static inline void process_rcv_qp_work(struct hfi1_packet *packet) */ list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { list_del_init(&qp->rspwait); - if (qp->r_flags & HFI1_R_RSP_DEFERED_ACK) { - qp->r_flags &= ~HFI1_R_RSP_DEFERED_ACK; + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; hfi1_send_rc_ack(rcd, qp, 0); } - if (qp->r_flags & HFI1_R_RSP_SEND) { + if (qp->r_flags & RVT_R_RSP_SEND) { unsigned long flags; - qp->r_flags &= ~HFI1_R_RSP_SEND; + qp->r_flags &= ~RVT_R_RSP_SEND; spin_lock_irqsave(&qp->s_lock, flags); - if (ib_hfi1_state_ops[qp->state] & - HFI1_PROCESS_OR_FLUSH_SEND) + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); } @@ -799,7 +811,7 @@ int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread) goto bail; } - prescan_rxq(&packet); + prescan_rxq(rcd, &packet); while (last == RCV_PKT_OK) { last = process_rcv_packet(&packet, thread); @@ -830,7 +842,7 @@ int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread) } smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ - prescan_rxq(&packet); + prescan_rxq(rcd, &packet); while (last == RCV_PKT_OK) { last = process_rcv_packet(&packet, thread); @@ -862,6 +874,37 @@ static inline void set_all_dma_rtail(struct hfi1_devdata *dd) &handle_receive_interrupt_dma_rtail; } +void set_all_slowpath(struct hfi1_devdata *dd) +{ + int i; + + /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */ + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = &handle_receive_interrupt; +} + +static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, + struct hfi1_packet packet, + struct hfi1_devdata *dd) +{ + struct work_struct *lsaw = &rcd->ppd->linkstate_active_work; + struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd, + packet.rhf_addr); + + if (hdr2sc(hdr, packet.rhf) != 0xf) { + int hwstate = read_logical_state(dd); + + if (hwstate != LSTATE_ACTIVE) { + dd_dev_info(dd, "Unexpected link state %d\n", hwstate); + return 0; + } + + queue_work(rcd->ppd->hfi1_wq, lsaw); + return 1; + } + return 0; +} + /* * handle_receive_interrupt - receive a packet * @rcd: the context @@ -910,17 +953,17 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) } } - prescan_rxq(&packet); + prescan_rxq(rcd, &packet); while (last == RCV_PKT_OK) { - - if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet, - DROP_PACKET_OFF) == DROP_PACKET_ON)) { + if (unlikely(dd->do_drop && + atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) == + DROP_PACKET_ON)) { dd->do_drop = 0; /* On to the next packet */ packet.rhqoff += packet.rsize; - packet.rhf_addr = (__le32 *) rcd->rcvhdrq + + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + packet.rhqoff + dd->rhf_offset; packet.rhf = rhf_to_cpu(packet.rhf_addr); @@ -929,6 +972,11 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) last = skip_rcv_packet(&packet, thread); skip_pkt = 0; } else { + /* Auto activate link on non-SC15 packet receive */ + if (unlikely(rcd->ppd->host_link_state == + HLS_UP_ARMED) && + set_armed_to_active(rcd, packet, dd)) + goto bail; last = process_rcv_packet(&packet, thread); } @@ -940,8 +988,7 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) if (seq != rcd->seq_cnt) last = RCV_PKT_DONE; if (needset) { - dd_dev_info(dd, - "Switching to NO_DMA_RTAIL\n"); + dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n"); set_all_nodma_rtail(dd); needset = 0; } @@ -984,6 +1031,42 @@ bail: } /* + * We may discover in the interrupt that the hardware link state has + * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet), + * and we need to update the driver's notion of the link state. We cannot + * run set_link_state from interrupt context, so we queue this function on + * a workqueue. + * + * We delay the regular interrupt processing until after the state changes + * so that the link will be in the correct state by the time any application + * we wake up attempts to send a reply to any message it received. + * (Subsequent receive interrupts may possibly force the wakeup before we + * update the link state.) + * + * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes + * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues, + * so we're safe from use-after-free of the rcd. + */ +void receive_interrupt_work(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + linkstate_active_work); + struct hfi1_devdata *dd = ppd->dd; + int i; + + /* Received non-SC15 packet implies neighbor_normal */ + ppd->neighbor_normal = 1; + set_link_state(ppd, HLS_UP_ACTIVE); + + /* + * Interrupt all kernel contexts that could have had an + * interrupt during auto activation. + */ + for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++) + force_recv_intr(dd->rcd[i]); +} + +/* * Convert a given MTU size to the on-wire MAD packet enumeration. * Return -1 if the size is invalid. */ @@ -1037,9 +1120,9 @@ int set_mtu(struct hfi1_pportdata *ppd) ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd); mutex_lock(&ppd->hls_lock); - if (ppd->host_link_state == HLS_UP_INIT - || ppd->host_link_state == HLS_UP_ARMED - || ppd->host_link_state == HLS_UP_ACTIVE) + if (ppd->host_link_state == HLS_UP_INIT || + ppd->host_link_state == HLS_UP_ARMED || + ppd->host_link_state == HLS_UP_ACTIVE) is_up = 1; drain = !is_ax(dd) && is_up; @@ -1082,79 +1165,80 @@ int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) return 0; } -/* - * Following deal with the "obviously simple" task of overriding the state - * of the LEDs, which normally indicate link physical and logical status. - * The complications arise in dealing with different hardware mappings - * and the board-dependent routine being called from interrupts. - * and then there's the requirement to _flash_ them. - */ -#define LED_OVER_FREQ_SHIFT 8 -#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT) -/* Below is "non-zero" to force override, but both actual LEDs are off */ -#define LED_OVER_BOTH_OFF (8) +void shutdown_led_override(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + if (atomic_read(&ppd->led_override_timer_active)) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } + + /* Hand control of the LED to the DC for normal operation */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); +} static void run_led_override(unsigned long opaque) { struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque; struct hfi1_devdata *dd = ppd->dd; - int timeoff; - int ph_idx; + unsigned long timeout; + int phase_idx; if (!(dd->flags & HFI1_INITTED)) return; - ph_idx = ppd->led_override_phase++ & 1; - ppd->led_override = ppd->led_override_vals[ph_idx]; - timeoff = ppd->led_override_timeoff; + phase_idx = ppd->led_override_phase & 1; - /* - * don't re-fire the timer if user asked for it to be off; we let - * it fire one more time after they turn it off to simplify - */ - if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) - mod_timer(&ppd->led_override_timer, jiffies + timeoff); + setextled(dd, phase_idx); + + timeout = ppd->led_override_vals[phase_idx]; + + /* Set up for next phase */ + ppd->led_override_phase = !ppd->led_override_phase; + + mod_timer(&ppd->led_override_timer, jiffies + timeout); } -void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val) +/* + * To have the LED blink in a particular pattern, provide timeon and timeoff + * in milliseconds. + * To turn off custom blinking and return to normal operation, use + * shutdown_led_override() + */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff) { - struct hfi1_devdata *dd = ppd->dd; - int timeoff, freq; - - if (!(dd->flags & HFI1_INITTED)) + if (!(ppd->dd->flags & HFI1_INITTED)) return; - /* First check if we are blinking. If not, use 1HZ polling */ - timeoff = HZ; - freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + /* Convert to jiffies for direct use in timer */ + ppd->led_override_vals[0] = msecs_to_jiffies(timeoff); + ppd->led_override_vals[1] = msecs_to_jiffies(timeon); - if (freq) { - /* For blink, set each phase from one nybble of val */ - ppd->led_override_vals[0] = val & 0xF; - ppd->led_override_vals[1] = (val >> 4) & 0xF; - timeoff = (HZ << 4)/freq; - } else { - /* Non-blink set both phases the same. */ - ppd->led_override_vals[0] = val & 0xF; - ppd->led_override_vals[1] = val & 0xF; - } - ppd->led_override_timeoff = timeoff; + /* Arbitrarily start from LED on phase */ + ppd->led_override_phase = 1; /* * If the timer has not already been started, do so. Use a "quick" - * timeout so the function will be called soon, to look at our request. + * timeout so the handler will be called soon to look at our request. */ - if (atomic_inc_return(&ppd->led_override_timer_active) == 1) { - /* Need to start timer */ + if (!timer_pending(&ppd->led_override_timer)) { setup_timer(&ppd->led_override_timer, run_led_override, - (unsigned long)ppd); - + (unsigned long)ppd); ppd->led_override_timer.expires = jiffies + 1; add_timer(&ppd->led_override_timer); - } else { - if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) - mod_timer(&ppd->led_override_timer, jiffies + 1); - atomic_dec(&ppd->led_override_timer_active); + atomic_set(&ppd->led_override_timer_active, 1); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); } } @@ -1184,8 +1268,8 @@ int hfi1_reset_device(int unit) if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) { dd_dev_info(dd, - "Invalid unit number %u or not initialized or not present\n", - unit); + "Invalid unit number %u or not initialized or not present\n", + unit); ret = -ENXIO; goto bail; } @@ -1203,14 +1287,8 @@ int hfi1_reset_device(int unit) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; - if (atomic_read(&ppd->led_override_timer_active)) { - /* Need to stop LED timer, _then_ shut off LEDs */ - del_timer_sync(&ppd->led_override_timer); - atomic_set(&ppd->led_override_timer_active, 0); - } - /* Shut off LEDs after we are sure timer is not running */ - ppd->led_override = LED_OVER_BOTH_OFF; + shutdown_led_override(ppd); } if (dd->flags & HFI1_HAS_SEND_DMA) sdma_exit(dd); @@ -1221,11 +1299,11 @@ int hfi1_reset_device(int unit) if (ret) dd_dev_err(dd, - "Reinitialize unit %u after reset failed with %d\n", - unit, ret); + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); else dd_dev_info(dd, "Reinitialized unit %u after resetting\n", - unit); + unit); bail: return ret; @@ -1282,7 +1360,7 @@ int process_receive_bypass(struct hfi1_packet *packet) handle_eflags(packet); dd_dev_err(packet->rcd->dd, - "Bypass packets are not supported in normal operation. Dropping\n"); + "Bypass packets are not supported in normal operation. Dropping\n"); return RHF_RCV_CONTINUE; } @@ -1320,6 +1398,6 @@ int kdeth_process_eager(struct hfi1_packet *packet) int process_receive_invalid(struct hfi1_packet *packet) { dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", - rhf_rcv_type(packet->rhf)); + rhf_rcv_type(packet->rhf)); return RHF_RCV_CONTINUE; } diff --git a/drivers/staging/rdma/hfi1/efivar.c b/drivers/staging/rdma/hfi1/efivar.c index 7dc5bae220e0..3f014f96f9e0 100644 --- a/drivers/staging/rdma/hfi1/efivar.c +++ b/drivers/staging/rdma/hfi1/efivar.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -128,13 +125,12 @@ static int read_efi_var(const char *name, unsigned long *size, * temporary buffer. Now allocate a correctly sized * buffer. */ - data = kmalloc(temp_size, GFP_KERNEL); + data = kmemdup(temp_buffer, temp_size, GFP_KERNEL); if (!data) { ret = -ENOMEM; goto fail; } - memcpy(data, temp_buffer, temp_size); *size = temp_size; *return_data = data; diff --git a/drivers/staging/rdma/hfi1/efivar.h b/drivers/staging/rdma/hfi1/efivar.h index 070706225c51..94e9e70de568 100644 --- a/drivers/staging/rdma/hfi1/efivar.h +++ b/drivers/staging/rdma/hfi1/efivar.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c index fb620c97f592..bd8771570f81 100644 --- a/drivers/staging/rdma/hfi1/eprom.c +++ b/drivers/staging/rdma/hfi1/eprom.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -99,17 +96,17 @@ /* sleep length while waiting for controller */ #define WAIT_SLEEP_US 100 /* must be larger than 5 (see usage) */ -#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US)) +#define COUNT_DELAY_SEC(n) ((n) * (1000000 / WAIT_SLEEP_US)) /* GPIO pins */ -#define EPROM_WP_N (1ull << 14) /* EPROM write line */ +#define EPROM_WP_N BIT_ULL(14) /* EPROM write line */ /* - * Use the EP mutex to guard against other callers from within the driver. - * Also covers usage of eprom_available. + * How long to wait for the EPROM to become available, in ms. + * The spec 32 Mb EPROM takes around 40s to erase then write. + * Double it for safety. */ -static DEFINE_MUTEX(eprom_mutex); -static int eprom_available; /* default: not available */ +#define EPROM_TIMEOUT 80000 /* ms */ /* * Turn on external enable line that allows writing on the flash. @@ -117,11 +114,9 @@ static int eprom_available; /* default: not available */ static void write_enable(struct hfi1_devdata *dd) { /* raise signal */ - write_csr(dd, ASIC_GPIO_OUT, - read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N); + write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N); /* raise enable */ - write_csr(dd, ASIC_GPIO_OE, - read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N); + write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N); } /* @@ -130,11 +125,9 @@ static void write_enable(struct hfi1_devdata *dd) static void write_disable(struct hfi1_devdata *dd) { /* lower signal */ - write_csr(dd, ASIC_GPIO_OUT, - read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N); + write_csr(dd, ASIC_GPIO_OUT, read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N); /* lower enable */ - write_csr(dd, ASIC_GPIO_OE, - read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N); + write_csr(dd, ASIC_GPIO_OE, read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N); } /* @@ -212,8 +205,8 @@ static int erase_range(struct hfi1_devdata *dd, u32 start, u32 len) /* check the end points for the minimum erase */ if ((start & MASK_4KB) || (end & MASK_4KB)) { dd_dev_err(dd, - "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n", - __func__, start, end); + "%s: non-aligned range (0x%x,0x%x) for a 4KB erase\n", + __func__, start, end); return -EINVAL; } @@ -256,7 +249,7 @@ static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) int i; write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset)); - for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++) + for (i = 0; i < EP_PAGE_SIZE / sizeof(u32); i++) result[i] = (u32)read_csr(dd, ASIC_EEP_DATA); write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */ } @@ -267,7 +260,7 @@ static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) { u32 offset; - u32 buffer[EP_PAGE_SIZE/sizeof(u32)]; + u32 buffer[EP_PAGE_SIZE / sizeof(u32)]; int ret = 0; /* reject anything not on an EPROM page boundary */ @@ -277,7 +270,7 @@ static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { read_page(dd, start + offset, buffer); if (copy_to_user((void __user *)(addr + offset), - buffer, EP_PAGE_SIZE)) { + buffer, EP_PAGE_SIZE)) { ret = -EFAULT; goto done; } @@ -298,7 +291,7 @@ static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data) write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); write_csr(dd, ASIC_EEP_DATA, data[0]); write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset)); - for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++) + for (i = 1; i < EP_PAGE_SIZE / sizeof(u32); i++) write_csr(dd, ASIC_EEP_DATA, data[i]); /* will close the open page */ return wait_for_not_busy(dd); @@ -310,7 +303,7 @@ static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data) static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) { u32 offset; - u32 buffer[EP_PAGE_SIZE/sizeof(u32)]; + u32 buffer[EP_PAGE_SIZE / sizeof(u32)]; int ret = 0; /* reject anything not on an EPROM page boundary */ @@ -321,7 +314,7 @@ static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { if (copy_from_user(buffer, (void __user *)(addr + offset), - EP_PAGE_SIZE)) { + EP_PAGE_SIZE)) { ret = -EFAULT; goto done; } @@ -353,44 +346,42 @@ static inline u32 extract_rstart(u32 composite) * * Return 0 on success, -ERRNO on error */ -int handle_eprom_command(const struct hfi1_cmd *cmd) +int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd) { struct hfi1_devdata *dd; u32 dev_id; u32 rlen; /* range length */ u32 rstart; /* range start */ + int i_minor; int ret = 0; /* - * The EPROM is per-device, so use unit 0 as that will always - * exist. + * Map the device file to device data using the relative minor. + * The device file minor number is the unit number + 1. 0 is + * the generic device file - reject it. */ - dd = hfi1_lookup(0); + i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; + if (i_minor <= 0) + return -EINVAL; + dd = hfi1_lookup(i_minor - 1); if (!dd) { - pr_err("%s: cannot find unit 0!\n", __func__); + pr_err("%s: cannot find unit %d!\n", __func__, i_minor); return -EINVAL; } - /* lock against other callers touching the ASIC block */ - mutex_lock(&eprom_mutex); - - /* some platforms do not have an EPROM */ - if (!eprom_available) { - ret = -ENOSYS; - goto done_asic; - } + /* some devices do not have an EPROM */ + if (!dd->eprom_available) + return -EOPNOTSUPP; - /* lock against the other HFI on another OS */ - ret = acquire_hw_mutex(dd); + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); if (ret) { - dd_dev_err(dd, - "%s: unable to acquire hw mutex, no EPROM support\n", - __func__); + dd_dev_err(dd, "%s: unable to acquire EPROM resource\n", + __func__); goto done_asic; } dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n", - __func__, cmd->type, cmd->len, cmd->addr); + __func__, cmd->type, cmd->len, cmd->addr); switch (cmd->type) { case HFI1_CMD_EP_INFO: @@ -401,7 +392,7 @@ int handle_eprom_command(const struct hfi1_cmd *cmd) dev_id = read_device_id(dd); /* addr points to a u32 user buffer */ if (copy_to_user((void __user *)cmd->addr, &dev_id, - sizeof(u32))) + sizeof(u32))) ret = -EFAULT; break; @@ -429,14 +420,13 @@ int handle_eprom_command(const struct hfi1_cmd *cmd) default: dd_dev_err(dd, "%s: unexpected command %d\n", - __func__, cmd->type); + __func__, cmd->type); ret = -EINVAL; break; } - release_hw_mutex(dd); + release_chip_resource(dd, CR_EPROM); done_asic: - mutex_unlock(&eprom_mutex); return ret; } @@ -447,44 +437,35 @@ int eprom_init(struct hfi1_devdata *dd) { int ret = 0; - /* only the discrete chip has an EPROM, nothing to do */ + /* only the discrete chip has an EPROM */ if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) return 0; - /* lock against other callers */ - mutex_lock(&eprom_mutex); - if (eprom_available) /* already initialized */ - goto done_asic; - /* - * Lock against the other HFI on another OS - the mutex above - * would have caught anything in this driver. It is OK if - * both OSes reset the EPROM - as long as they don't do it at - * the same time. + * It is OK if both HFIs reset the EPROM as long as they don't + * do it at the same time. */ - ret = acquire_hw_mutex(dd); + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); if (ret) { dd_dev_err(dd, - "%s: unable to acquire hw mutex, no EPROM support\n", - __func__); + "%s: unable to acquire EPROM resource, no EPROM support\n", + __func__); goto done_asic; } /* reset EPROM to be sure it is in a good state */ /* set reset */ - write_csr(dd, ASIC_EEP_CTL_STAT, - ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); /* clear reset, set speed */ write_csr(dd, ASIC_EEP_CTL_STAT, - EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); /* wake the device with command "release powerdown NoID" */ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); - eprom_available = 1; - release_hw_mutex(dd); + dd->eprom_available = true; + release_chip_resource(dd, CR_EPROM); done_asic: - mutex_unlock(&eprom_mutex); return ret; } diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h index 64a64276be81..d41f0b1afb15 100644 --- a/drivers/staging/rdma/hfi1/eprom.h +++ b/drivers/staging/rdma/hfi1/eprom.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -52,4 +49,4 @@ struct hfi1_cmd; struct hfi1_devdata; int eprom_init(struct hfi1_devdata *dd); -int handle_eprom_command(const struct hfi1_cmd *cmd); +int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd); diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c index d57d549052c8..e460261f94b7 100644 --- a/drivers/staging/rdma/hfi1/file_ops.c +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -60,6 +57,8 @@ #include "user_sdma.h" #include "user_exp_rcv.h" #include "eprom.h" +#include "aspm.h" +#include "mmu_rb.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -96,9 +95,6 @@ static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); static int vma_fault(struct vm_area_struct *, struct vm_fault *); -static int exp_tid_setup(struct file *, struct hfi1_tid_info *); -static int exp_tid_free(struct file *, struct hfi1_tid_info *); -static void unlock_exp_tids(struct hfi1_ctxtdata *); static const struct file_operations hfi1_file_ops = { .owner = THIS_MODULE, @@ -164,7 +160,6 @@ enum mmap_types { #define dbg(fmt, ...) \ pr_info(fmt, ##__VA_ARGS__) - static inline int is_valid_mmap(u64 token) { return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); @@ -188,6 +183,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, struct hfi1_cmd cmd; struct hfi1_user_info uinfo; struct hfi1_tid_info tinfo; + unsigned long addr; ssize_t consumed = 0, copy = 0, ret = 0; void *dest = NULL; __u64 user_val = 0; @@ -219,6 +215,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, break; case HFI1_CMD_TID_UPDATE: case HFI1_CMD_TID_FREE: + case HFI1_CMD_TID_INVAL_READ: copy = sizeof(tinfo); dest = &tinfo; break; @@ -294,9 +291,8 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, sc_return_credits(uctxt->sc); break; case HFI1_CMD_TID_UPDATE: - ret = exp_tid_setup(fp, &tinfo); + ret = hfi1_user_exp_rcv_setup(fp, &tinfo); if (!ret) { - unsigned long addr; /* * Copy the number of tidlist entries we used * and the length of the buffer we registered. @@ -311,8 +307,25 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, ret = -EFAULT; } break; + case HFI1_CMD_TID_INVAL_READ: + ret = hfi1_user_exp_rcv_invalid(fp, &tinfo); + if (ret) + break; + addr = (unsigned long)cmd.addr + + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + break; case HFI1_CMD_TID_FREE: - ret = exp_tid_free(fp, &tinfo); + ret = hfi1_user_exp_rcv_clear(fp, &tinfo); + if (ret) + break; + addr = (unsigned long)cmd.addr + + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; break; case HFI1_CMD_RECV_CTRL: ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val); @@ -373,8 +386,10 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, break; } if (dd->flags & HFI1_FORCED_FREEZE) { - /* Don't allow context reset if we are into - * forced freeze */ + /* + * Don't allow context reset if we are into + * forced freeze + */ ret = -ENODEV; break; } @@ -382,8 +397,9 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, ret = sc_enable(sc); hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt->ctxt); - } else + } else { ret = sc_restart(sc); + } if (!ret) sc_return_credits(sc); break; @@ -393,7 +409,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data, case HFI1_CMD_EP_ERASE_RANGE: case HFI1_CMD_EP_READ_RANGE: case HFI1_CMD_EP_WRITE_RANGE: - ret = handle_eprom_command(&cmd); + ret = handle_eprom_command(fp, &cmd); break; } @@ -733,6 +749,9 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) /* drain user sdma queue */ hfi1_user_sdma_free_queues(fdata); + /* release the cpu */ + hfi1_put_proc_affinity(dd, fdata->rec_cpu_num); + /* * Clear any left over, unhandled events so the next process that * gets this context doesn't get confused. @@ -756,6 +775,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | HFI1_RCVCTRL_TIDFLOW_DIS | HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); @@ -778,14 +798,12 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) uctxt->pionowait = 0; uctxt->event_flags = 0; - hfi1_clear_tids(uctxt); + hfi1_user_exp_rcv_free(fdata); hfi1_clear_ctxt_pkey(dd, uctxt->ctxt); - if (uctxt->tid_pg_list) - unlock_exp_tids(uctxt); - hfi1_stats.sps_ctxts--; - dd->freectxts++; + if (++dd->freectxts == dd->num_user_contexts) + aspm_enable_all(dd); mutex_unlock(&hfi1_mutex); hfi1_free_ctxtdata(dd, uctxt); done: @@ -827,8 +845,16 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) mutex_lock(&hfi1_mutex); /* First, lets check if we need to setup a shared context? */ - if (uinfo->subctxt_cnt) + if (uinfo->subctxt_cnt) { + struct hfi1_filedata *fd = fp->private_data; + ret = find_shared_ctxt(fp, uinfo); + if (ret < 0) + goto done_unlock; + if (ret) + fd->rec_cpu_num = hfi1_get_proc_affinity( + fd->uctxt->dd, fd->uctxt->numa_id); + } /* * We execute the following block if we couldn't find a @@ -838,6 +864,7 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; ret = get_user_context(fp, uinfo, i_minor - 1, alg); } +done_unlock: mutex_unlock(&hfi1_mutex); done: return ret; @@ -963,7 +990,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, struct hfi1_filedata *fd = fp->private_data; struct hfi1_ctxtdata *uctxt; unsigned ctxt; - int ret; + int ret, numa; if (dd->flags & HFI1_FROZEN) { /* @@ -983,17 +1010,26 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, if (ctxt == dd->num_rcv_contexts) return -EBUSY; - uctxt = hfi1_create_ctxtdata(dd->pport, ctxt); + fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1); + if (fd->rec_cpu_num != -1) + numa = cpu_to_node(fd->rec_cpu_num); + else + numa = numa_node_id(); + uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa); if (!uctxt) { dd_dev_err(dd, "Unable to allocate ctxtdata memory, failing open\n"); return -ENOMEM; } + hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", + uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, + uctxt->numa_id); + /* * Allocate and enable a PIO send context. */ uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, - uctxt->numa_id); + uctxt->dd->node); if (!uctxt->sc) return -ENOMEM; @@ -1027,7 +1063,12 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, INIT_LIST_HEAD(&uctxt->sdma_queues); spin_lock_init(&uctxt->sdma_qlock); hfi1_stats.sps_ctxts++; - dd->freectxts--; + /* + * Disable ASPM when there are open user/PSM contexts to avoid + * issues with ASPM L1 exit latency + */ + if (dd->freectxts-- == dd->num_user_contexts) + aspm_disable_all(dd); fd->uctxt = uctxt; return 0; @@ -1036,22 +1077,19 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, static int init_subctxts(struct hfi1_ctxtdata *uctxt, const struct hfi1_user_info *uinfo) { - int ret = 0; unsigned num_subctxts; num_subctxts = uinfo->subctxt_cnt; - if (num_subctxts > HFI1_MAX_SHARED_CTXTS) { - ret = -EINVAL; - goto bail; - } + if (num_subctxts > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; uctxt->subctxt_cnt = uinfo->subctxt_cnt; uctxt->subctxt_id = uinfo->subctxt_id; uctxt->active_slaves = 1; uctxt->redirect_seq_cnt = 1; set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); -bail: - return ret; + + return 0; } static int setup_subctxt(struct hfi1_ctxtdata *uctxt) @@ -1106,10 +1144,10 @@ static int user_init(struct file *fp) * has done it. */ if (fd->subctxt) { - ret = wait_event_interruptible(uctxt->wait, - !test_bit(HFI1_CTXT_MASTER_UNINIT, - &uctxt->event_flags)); - goto done; + ret = wait_event_interruptible(uctxt->wait, !test_bit( + HFI1_CTXT_MASTER_UNINIT, + &uctxt->event_flags)); + goto expected; } /* initialize poll variables... */ @@ -1147,8 +1185,16 @@ static int user_init(struct file *fp) rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + /* + * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. + * We can't rely on the correct value to be set from prior + * uses of the chip or ctxt. Therefore, add the rcvctrl op + * for both cases. + */ if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + else + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); /* Notify any waiting slaves */ @@ -1156,8 +1202,18 @@ static int user_init(struct file *fp) clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); wake_up(&uctxt->wait); } - ret = 0; +expected: + /* + * Expected receive has to be setup for all processes (including + * shared contexts). However, it has to be done after the master + * context has been fully configured as it depends on the + * eager/expected split of the RcvArray entries. + * Setting it up here ensures that the subcontexts will be waiting + * (due to the above wait_event_interruptible() until the master + * is setup. + */ + ret = hfi1_user_exp_rcv_init(fp); done: return ret; } @@ -1227,46 +1283,6 @@ static int setup_ctxt(struct file *fp) if (ret) goto done; } - /* Setup Expected Rcv memories */ - uctxt->tid_pg_list = vzalloc(uctxt->expected_count * - sizeof(struct page **)); - if (!uctxt->tid_pg_list) { - ret = -ENOMEM; - goto done; - } - uctxt->physshadow = vzalloc(uctxt->expected_count * - sizeof(*uctxt->physshadow)); - if (!uctxt->physshadow) { - ret = -ENOMEM; - goto done; - } - /* allocate expected TID map and initialize the cursor */ - atomic_set(&uctxt->tidcursor, 0); - uctxt->numtidgroups = uctxt->expected_count / - dd->rcv_entries.group_size; - uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG + - !!(uctxt->numtidgroups % BITS_PER_LONG); - uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt * - sizeof(*uctxt->tidusemap), - GFP_KERNEL, uctxt->numa_id); - if (!uctxt->tidusemap) { - ret = -ENOMEM; - goto done; - } - /* - * In case that the number of groups is not a multiple of - * 64 (the number of groups in a tidusemap element), mark - * the extra ones as used. This will effectively make them - * permanently used and should never be assigned. Otherwise, - * the code which checks how many free groups we have will - * get completely confused about the state of the bits. - */ - if (uctxt->numtidgroups % BITS_PER_LONG) - uctxt->tidusemap[uctxt->tidmapcnt - 1] = - ~((1ULL << (uctxt->numtidgroups % - BITS_PER_LONG)) - 1); - trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, - uctxt->tidusemap, uctxt->tidmapcnt); } ret = hfi1_user_sdma_alloc_queues(uctxt, fp); if (ret) @@ -1392,8 +1408,9 @@ static unsigned int poll_next(struct file *fp, set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt); pollflag = 0; - } else + } else { pollflag = POLLIN | POLLRDNORM; + } spin_unlock_irq(&dd->uctxt_lock); return pollflag; @@ -1471,8 +1488,9 @@ static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt, if (uctxt->rcvhdrtail_kvaddr) clear_rcvhdrtail(uctxt); rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; - } else + } else { rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + } hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt); /* always; new head should be equal to new tail; see above */ bail: @@ -1505,367 +1523,6 @@ static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt, return 0; } -#define num_user_pages(vaddr, len) \ - (1 + (((((unsigned long)(vaddr) + \ - (unsigned long)(len) - 1) & PAGE_MASK) - \ - ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) - -/** - * tzcnt - count the number of trailing zeros in a 64bit value - * @value: the value to be examined - * - * Returns the number of trailing least significant zeros in the - * the input value. If the value is zero, return the number of - * bits of the value. - */ -static inline u8 tzcnt(u64 value) -{ - return value ? __builtin_ctzl(value) : sizeof(value) * 8; -} - -static inline unsigned num_free_groups(unsigned long map, u16 *start) -{ - unsigned free; - u16 bitidx = *start; - - if (bitidx >= BITS_PER_LONG) - return 0; - /* "Turn off" any bits set before our bit index */ - map &= ~((1ULL << bitidx) - 1); - free = tzcnt(map) - bitidx; - while (!free && bitidx < BITS_PER_LONG) { - /* Zero out the last set bit so we look at the rest */ - map &= ~(1ULL << bitidx); - /* - * Account for the previously checked bits and advance - * the bit index. We don't have to check for bitidx - * getting bigger than BITS_PER_LONG here as it would - * mean extra instructions that we don't need. If it - * did happen, it would push free to a negative value - * which will break the loop. - */ - free = tzcnt(map) - ++bitidx; - } - *start = bitidx; - return free; -} - -static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo) -{ - int ret = 0; - struct hfi1_filedata *fd = fp->private_data; - struct hfi1_ctxtdata *uctxt = fd->uctxt; - struct hfi1_devdata *dd = uctxt->dd; - unsigned tid, mapped = 0, npages, ngroups, exp_groups, - tidpairs = uctxt->expected_count / 2; - struct page **pages; - unsigned long vaddr, tidmap[uctxt->tidmapcnt]; - dma_addr_t *phys; - u32 tidlist[tidpairs], pairidx = 0, tidcursor; - u16 useidx, idx, bitidx, tidcnt = 0; - - vaddr = tinfo->vaddr; - - if (offset_in_page(vaddr)) { - ret = -EINVAL; - goto bail; - } - - npages = num_user_pages(vaddr, tinfo->length); - if (!npages) { - ret = -EINVAL; - goto bail; - } - if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, - npages * PAGE_SIZE)) { - dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", - (void *)vaddr, npages); - ret = -EFAULT; - goto bail; - } - - memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt); - memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs); - - exp_groups = uctxt->expected_count / dd->rcv_entries.group_size; - /* which group set do we look at first? */ - tidcursor = atomic_read(&uctxt->tidcursor); - useidx = (tidcursor >> 16) & 0xffff; - bitidx = tidcursor & 0xffff; - - /* - * Keep going until we've mapped all pages or we've exhausted all - * RcvArray entries. - * This iterates over the number of tidmaps + 1 - * (idx <= uctxt->tidmapcnt) so we check the bitmap which we - * started from one more time for any free bits before the - * starting point bit. - */ - for (mapped = 0, idx = 0; - mapped < npages && idx <= uctxt->tidmapcnt;) { - u64 i, offset = 0; - unsigned free, pinned, pmapped = 0, bits_used; - u16 grp; - - /* - * "Reserve" the needed group bits under lock so other - * processes can't step in the middle of it. Once - * reserved, we don't need the lock anymore since we - * are guaranteed the groups. - */ - spin_lock(&uctxt->exp_lock); - if (uctxt->tidusemap[useidx] == -1ULL || - bitidx >= BITS_PER_LONG) { - /* no free groups in the set, use the next */ - useidx = (useidx + 1) % uctxt->tidmapcnt; - idx++; - bitidx = 0; - spin_unlock(&uctxt->exp_lock); - continue; - } - ngroups = ((npages - mapped) / dd->rcv_entries.group_size) + - !!((npages - mapped) % dd->rcv_entries.group_size); - - /* - * If we've gotten here, the current set of groups does have - * one or more free groups. - */ - free = num_free_groups(uctxt->tidusemap[useidx], &bitidx); - if (!free) { - /* - * Despite the check above, free could still come back - * as 0 because we don't check the entire bitmap but - * we start from bitidx. - */ - spin_unlock(&uctxt->exp_lock); - continue; - } - bits_used = min(free, ngroups); - tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx; - uctxt->tidusemap[useidx] |= tidmap[useidx]; - spin_unlock(&uctxt->exp_lock); - - /* - * At this point, we know where in the map we have free bits. - * properly offset into the various "shadow" arrays and compute - * the RcvArray entry index. - */ - offset = ((useidx * BITS_PER_LONG) + bitidx) * - dd->rcv_entries.group_size; - pages = uctxt->tid_pg_list + offset; - phys = uctxt->physshadow + offset; - tid = uctxt->expected_base + offset; - - /* Calculate how many pages we can pin based on free bits */ - pinned = min((bits_used * dd->rcv_entries.group_size), - (npages - mapped)); - /* - * Now that we know how many free RcvArray entries we have, - * we can pin that many user pages. - */ - ret = hfi1_acquire_user_pages(vaddr + (mapped * PAGE_SIZE), - pinned, true, pages); - if (ret) { - /* - * We can't continue because the pages array won't be - * initialized. This should never happen, - * unless perhaps the user has mpin'ed the pages - * themselves. - */ - dd_dev_info(dd, - "Failed to lock addr %p, %u pages: errno %d\n", - (void *) vaddr, pinned, -ret); - /* - * Let go of the bits that we reserved since we are not - * going to use them. - */ - spin_lock(&uctxt->exp_lock); - uctxt->tidusemap[useidx] &= - ~(((1ULL << bits_used) - 1) << bitidx); - spin_unlock(&uctxt->exp_lock); - goto done; - } - /* - * How many groups do we need based on how many pages we have - * pinned? - */ - ngroups = (pinned / dd->rcv_entries.group_size) + - !!(pinned % dd->rcv_entries.group_size); - /* - * Keep programming RcvArray entries for all the <ngroups> free - * groups. - */ - for (i = 0, grp = 0; grp < ngroups; i++, grp++) { - unsigned j; - u32 pair_size = 0, tidsize; - /* - * This inner loop will program an entire group or the - * array of pinned pages (which ever limit is hit - * first). - */ - for (j = 0; j < dd->rcv_entries.group_size && - pmapped < pinned; j++, pmapped++, tid++) { - tidsize = PAGE_SIZE; - phys[pmapped] = hfi1_map_page(dd->pcidev, - pages[pmapped], 0, - tidsize, PCI_DMA_FROMDEVICE); - trace_hfi1_exp_rcv_set(uctxt->ctxt, - fd->subctxt, - tid, vaddr, - phys[pmapped], - pages[pmapped]); - /* - * Each RcvArray entry is programmed with one - * page * worth of memory. This will handle - * the 8K MTU as well as anything smaller - * due to the fact that both entries in the - * RcvTidPair are programmed with a page. - * PSM currently does not handle anything - * bigger than 8K MTU, so should we even worry - * about 10K here? - */ - hfi1_put_tid(dd, tid, PT_EXPECTED, - phys[pmapped], - ilog2(tidsize >> PAGE_SHIFT) + 1); - pair_size += tidsize >> PAGE_SHIFT; - EXP_TID_RESET(tidlist[pairidx], LEN, pair_size); - if (!(tid % 2)) { - tidlist[pairidx] |= - EXP_TID_SET(IDX, - (tid - uctxt->expected_base) - / 2); - tidlist[pairidx] |= - EXP_TID_SET(CTRL, 1); - tidcnt++; - } else { - tidlist[pairidx] |= - EXP_TID_SET(CTRL, 2); - pair_size = 0; - pairidx++; - } - } - /* - * We've programmed the entire group (or as much of the - * group as we'll use. Now, it's time to push it out... - */ - flush_wc(); - } - mapped += pinned; - atomic_set(&uctxt->tidcursor, - (((useidx & 0xffffff) << 16) | - ((bitidx + bits_used) & 0xffffff))); - } - trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, uctxt->tidusemap, - uctxt->tidmapcnt); - -done: - /* If we've mapped anything, copy relevant info to user */ - if (mapped) { - if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, - tidlist, sizeof(tidlist[0]) * tidcnt)) { - ret = -EFAULT; - goto done; - } - /* copy TID info to user */ - if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap, - tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt)) - ret = -EFAULT; - } -bail: - /* - * Calculate mapped length. New Exp TID protocol does not "unwind" and - * report an error if it can't map the entire buffer. It just reports - * the length that was mapped. - */ - tinfo->length = mapped * PAGE_SIZE; - tinfo->tidcnt = tidcnt; - return ret; -} - -static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo) -{ - struct hfi1_filedata *fd = fp->private_data; - struct hfi1_ctxtdata *uctxt = fd->uctxt; - struct hfi1_devdata *dd = uctxt->dd; - unsigned long tidmap[uctxt->tidmapcnt]; - struct page **pages; - dma_addr_t *phys; - u16 idx, bitidx, tid; - int ret = 0; - - if (copy_from_user(&tidmap, (void __user *)(unsigned long) - tinfo->tidmap, - sizeof(tidmap[0]) * uctxt->tidmapcnt)) { - ret = -EFAULT; - goto done; - } - for (idx = 0; idx < uctxt->tidmapcnt; idx++) { - unsigned long map; - - bitidx = 0; - if (!tidmap[idx]) - continue; - map = tidmap[idx]; - while ((bitidx = tzcnt(map)) < BITS_PER_LONG) { - int i, pcount = 0; - struct page *pshadow[dd->rcv_entries.group_size]; - unsigned offset = ((idx * BITS_PER_LONG) + bitidx) * - dd->rcv_entries.group_size; - - pages = uctxt->tid_pg_list + offset; - phys = uctxt->physshadow + offset; - tid = uctxt->expected_base + offset; - for (i = 0; i < dd->rcv_entries.group_size; - i++, tid++) { - if (pages[i]) { - hfi1_put_tid(dd, tid, PT_INVALID, - 0, 0); - trace_hfi1_exp_rcv_free(uctxt->ctxt, - fd->subctxt, - tid, phys[i], - pages[i]); - pci_unmap_page(dd->pcidev, phys[i], - PAGE_SIZE, PCI_DMA_FROMDEVICE); - pshadow[pcount] = pages[i]; - pages[i] = NULL; - pcount++; - phys[i] = 0; - } - } - flush_wc(); - hfi1_release_user_pages(pshadow, pcount, true); - clear_bit(bitidx, &uctxt->tidusemap[idx]); - map &= ~(1ULL<<bitidx); - } - } - trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 1, uctxt->tidusemap, - uctxt->tidmapcnt); -done: - return ret; -} - -static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt) -{ - struct hfi1_devdata *dd = uctxt->dd; - unsigned tid; - - dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n", - uctxt->ctxt); - for (tid = 0; tid < uctxt->expected_count; tid++) { - struct page *p = uctxt->tid_pg_list[tid]; - dma_addr_t phys; - - if (!p) - continue; - - phys = uctxt->physshadow[tid]; - uctxt->physshadow[tid] = 0; - uctxt->tid_pg_list[tid] = NULL; - pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE); - hfi1_release_user_pages(&p, 1, true); - } -} - static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt, u16 pkey) { @@ -1934,10 +1591,9 @@ static loff_t ui_lseek(struct file *filp, loff_t offset, int whence) return filp->f_pos; } - /* NOTE: assumes unsigned long is 8 bytes */ static ssize_t ui_read(struct file *filp, char __user *buf, size_t count, - loff_t *f_pos) + loff_t *f_pos) { struct hfi1_devdata *dd = filp->private_data; void __iomem *base = dd->kregbase; @@ -1973,12 +1629,12 @@ static ssize_t ui_read(struct file *filp, char __user *buf, size_t count, * them. These registers are defined as having a read value * of 0. */ - else if (csr_off == ASIC_GPIO_CLEAR - || csr_off == ASIC_GPIO_FORCE - || csr_off == ASIC_QSFP1_CLEAR - || csr_off == ASIC_QSFP1_FORCE - || csr_off == ASIC_QSFP2_CLEAR - || csr_off == ASIC_QSFP2_FORCE) + else if (csr_off == ASIC_GPIO_CLEAR || + csr_off == ASIC_GPIO_FORCE || + csr_off == ASIC_QSFP1_CLEAR || + csr_off == ASIC_QSFP1_FORCE || + csr_off == ASIC_QSFP2_CLEAR || + csr_off == ASIC_QSFP2_FORCE) data = 0; else if (csr_off >= barlen) { /* diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c index 28ae42faa018..3040162cb326 100644 --- a/drivers/staging/rdma/hfi1/firmware.c +++ b/drivers/staging/rdma/hfi1/firmware.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -77,7 +74,13 @@ static uint fw_8051_load = 1; static uint fw_fabric_serdes_load = 1; static uint fw_pcie_serdes_load = 1; static uint fw_sbus_load = 1; -static uint platform_config_load = 1; + +/* + * Access required in platform.c + * Maintains state of whether the platform config was fetched via the + * fallback option + */ +uint platform_config_load; /* Firmware file names get set in hfi1_firmware_init() based on the above */ static char *fw_8051_name; @@ -107,6 +110,7 @@ struct css_header { u32 exponent_size; /* in DWORDs */ u32 reserved[22]; }; + /* expected field values */ #define CSS_MODULE_TYPE 0x00000006 #define CSS_HEADER_LEN 0x000000a1 @@ -166,6 +170,7 @@ enum fw_state { FW_FINAL, FW_ERR }; + static enum fw_state fw_state = FW_EMPTY; static int fw_err; static struct firmware_details fw_8051; @@ -193,7 +198,7 @@ static const struct firmware *platform_config; #define RSA_ENGINE_TIMEOUT 100 /* ms */ /* hardware mutex timeout, in ms */ -#define HM_TIMEOUT 4000 /* 4 s */ +#define HM_TIMEOUT 10 /* ms */ /* 8051 memory access timeout, in us */ #define DC8051_ACCESS_TIMEOUT 100 /* us */ @@ -233,6 +238,8 @@ static const u8 all_pcie_serdes_broadcast = 0xe0; /* forwards */ static void dispose_one_firmware(struct firmware_details *fdet); +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet); /* * Read a single 64-bit value from 8051 data memory. @@ -372,8 +379,8 @@ static int invalid_header(struct hfi1_devdata *dd, const char *what, return 0; dd_dev_err(dd, - "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", - what, expected, actual); + "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", + what, expected, actual); return 1; } @@ -383,19 +390,19 @@ static int invalid_header(struct hfi1_devdata *dd, const char *what, static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css) { /* verify CSS header fields (most sizes are in DW, so add /4) */ - if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE) - || invalid_header(dd, "header_len", css->header_len, - (sizeof(struct firmware_file)/4)) - || invalid_header(dd, "header_version", - css->header_version, CSS_HEADER_VERSION) - || invalid_header(dd, "module_vendor", - css->module_vendor, CSS_MODULE_VENDOR) - || invalid_header(dd, "key_size", - css->key_size, KEY_SIZE/4) - || invalid_header(dd, "modulus_size", - css->modulus_size, KEY_SIZE/4) - || invalid_header(dd, "exponent_size", - css->exponent_size, EXPONENT_SIZE/4)) { + if (invalid_header(dd, "module_type", css->module_type, + CSS_MODULE_TYPE) || + invalid_header(dd, "header_len", css->header_len, + (sizeof(struct firmware_file) / 4)) || + invalid_header(dd, "header_version", css->header_version, + CSS_HEADER_VERSION) || + invalid_header(dd, "module_vendor", css->module_vendor, + CSS_MODULE_VENDOR) || + invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) || + invalid_header(dd, "modulus_size", css->modulus_size, + KEY_SIZE / 4) || + invalid_header(dd, "exponent_size", css->exponent_size, + EXPONENT_SIZE / 4)) { return -EINVAL; } return 0; @@ -410,8 +417,8 @@ static int payload_check(struct hfi1_devdata *dd, const char *name, /* make sure we have some payload */ if (prefix_size >= file_size) { dd_dev_err(dd, - "firmware \"%s\", size %ld, must be larger than %ld bytes\n", - name, file_size, prefix_size); + "firmware \"%s\", size %ld, must be larger than %ld bytes\n", + name, file_size, prefix_size); return -EINVAL; } @@ -433,8 +440,8 @@ static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev); if (ret) { - dd_dev_err(dd, "cannot find firmware \"%s\", err %d\n", - name, ret); + dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n", + name, ret); return ret; } @@ -480,14 +487,14 @@ static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, ret = verify_css_header(dd, css); if (ret) { dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name); - } else if ((css->size*4) == fdet->fw->size) { + } else if ((css->size * 4) == fdet->fw->size) { /* non-augmented firmware file */ struct firmware_file *ff = (struct firmware_file *) fdet->fw->data; /* make sure there are bytes in the payload */ ret = payload_check(dd, name, fdet->fw->size, - sizeof(struct firmware_file)); + sizeof(struct firmware_file)); if (ret == 0) { fdet->css_header = css; fdet->modulus = ff->modulus; @@ -505,14 +512,14 @@ static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n"); ret = -EINVAL; } - } else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) { + } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) { /* augmented firmware file */ struct augmented_firmware_file *aff = (struct augmented_firmware_file *)fdet->fw->data; /* make sure there are bytes in the payload */ ret = payload_check(dd, name, fdet->fw->size, - sizeof(struct augmented_firmware_file)); + sizeof(struct augmented_firmware_file)); if (ret == 0) { fdet->css_header = css; fdet->modulus = aff->modulus; @@ -527,9 +534,10 @@ static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, } else { /* css->size check failed */ dd_dev_err(dd, - "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", - fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4, - css->size); + "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", + fdet->fw->size / 4, + (fdet->fw->size - AUGMENT_SIZE) / 4, + css->size); ret = -EINVAL; } @@ -572,7 +580,7 @@ retry: * We tried the original and it failed. Move to the * alternate. */ - dd_dev_info(dd, "using alternate firmware names\n"); + dd_dev_warn(dd, "using alternate firmware names\n"); /* * Let others run. Some systems, when missing firmware, does * something that holds for 30 seconds. If we do that twice @@ -593,27 +601,27 @@ retry: fw_pcie_serdes_name = ALT_FW_PCIE_NAME; } - if (fw_8051_load) { - err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); + if (fw_sbus_load) { + err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); if (err) goto done; } - if (fw_fabric_serdes_load) { - err = obtain_one_firmware(dd, fw_fabric_serdes_name, - &fw_fabric); + if (fw_pcie_serdes_load) { + err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); if (err) goto done; } - if (fw_sbus_load) { - err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); + if (fw_fabric_serdes_load) { + err = obtain_one_firmware(dd, fw_fabric_serdes_name, + &fw_fabric); if (err) goto done; } - if (fw_pcie_serdes_load) { - err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); + if (fw_8051_load) { + err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); if (err) goto done; } @@ -621,16 +629,18 @@ retry: done: if (err) { /* oops, had problems obtaining a firmware */ - if (fw_state == FW_EMPTY) { - /* retry with alternate */ + if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) { + /* retry with alternate (RTL only) */ fw_state = FW_TRY; goto retry; } + dd_dev_err(dd, "unable to obtain working firmware\n"); fw_state = FW_ERR; fw_err = -ENOENT; } else { /* success */ - if (fw_state == FW_EMPTY) + if (fw_state == FW_EMPTY && + dd->icode != ICODE_FUNCTIONAL_SIMULATOR) fw_state = FW_TRY; /* may retry later */ else fw_state = FW_FINAL; /* cannot try again */ @@ -673,10 +683,15 @@ static int obtain_firmware(struct hfi1_devdata *dd) } /* not in FW_TRY state */ - if (fw_state == FW_FINAL) + if (fw_state == FW_FINAL) { + if (platform_config) { + dd->platform_config.data = platform_config->data; + dd->platform_config.size = platform_config->size; + } goto done; /* already acquired */ - else if (fw_state == FW_ERR) + } else if (fw_state == FW_ERR) { goto done; /* already tried and failed */ + } /* fw_state is FW_EMPTY */ /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */ @@ -685,9 +700,13 @@ static int obtain_firmware(struct hfi1_devdata *dd) if (platform_config_load) { platform_config = NULL; err = request_firmware(&platform_config, platform_config_name, - &dd->pcidev->dev); - if (err) + &dd->pcidev->dev); + if (err) { platform_config = NULL; + goto done; + } + dd->platform_config.data = platform_config->data; + dd->platform_config.size = platform_config->size; } done: @@ -761,7 +780,7 @@ static int retry_firmware(struct hfi1_devdata *dd, int load_result) static void write_rsa_data(struct hfi1_devdata *dd, int what, const u8 *data, int nbytes) { - int qw_size = nbytes/8; + int qw_size = nbytes / 8; int i; if (((unsigned long)data & 0x7) == 0) { @@ -769,14 +788,14 @@ static void write_rsa_data(struct hfi1_devdata *dd, int what, u64 *ptr = (u64 *)data; for (i = 0; i < qw_size; i++, ptr++) - write_csr(dd, what + (8*i), *ptr); + write_csr(dd, what + (8 * i), *ptr); } else { /* not aligned */ for (i = 0; i < qw_size; i++, data += 8) { u64 value; memcpy(&value, data, 8); - write_csr(dd, what + (8*i), value); + write_csr(dd, what + (8 * i), value); } } } @@ -789,7 +808,7 @@ static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what, const u8 *data, int nbytes) { u64 *ptr = (u64 *)data; - int qw_size = nbytes/8; + int qw_size = nbytes / 8; for (; qw_size > 0; qw_size--, ptr++) write_csr(dd, what, *ptr); @@ -822,7 +841,7 @@ static int run_rsa(struct hfi1_devdata *dd, const char *who, >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; if (status != RSA_STATUS_IDLE) { dd_dev_err(dd, "%s security engine not idle - giving up\n", - who); + who); return -EBUSY; } @@ -859,7 +878,7 @@ static int run_rsa(struct hfi1_devdata *dd, const char *who, if (status == RSA_STATUS_IDLE) { /* should not happen */ dd_dev_err(dd, "%s firmware security bad idle state\n", - who); + who); ret = -EINVAL; break; } else if (status == RSA_STATUS_DONE) { @@ -893,19 +912,20 @@ static int run_rsa(struct hfi1_devdata *dd, const char *who, * is not keeping the error high. */ write_csr(dd, MISC_ERR_CLEAR, - MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK - | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); + MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK | + MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); /* - * All that is left are the current errors. Print failure details, - * if any. + * All that is left are the current errors. Print warnings on + * authorization failure details, if any. Firmware authorization + * can be retried, so these are only warnings. */ reg = read_csr(dd, MISC_ERR_STATUS); if (ret) { if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK) - dd_dev_err(dd, "%s firmware authorization failed\n", - who); + dd_dev_warn(dd, "%s firmware authorization failed\n", + who); if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK) - dd_dev_err(dd, "%s firmware key mismatch\n", who); + dd_dev_warn(dd, "%s firmware key mismatch\n", who); } return ret; @@ -922,7 +942,8 @@ static void load_security_variables(struct hfi1_devdata *dd, write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE); /* Security variables d. Write the header */ write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, - (u8 *)fdet->css_header, sizeof(struct css_header)); + (u8 *)fdet->css_header, + sizeof(struct css_header)); } /* return the 8051 firmware state */ @@ -1002,7 +1023,7 @@ static int load_8051_firmware(struct hfi1_devdata *dd, /* Firmware load steps 3-5 */ ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr, - fdet->firmware_len); + fdet->firmware_len); if (ret) return ret; @@ -1029,13 +1050,13 @@ static int load_8051_firmware(struct hfi1_devdata *dd, ret = wait_fm_ready(dd, TIMEOUT_8051_START); if (ret) { /* timed out */ dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", - get_firmware_state(dd)); + get_firmware_state(dd)); return -ETIMEDOUT; } read_misc_status(dd, &ver_a, &ver_b); dd_dev_info(dd, "8051 firmware version %d.%d\n", - (int)ver_b, (int)ver_a); + (int)ver_b, (int)ver_a); dd->dc8051_ver = dc8051_ver(ver_b, ver_a); return 0; @@ -1050,11 +1071,11 @@ void sbus_request(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) { write_csr(dd, ASIC_CFG_SBUS_REQUEST, - ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) - | ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) - | ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) - | ((u64)receiver_addr - << ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); + ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) | + ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) | + ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) | + ((u64)receiver_addr << + ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); } /* @@ -1072,14 +1093,14 @@ static void turn_off_spicos(struct hfi1_devdata *dd, int flags) return; dd_dev_info(dd, "Turning off spicos:%s%s\n", - flags & SPICO_SBUS ? " SBus" : "", - flags & SPICO_FABRIC ? " fabric" : ""); + flags & SPICO_SBUS ? " SBus" : "", + flags & SPICO_FABRIC ? " fabric" : ""); write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK); /* disable SBus spico */ if (flags & SPICO_SBUS) sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01, - WRITE_SBUS_RECEIVER, 0x00000040); + WRITE_SBUS_RECEIVER, 0x00000040); /* disable the fabric serdes spicos */ if (flags & SPICO_FABRIC) @@ -1089,29 +1110,60 @@ static void turn_off_spicos(struct hfi1_devdata *dd, int flags) } /* - * Reset all of the fabric serdes for our HFI. + * Reset all of the fabric serdes for this HFI in preparation to take the + * link to Polling. + * + * To do a reset, we need to write to to the serdes registers. Unfortunately, + * the fabric serdes download to the other HFI on the ASIC will have turned + * off the firmware validation on this HFI. This means we can't write to the + * registers to reset the serdes. Work around this by performing a complete + * re-download and validation of the fabric serdes firmware. This, as a + * by-product, will reset the serdes. NOTE: the re-download requires that + * the 8051 be in the Offline state. I.e. not actively trying to use the + * serdes. This routine is called at the point where the link is Offline and + * is getting ready to go to Polling. */ void fabric_serdes_reset(struct hfi1_devdata *dd) { - u8 ra; + int ret; - if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */ + if (!fw_fabric_serdes_load) return; - ra = fabric_serdes_broadcast[dd->hfi1_id]; - - acquire_hw_mutex(dd); + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n"); + return; + } set_sbus_fast_mode(dd); - /* place SerDes in reset and disable SPICO */ - sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); - /* wait 100 refclk cycles @ 156.25MHz => 640ns */ - udelay(1); - /* remove SerDes reset */ - sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); - /* turn SPICO enable on */ - sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + + if (is_ax(dd)) { + /* A0 serdes do not work with a re-download */ + u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; + + /* place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + } else { + turn_off_spicos(dd, SPICO_FABRIC); + /* + * No need for firmware retry - what to download has already + * been decided. + * No need to pay attention to the load return - the only + * failure is a validation failure, which has already been + * checked by the initial download. + */ + (void)load_fabric_serdes_firmware(dd, &fw_fabric); + } + clear_sbus_fast_mode(dd); - release_hw_mutex(dd); + release_chip_resource(dd, CR_SBUS); } /* Access to the SBus in this routine should probably be serialized */ @@ -1120,6 +1172,9 @@ int sbus_request_slow(struct hfi1_devdata *dd, { u64 reg, count = 0; + /* make sure fast mode is clear */ + clear_sbus_fast_mode(dd); + sbus_request(dd, receiver_addr, data_addr, command, data_in); write_csr(dd, ASIC_CFG_SBUS_EXECUTE, ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK); @@ -1177,7 +1232,7 @@ static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, /* step 5: download SerDes machine code */ for (i = 0; i < fdet->firmware_len; i += 4) { sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, - *(u32 *)&fdet->firmware_ptr[i]); + *(u32 *)&fdet->firmware_ptr[i]); } /* step 6: IMEM override off */ sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000); @@ -1216,7 +1271,7 @@ static int load_sbus_firmware(struct hfi1_devdata *dd, /* step 5: download the SBus Master machine code */ for (i = 0; i < fdet->firmware_len; i += 4) { sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, - *(u32 *)&fdet->firmware_ptr[i]); + *(u32 *)&fdet->firmware_ptr[i]); } /* step 6: set IMEM_CNTL_EN off */ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040); @@ -1249,19 +1304,23 @@ static int load_pcie_serdes_firmware(struct hfi1_devdata *dd, /* step 3: enable XDMEM access */ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40); /* step 4: load firmware into SBus Master XDMEM */ - /* NOTE: the dmem address, write_en, and wdata are all pre-packed, - we only need to pick up the bytes and write them */ + /* + * NOTE: the dmem address, write_en, and wdata are all pre-packed, + * we only need to pick up the bytes and write them + */ for (i = 0; i < fdet->firmware_len; i += 4) { sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, - *(u32 *)&fdet->firmware_ptr[i]); + *(u32 *)&fdet->firmware_ptr[i]); } /* step 5: disable XDMEM access */ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); /* step 6: allow SBus Spico to run */ sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000); - /* steps 7-11: run RSA, if it succeeds, firmware is available to - be swapped */ + /* + * steps 7-11: run RSA, if it succeeds, firmware is available to + * be swapped + */ return run_rsa(dd, "PCIe serdes", fdet->signature); } @@ -1285,7 +1344,7 @@ static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2, * 23:16 BROADCAST_GROUP_2 (default 0xff) */ sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER, - (u32)bg1 << 4 | (u32)bg2 << 16); + (u32)bg1 << 4 | (u32)bg2 << 16); } } @@ -1310,8 +1369,8 @@ retry: /* timed out */ dd_dev_err(dd, - "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", - (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up"); + "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", + (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up"); if (try == 0) { /* break mutex and retry */ @@ -1328,10 +1387,197 @@ void release_hw_mutex(struct hfi1_devdata *dd) write_csr(dd, ASIC_CFG_MUTEX, 0); } +/* return the given resource bit(s) as a mask for the given HFI */ +static inline u64 resource_mask(u32 hfi1_id, u32 resource) +{ + return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0); +} + +static void fail_mutex_acquire_message(struct hfi1_devdata *dd, + const char *func) +{ + dd_dev_err(dd, + "%s: hardware mutex stuck - suggest rebooting the machine\n", + func); +} + +/* + * Acquire access to a chip resource. + * + * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed. + */ +static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, all_bits, my_bit; + int ret; + + if (resource & CR_DYN_MASK) { + /* a dynamic resource is in use if either HFI has set the bit */ + all_bits = resource_mask(0, resource) | + resource_mask(1, resource); + my_bit = resource_mask(dd->hfi1_id, resource); + } else { + /* non-dynamic resources are not split between HFIs */ + all_bits = resource; + my_bit = resource; + } + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + ret = acquire_hw_mutex(dd); + if (ret) { + fail_mutex_acquire_message(dd, __func__); + ret = -EIO; + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if (scratch0 & all_bits) { + ret = -EBUSY; + } else { + write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); + return ret; +} + +/* + * Acquire access to a chip resource, wait up to mswait milliseconds for + * the resource to become available. + * + * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex + * acquire failed. + */ +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait) +{ + unsigned long timeout; + int ret; + + timeout = jiffies + msecs_to_jiffies(mswait); + while (1) { + ret = __acquire_chip_resource(dd, resource); + if (ret != -EBUSY) + return ret; + /* resource is busy, check our timeout */ + if (time_after_eq(jiffies, timeout)) + return -EBUSY; + usleep_range(80, 120); /* arbitrary delay */ + } +} + +/* + * Release access to a chip resource + */ +void release_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, bit; + + /* only dynamic resources should ever be cleared */ + if (!(resource & CR_DYN_MASK)) { + dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__, + resource); + return; + } + bit = resource_mask(dd->hfi1_id, resource); + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, __func__); + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) != 0) { + scratch0 &= ~bit; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } else { + dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n", + __func__, dd->hfi1_id, resource); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +/* + * Return true if resource is set, false otherwise. Print a warning + * if not set and a function is supplied. + */ +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func) +{ + u64 scratch0, bit; + + if (resource & CR_DYN_MASK) + bit = resource_mask(dd->hfi1_id, resource); + else + bit = resource; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) == 0) { + if (func) + dd_dev_warn(dd, + "%s: id %d, resource 0x%x, not acquired!\n", + func, dd->hfi1_id, resource); + return false; + } + return true; +} + +static void clear_chip_resources(struct hfi1_devdata *dd, const char *func) +{ + u64 scratch0; + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, func); + goto done; + } + + /* clear all dynamic access bits for this HFI */ + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK); + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +void init_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void finish_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + void set_sbus_fast_mode(struct hfi1_devdata *dd) { write_csr(dd, ASIC_CFG_SBUS_EXECUTE, - ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); + ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); } void clear_sbus_fast_mode(struct hfi1_devdata *dd) @@ -1354,23 +1600,23 @@ int load_firmware(struct hfi1_devdata *dd) int ret; if (fw_fabric_serdes_load) { - ret = acquire_hw_mutex(dd); + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); if (ret) return ret; set_sbus_fast_mode(dd); set_serdes_broadcast(dd, all_fabric_serdes_broadcast, - fabric_serdes_broadcast[dd->hfi1_id], - fabric_serdes_addrs[dd->hfi1_id], - NUM_FABRIC_SERDES); + fabric_serdes_broadcast[dd->hfi1_id], + fabric_serdes_addrs[dd->hfi1_id], + NUM_FABRIC_SERDES); turn_off_spicos(dd, SPICO_FABRIC); do { ret = load_fabric_serdes_firmware(dd, &fw_fabric); } while (retry_firmware(dd, ret)); clear_sbus_fast_mode(dd); - release_hw_mutex(dd); + release_chip_resource(dd, CR_SBUS); if (ret) return ret; } @@ -1419,18 +1665,57 @@ int hfi1_firmware_init(struct hfi1_devdata *dd) return obtain_firmware(dd); } +/* + * This function is a helper function for parse_platform_config(...) and + * does not check for validity of the platform configuration cache + * (because we know it is invalid as we are building up the cache). + * As such, this should not be called from anywhere other than + * parse_platform_config + */ +static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table) +{ + u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (!system_table) + return -EINVAL; + + meta_ver_meta = + *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata + + SYSTEM_TABLE_META_VERSION); + + mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + ver_start = meta_ver_meta & mask; + + meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT; + + mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + ver_len = meta_ver_meta & mask; + + ver_start /= 8; + meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1); + + if (meta_ver < 5) { + dd_dev_info( + dd, "%s:Please update platform config\n", __func__); + return -EINVAL; + } + return 0; +} + int parse_platform_config(struct hfi1_devdata *dd) { struct platform_config_cache *pcfgcache = &dd->pcfg_cache; u32 *ptr = NULL; - u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0; + u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0; u32 record_idx = 0, table_type = 0, table_length_dwords = 0; + int ret = -EINVAL; /* assume failure */ - if (platform_config == NULL) { + if (!dd->platform_config.data) { dd_dev_info(dd, "%s: Missing config file\n", __func__); goto bail; } - ptr = (u32 *)platform_config->data; + ptr = (u32 *)dd->platform_config.data; magic_num = *ptr; ptr++; @@ -1439,12 +1724,32 @@ int parse_platform_config(struct hfi1_devdata *dd) goto bail; } - while (ptr < (u32 *)(platform_config->data + platform_config->size)) { + /* Field is file size in DWORDs */ + file_length = (*ptr) * 4; + ptr++; + + if (file_length > dd->platform_config.size) { + dd_dev_info(dd, "%s:File claims to be larger than read size\n", + __func__); + goto bail; + } else if (file_length < dd->platform_config.size) { + dd_dev_info(dd, + "%s:File claims to be smaller than read size, continuing\n", + __func__); + } + /* exactly equal, perfection */ + + /* + * In both cases where we proceed, using the self-reported file length + * is the safer option + */ + while (ptr < (u32 *)(dd->platform_config.data + file_length)) { header1 = *ptr; header2 = *(ptr + 1); if (header1 != ~header2) { dd_dev_info(dd, "%s: Failed validation at offset %ld\n", - __func__, (ptr - (u32 *)platform_config->data)); + __func__, (ptr - (u32 *) + dd->platform_config.data)); goto bail; } @@ -1467,6 +1772,9 @@ int parse_platform_config(struct hfi1_devdata *dd) case PLATFORM_CONFIG_SYSTEM_TABLE: pcfgcache->config_tables[table_type].num_table = 1; + ret = check_meta_version(dd, ptr); + if (ret) + goto bail; break; case PLATFORM_CONFIG_PORT_TABLE: pcfgcache->config_tables[table_type].num_table = @@ -1484,9 +1792,10 @@ int parse_platform_config(struct hfi1_devdata *dd) break; default: dd_dev_info(dd, - "%s: Unknown data table %d, offset %ld\n", - __func__, table_type, - (ptr - (u32 *)platform_config->data)); + "%s: Unknown data table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *) + dd->platform_config.data)); goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table = ptr; @@ -1507,9 +1816,10 @@ int parse_platform_config(struct hfi1_devdata *dd) break; default: dd_dev_info(dd, - "%s: Unknown metadata table %d, offset %ld\n", - __func__, table_type, - (ptr - (u32 *)platform_config->data)); + "%s: Unknown meta table %d, offset %ld\n", + __func__, table_type, + (ptr - + (u32 *)dd->platform_config.data)); goto bail; /* We don't trust this file now */ } pcfgcache->config_tables[table_type].table_metadata = @@ -1518,14 +1828,16 @@ int parse_platform_config(struct hfi1_devdata *dd) /* Calculate and check table crc */ crc = crc32_le(~(u32)0, (unsigned char const *)ptr, - (table_length_dwords * 4)); + (table_length_dwords * 4)); crc ^= ~(u32)0; /* Jump the table */ ptr += table_length_dwords; if (crc != *ptr) { dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n", - __func__, (ptr - (u32 *)platform_config->data)); + __func__, (ptr - + (u32 *) + dd->platform_config.data)); goto bail; } /* Jump the CRC DWORD */ @@ -1536,11 +1848,12 @@ int parse_platform_config(struct hfi1_devdata *dd) return 0; bail: memset(pcfgcache, 0, sizeof(struct platform_config_cache)); - return -EINVAL; + return ret; } static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, - int field, u32 *field_len_bits, u32 *field_start_bits) + int field, u32 *field_len_bits, + u32 *field_start_bits) { struct platform_config_cache *pcfgcache = &dd->pcfg_cache; u32 *src_ptr = NULL; @@ -1600,8 +1913,9 @@ static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, * @len: length of memory pointed by @data in bytes. */ int get_platform_config_field(struct hfi1_devdata *dd, - enum platform_config_table_type_encoding table_type, - int table_index, int field_index, u32 *data, u32 len) + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len) { int ret = 0, wlen = 0, seek = 0; u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; @@ -1613,7 +1927,8 @@ int get_platform_config_field(struct hfi1_devdata *dd, return -EINVAL; ret = get_platform_fw_field_metadata(dd, table_type, field_index, - &field_len_bits, &field_start_bits); + &field_len_bits, + &field_start_bits); if (ret) return -EINVAL; @@ -1629,19 +1944,21 @@ int get_platform_config_field(struct hfi1_devdata *dd, if (len < field_len_bits) return -EINVAL; - seek = field_start_bits/8; - wlen = field_len_bits/8; + seek = field_start_bits / 8; + wlen = field_len_bits / 8; src_ptr = (u32 *)((u8 *)src_ptr + seek); - /* We expect the field to be byte aligned and whole byte - * lengths if we are here */ + /* + * We expect the field to be byte aligned and whole byte + * lengths if we are here + */ memcpy(data, src_ptr, wlen); return 0; } break; case PLATFORM_CONFIG_PORT_TABLE: - /* Port table is 4 DWORDS in META_VERSION 0 */ + /* Port table is 4 DWORDS */ src_ptr = dd->hfi1_id ? pcfgcache->config_tables[table_type].table + 4 : pcfgcache->config_tables[table_type].table; @@ -1669,7 +1986,7 @@ int get_platform_config_field(struct hfi1_devdata *dd, if (!src_ptr || len < field_len_bits) return -EINVAL; - src_ptr += (field_start_bits/32); + src_ptr += (field_start_bits / 32); *data = (*src_ptr >> (field_start_bits % 32)) & ((1 << field_len_bits) - 1); @@ -1680,7 +1997,7 @@ int get_platform_config_field(struct hfi1_devdata *dd, * Download the firmware needed for the Gen3 PCIe SerDes. An update * to the SBus firmware is needed before updating the PCIe firmware. * - * Note: caller must be holding the HW mutex. + * Note: caller must be holding the SBus resource. */ int load_pcie_firmware(struct hfi1_devdata *dd) { @@ -1701,9 +2018,9 @@ int load_pcie_firmware(struct hfi1_devdata *dd) if (fw_pcie_serdes_load) { dd_dev_info(dd, "Setting PCIe SerDes broadcast\n"); set_serdes_broadcast(dd, all_pcie_serdes_broadcast, - pcie_serdes_broadcast[dd->hfi1_id], - pcie_serdes_addrs[dd->hfi1_id], - NUM_PCIE_SERDES); + pcie_serdes_broadcast[dd->hfi1_id], + pcie_serdes_addrs[dd->hfi1_id], + NUM_PCIE_SERDES); do { ret = load_pcie_serdes_firmware(dd, &fw_pcie); } while (retry_firmware(dd, ret)); @@ -1724,9 +2041,9 @@ void read_guid(struct hfi1_devdata *dd) { /* Take the DC out of reset to get a valid GUID value */ write_csr(dd, CCE_DC_CTRL, 0); - (void) read_csr(dd, CCE_DC_CTRL); + (void)read_csr(dd, CCE_DC_CTRL); dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID); dd_dev_info(dd, "GUID %llx", - (unsigned long long)dd->base_guid); + (unsigned long long)dd->base_guid); } diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 2611bb2e764d..16cbdc4073e0 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -1,14 +1,13 @@ #ifndef _HFI1_KERNEL_H #define _HFI1_KERNEL_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -65,6 +62,7 @@ #include <linux/cdev.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <rdma/rdma_vt.h> #include "chip_registers.h" #include "common.h" @@ -73,7 +71,8 @@ #include "chip.h" #include "mad.h" #include "qsfp.h" -#include "platform_config.h" +#include "platform.h" +#include "affinity.h" /* bumped 1 from s/w major version of TrueScale */ #define HFI1_CHIP_VERS_MAJ 3U @@ -98,6 +97,8 @@ extern unsigned long hfi1_cap_mask; #define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) #define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ HFI1_CAP_MISC_MASK) +/* Offline Disabled Reason is 4-bits */ +#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON) /* * Control context is always 0 and handles the error packets. @@ -177,6 +178,11 @@ struct ctxt_eager_bufs { } *rcvtids; }; +struct exp_tid_set { + struct list_head list; + u32 count; +}; + struct hfi1_ctxtdata { /* shadow the ctxt's RcvCtrl register */ u64 rcvctrl; @@ -233,20 +239,13 @@ struct hfi1_ctxtdata { u32 expected_count; /* index of first expected TID entry. */ u32 expected_base; - /* cursor into the exp group sets */ - atomic_t tidcursor; - /* number of exp TID groups assigned to the ctxt */ - u16 numtidgroups; - /* size of exp TID group fields in tidusemap */ - u16 tidmapcnt; - /* exp TID group usage bitfield array */ - unsigned long *tidusemap; - /* pinned pages for exp sends, allocated at open */ - struct page **tid_pg_list; - /* dma handles for exp tid pages */ - dma_addr_t *physshadow; + + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + /* lock protecting all Expected TID data */ - spinlock_t exp_lock; + struct mutex exp_lock; /* number of pio bufs for this ctxt (all procs, if shared) */ u32 piocnt; /* first pio buffer for this ctxt */ @@ -311,8 +310,24 @@ struct hfi1_ctxtdata { */ struct task_struct *progress; struct list_head sdma_queues; + /* protect sdma queues */ spinlock_t sdma_qlock; + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Timer for re-enabling ASPM if interrupt activity quietens down */ + struct timer_list aspm_timer; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + /* * The interrupt handler for a particular receive context can vary * throughout it's lifetime. This is not a lock protected data member so @@ -335,7 +350,7 @@ struct hfi1_packet { void *hdr; struct hfi1_ctxtdata *rcd; __le32 *rhf_addr; - struct hfi1_qp *qp; + struct rvt_qp *qp; struct hfi1_other_headers *ohdr; u64 rhf; u32 maxcnt; @@ -363,6 +378,7 @@ struct hfi1_snoop_data { int mode_flag; struct cdev cdev; struct device *class_dev; + /* protect snoop data */ spinlock_t snoop_lock; struct list_head queue; wait_queue_head_t waitq; @@ -375,7 +391,7 @@ struct hfi1_snoop_data { #define HFI1_PORT_SNOOP_MODE 1U #define HFI1_PORT_CAPTURE_MODE 2U -struct hfi1_sge_state; +struct rvt_sge_state; /* * Get/Set IB link-level config parameters for f_get/set_ib_cfg() @@ -424,17 +440,17 @@ struct hfi1_sge_state; #define __HLS_GOING_OFFLINE_BP 9 #define __HLS_LINK_COOLDOWN_BP 10 -#define HLS_UP_INIT (1 << __HLS_UP_INIT_BP) -#define HLS_UP_ARMED (1 << __HLS_UP_ARMED_BP) -#define HLS_UP_ACTIVE (1 << __HLS_UP_ACTIVE_BP) -#define HLS_DN_DOWNDEF (1 << __HLS_DN_DOWNDEF_BP) /* link down default */ -#define HLS_DN_POLL (1 << __HLS_DN_POLL_BP) -#define HLS_DN_DISABLE (1 << __HLS_DN_DISABLE_BP) -#define HLS_DN_OFFLINE (1 << __HLS_DN_OFFLINE_BP) -#define HLS_VERIFY_CAP (1 << __HLS_VERIFY_CAP_BP) -#define HLS_GOING_UP (1 << __HLS_GOING_UP_BP) -#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP) -#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP) +#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP) +#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP) +#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) @@ -490,6 +506,7 @@ struct hfi1_sge_state; #define CNTR_DISABLED 0x2 /* Disable this counter */ #define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ #define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_SDMA 0x10 #define CNTR_INVALID_VL -1 /* Specifies invalid VL */ #define CNTR_MODE_W 0x0 #define CNTR_MODE_R 0x1 @@ -512,10 +529,11 @@ static inline void incr_cntr32(u32 *cntr) #define MAX_NAME_SIZE 64 struct hfi1_msix_entry { + enum irq_type type; struct msix_entry msix; void *arg; char name[MAX_NAME_SIZE]; - cpumask_var_t mask; + cpumask_t mask; }; /* per-SL CCA information */ @@ -542,6 +560,7 @@ enum { }; struct vl_arb_cache { + /* protect vl arb cache */ spinlock_t lock; struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; }; @@ -561,7 +580,8 @@ struct hfi1_pportdata { struct kobject sl2sc_kobj; struct kobject vl2mtu_kobj; - /* QSFP support */ + /* PHY support */ + u32 port_type; struct qsfp_data qsfp_info; /* GUID for this interface, in host order */ @@ -586,6 +606,7 @@ struct hfi1_pportdata { struct work_struct link_vc_work; struct work_struct link_up_work; struct work_struct link_down_work; + struct work_struct dc_host_req_work; struct work_struct sma_message_work; struct work_struct freeze_work; struct work_struct link_downgrade_work; @@ -623,6 +644,7 @@ struct hfi1_pportdata { u16 link_speed_active; u8 vls_supported; u8 vls_operational; + u8 actual_vls_operational; /* LID mask control */ u8 lmc; /* Rx Polarity inversion (compensate for ~tx on partner) */ @@ -642,19 +664,23 @@ struct hfi1_pportdata { u8 link_enabled; /* link enabled? */ u8 linkinit_reason; u8 local_tx_rate; /* rate given to 8051 firmware */ + u8 last_pstate; /* info only */ /* placeholders for IB MAD packet settings */ u8 overrun_threshold; u8 phy_error_threshold; - /* used to override LED behavior */ - u8 led_override; /* Substituted for normal value, if non-zero */ - u16 led_override_timeoff; /* delta to next timer event */ - u8 led_override_vals[2]; /* Alternates per blink-frame */ - u8 led_override_phase; /* Just counts, LSB picks from vals[] */ + /* Used to override LED behavior for things like maintenance beaconing*/ + /* + * Alternates per phase of blink + * [0] holds LED off duration, [1] holds LED on duration + */ + unsigned long led_override_vals[2]; + u8 led_override_phase; /* LSB picks from vals[] */ atomic_t led_override_timer_active; /* Used to flash LEDs in override mode */ struct timer_list led_override_timer; + u32 sm_trap_qp; u32 sa_qp; @@ -689,10 +715,12 @@ struct hfi1_pportdata { /* CA's max number of 64 entry units in the congestion control table */ u8 cc_max_table_entries; - /* begin congestion log related entries - * cc_log_lock protects all congestion log related data */ + /* + * begin congestion log related entries + * cc_log_lock protects all congestion log related data + */ spinlock_t cc_log_lock ____cacheline_aligned_in_smp; - u8 threshold_cong_event_map[OPA_MAX_SLS/8]; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; u16 threshold_event_counter; struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; int cc_log_idx; /* index for logging events */ @@ -705,8 +733,9 @@ struct hfi1_pportdata { u64 *cntrs; /* port relative synthetic counter buffer */ u64 *scntrs; - /* we synthesize port_xmit_discards from several egress errors */ + /* port_xmit_discards are synthesized from different egress errors */ u64 port_xmit_discards; + u64 port_xmit_discards_vl[C_VL_COUNT]; u64 port_xmit_constraint_errors; u64 port_rcv_constraint_errors; /* count of 'link_err' interrupts from DC */ @@ -728,6 +757,9 @@ struct hfi1_pportdata { u8 remote_link_down_reason; /* Error events that will cause a port bounce. */ u32 port_error_action; + struct work_struct linkstate_active_work; + /* Does this port need to prescan for FECNs */ + bool cc_prescan; }; typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); @@ -773,6 +805,12 @@ struct hfi1_temp { u8 triggers; /* temperature triggers */ }; +/* common data between shared ASIC HFIs */ +struct hfi1_asic_data { + struct hfi1_devdata *dds[2]; /* back pointers */ + struct mutex asic_resource_mutex; +}; + /* device data struct now contains only "general per-device" info. * fields related to a physical IB port are in a hfi1_pportdata struct. */ @@ -782,6 +820,7 @@ struct sdma_vl_map; #define BOARD_VERS_MAX 96 /* how long the version string can be */ #define SERIAL_MAX 16 /* length of the serial number */ +typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64); struct hfi1_devdata { struct hfi1_ibdev verbs_dev; /* must be first */ struct list_head list; @@ -811,6 +850,12 @@ struct hfi1_devdata { spinlock_t sc_lock; /* Per VL data. Enough for all VLs but not all elements are set/used. */ struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* lock for pio_map */ + spinlock_t pio_map_lock; + /* array of kernel send contexts */ + struct send_context **kernel_send_context; + /* array of vl maps */ + struct pio_vl_map __rcu *pio_map; /* seqlock for sc2vl */ seqlock_t sc2vl_lock; u64 sc2vl[4]; @@ -841,6 +886,8 @@ struct hfi1_devdata { wait_queue_head_t sdma_unfreeze_wq; atomic_t sdma_unfreeze_count; + /* common data between shared ASIC HFIs in this OS */ + struct hfi1_asic_data *asic_data; /* hfi1_pportdata, points to array of (physical) port-specific * data structs, indexed by pidx (0..n-1) @@ -873,10 +920,11 @@ struct hfi1_devdata { /* reset value */ u64 z_int_counter; u64 z_rcv_limit; + u64 z_send_schedule; /* percpu int_counter */ u64 __percpu *int_counter; u64 __percpu *rcv_limit; - + u64 __percpu *send_schedule; /* number of receive contexts in use by the driver */ u32 num_rcv_contexts; /* number of pio send contexts in use by the driver */ @@ -885,6 +933,8 @@ struct hfi1_devdata { * number of ctxts available for PSM open */ u32 freectxts; + /* total number of available user/PSM contexts */ + u32 num_user_contexts; /* base receive interrupt timeout, in CSR units */ u32 rcv_intr_timeout_csr; @@ -996,9 +1046,8 @@ struct hfi1_devdata { u16 irev; /* implementation revision */ u16 dc8051_ver; /* 8051 firmware version */ + struct platform_config platform_config; struct platform_config_cache pcfg_cache; - /* control high-level access to qsfp */ - struct mutex qsfp_i2c_mutex; struct diag_client *diag_client; spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ @@ -1008,8 +1057,6 @@ struct hfi1_devdata { u16 psxmitwait_check_rate; /* high volume overflow errors deferred to tasklet */ struct tasklet_struct error_tasklet; - /* per device cq worker */ - struct kthread_worker *worker; /* MSI-X information */ struct hfi1_msix_entry *msix_entries; @@ -1090,10 +1137,8 @@ struct hfi1_devdata { * Handlers for outgoing data so that snoop/capture does not * have to have its hooks in the send path */ - int (*process_pio_send)(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc); - int (*process_dma_send)(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, - u64 pbc); + send_routine process_pio_send; + send_routine process_dma_send; void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, const void *from, size_t count); @@ -1105,7 +1150,6 @@ struct hfi1_devdata { struct timer_list rcverr_timer; u32 rcv_ovfl_cnt; - int assigned_node_id; wait_queue_head_t event_queue; /* Save the enabled LCB error bits */ @@ -1115,6 +1159,16 @@ struct hfi1_devdata { /* receive context tail dummy address */ __le64 *rcvhdrtail_dummy_kvaddr; dma_addr_t rcvhdrtail_dummy_physaddr; + + bool eprom_available; /* true if EPROM is available for this device */ + bool aspm_supported; /* Does HW support ASPM */ + bool aspm_enabled; /* ASPM state: enabled/disabled */ + /* Serialize ASPM enable/disable between multiple verbs contexts */ + spinlock_t aspm_lock; + /* Number of verbs contexts which have disabled ASPM */ + atomic_t aspm_disabled_cnt; + + struct hfi1_affinity *affinity; }; /* 8051 firmware version helper */ @@ -1125,6 +1179,9 @@ struct hfi1_devdata { #define PT_EAGER 1 #define PT_INVALID 2 +struct tid_rb_node; +struct mmu_rb_node; + /* Private data for file operations */ struct hfi1_filedata { struct hfi1_ctxtdata *uctxt; @@ -1133,6 +1190,16 @@ struct hfi1_filedata { struct hfi1_user_sdma_pkt_q *pq; /* for cpu affinity; -1 if none */ int rec_cpu_num; + u32 tid_n_pinned; + struct rb_root tid_rb_root; + struct tid_rb_node **entry_to_rb; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + u32 *invalid_tids; + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; }; extern struct list_head hfi1_dev_list; @@ -1156,7 +1223,7 @@ void handle_user_interrupt(struct hfi1_ctxtdata *rcd); int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *); int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *); int hfi1_create_ctxts(struct hfi1_devdata *dd); -struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32); +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int); void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *, struct hfi1_devdata *, u8, u8); void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *); @@ -1164,6 +1231,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *); int handle_receive_interrupt(struct hfi1_ctxtdata *, int); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int); +void set_all_slowpath(struct hfi1_devdata *dd); /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ @@ -1184,6 +1252,15 @@ static inline u32 driver_lstate(struct hfi1_pportdata *ppd) return ppd->lstate; /* use the cached value */ } +void receive_interrupt_work(struct work_struct *work); + +/* extract service channel from header and rhf */ +static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | + ((!!(rhf & RHF_DC_INFO_MASK)) << 4); +} + static inline u16 generate_jkey(kuid_t uid) { return from_kuid(current_user_ns(), uid) & 0xffff; @@ -1253,7 +1330,7 @@ static inline u32 egress_cycles(u32 len, u32 rate) void set_link_ipg(struct hfi1_pportdata *ppd); void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, u32 rqpn, u8 svc_type); -void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn, +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); @@ -1424,6 +1501,7 @@ static inline int valid_ib_mtu(unsigned int mtu) mtu == 1024 || mtu == 2048 || mtu == 4096; } + static inline int valid_opa_max_mtu(unsigned int mtu) { return mtu >= 2048 && @@ -1445,12 +1523,13 @@ void reset_link_credits(struct hfi1_devdata *dd); void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); int snoop_recv_handler(struct hfi1_packet *packet); -int snoop_send_dma_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int snoop_send_pio_handler(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, const void *from, size_t count); +int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) { @@ -1472,6 +1551,11 @@ static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) return container_of(ibp, struct hfi1_pportdata, ibport_data); } +static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi) +{ + return container_of(rdi, struct hfi1_ibdev, rdi); +} + static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); @@ -1515,12 +1599,10 @@ static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) #define HFI1_HAS_SDMA_TIMEOUT 0x8 #define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ #define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ -#define HFI1_DO_INIT_ASIC 0x100 /* This device will init the ASIC */ /* IB dword length mask in PBC (lower 11 bits); same for all chips */ #define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) - /* ctxt_flag bit offsets */ /* context has been setup */ #define HFI1_CTXT_SETUP_DONE 1 @@ -1538,14 +1620,10 @@ void hfi1_free_devdata(struct hfi1_devdata *); void cc_state_reclaim(struct rcu_head *rcu); struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); -/* - * Set LED override, only the two LSBs have "public" meaning, but - * any non-zero value substitutes them for the Link and LinkTrain - * LED states. - */ -#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ -#define HFI1_LED_LOG 2 /* Logical (link) YELLOW LED */ -void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val); +/* LED beaconing functions */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff); +void shutdown_led_override(struct hfi1_pportdata *ppd); #define HFI1_CREDIT_RETURN_RATE (100) @@ -1587,12 +1665,13 @@ void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val); */ #define DEFAULT_RCVHDR_ENTSIZE 32 +bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32); int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **); -void hfi1_release_user_pages(struct page **, size_t, bool); +void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool); static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) { - *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL; + *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL; } static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) @@ -1601,7 +1680,7 @@ static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) * volatile because it's a DMA target from the chip, routine is * inlined, and don't want register caching or reordering. */ - return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr); + return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr); } /* @@ -1633,12 +1712,13 @@ void restore_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); int parse_platform_config(struct hfi1_devdata *dd); int get_platform_config_field(struct hfi1_devdata *dd, - enum platform_config_table_type_encoding table_type, - int table_index, int field_index, u32 *data, u32 len); + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len); -dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long, - size_t, int); const char *get_unit_name(int unit); +const char *get_card_name(struct rvt_dev_info *rdi); +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); /* * Flush write combining store buffers (if present) and perform a write @@ -1659,7 +1739,7 @@ int process_receive_invalid(struct hfi1_packet *packet); extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8]; -void update_sge(struct hfi1_sge_state *ss, u32 length); +void update_sge(struct rvt_sge_state *ss, u32 length); /* global module parameter variables */ extern unsigned int hfi1_max_mtu; @@ -1667,7 +1747,7 @@ extern unsigned int hfi1_cu; extern unsigned int user_credit_return_threshold; extern int num_user_contexts; extern unsigned n_krcvqs; -extern u8 krcvqs[]; +extern uint krcvqs[]; extern int krcvqsset; extern uint kdeth_qp; extern uint loopback; @@ -1730,7 +1810,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; if (is_ax(dd)) - /* turn off send-side job key checks - A0 erratum */ + /* turn off send-side job key checks - A0 */ return base_sc_integrity & ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; return base_sc_integrity; @@ -1757,7 +1837,7 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; if (is_ax(dd)) - /* turn off send-side job key checks - A0 erratum */ + /* turn off send-side job key checks - A0 */ return base_sdma_integrity & ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; return base_sdma_integrity; @@ -1794,6 +1874,10 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_dbg(dd, fmt, ...) \ + dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + #define hfi1_dev_porterr(dd, port, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ get_unit_name((dd)->unit), (dd)->unit, (port), \ @@ -1825,13 +1909,14 @@ static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) dd->z_int_counter = get_all_cpu_total(dd->int_counter); dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + dd->z_send_schedule = get_all_cpu_total(dd->send_schedule); ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) { - ppd->ibport_data.z_rc_acks = - get_all_cpu_total(ppd->ibport_data.rc_acks); - ppd->ibport_data.z_rc_qacks = - get_all_cpu_total(ppd->ibport_data.rc_qacks); + ppd->ibport_data.rvp.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_acks); + ppd->ibport_data.rvp.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks); } } @@ -1844,6 +1929,18 @@ static inline void setextled(struct hfi1_devdata *dd, u32 on) write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); } +/* return the i2c resource given the target */ +static inline u32 i2c_target(u32 target) +{ + return target ? CR_I2C2 : CR_I2C1; +} + +/* return the i2c chain chip resource that this HFI uses for QSFP */ +static inline u32 qsfp_resource(struct hfi1_devdata *dd) +{ + return i2c_target(dd->hfi1_id); +} + int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 4dd8051aba7e..deabb0812023 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -56,6 +53,7 @@ #include <linux/module.h> #include <linux/printk.h> #include <linux/hrtimer.h> +#include <rdma/rdma_vt.h> #include "hfi.h" #include "device.h" @@ -65,6 +63,7 @@ #include "sdma.h" #include "debugfs.h" #include "verbs.h" +#include "aspm.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -75,6 +74,7 @@ #define HFI1_MIN_USER_CTXT_BUFCNT 7 #define HFI1_MIN_HDRQ_EGRBUF_CNT 2 +#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ @@ -87,9 +87,9 @@ module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO); MODULE_PARM_DESC( num_user_contexts, "Set max number of user contexts to use"); -u8 krcvqs[RXE_NUM_DATA_VL]; +uint krcvqs[RXE_NUM_DATA_VL]; int krcvqsset; -module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO); +module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); /* computed based on above array */ @@ -128,16 +128,12 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) { unsigned i; int ret; - int local_node_id = pcibus_to_node(dd->pcidev->bus); /* Control context has to be always 0 */ BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); - if (local_node_id < 0) - local_node_id = numa_node_id(); - dd->assigned_node_id = local_node_id; - - dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL); + dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd), + GFP_KERNEL, dd->node); if (!dd->rcd) goto nomem; @@ -147,10 +143,10 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd; ppd = dd->pport + (i % dd->num_pports); - rcd = hfi1_create_ctxtdata(ppd, i); + rcd = hfi1_create_ctxtdata(ppd, i, dd->node); if (!rcd) { dd_dev_err(dd, - "Unable to allocate kernel receive context, failing\n"); + "Unable to allocate kernel receive context, failing\n"); goto nomem; } /* @@ -171,7 +167,7 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); if (!rcd->sc) { dd_dev_err(dd, - "Unable to allocate kernel send context, failing\n"); + "Unable to allocate kernel send context, failing\n"); dd->rcd[rcd->ctxt] = NULL; hfi1_free_ctxtdata(dd, rcd); goto nomem; @@ -189,6 +185,12 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd) } } + /* + * Initialize aspm, to be done after gen3 transition and setting up + * contexts and before enabling interrupts + */ + aspm_init(dd); + return 0; nomem: ret = -ENOMEM; @@ -201,7 +203,8 @@ bail: /* * Common code for user and kernel context setup. */ -struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt) +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, + int numa) { struct hfi1_devdata *dd = ppd->dd; struct hfi1_ctxtdata *rcd; @@ -224,10 +227,10 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt) rcd->cnt = 1; rcd->ctxt = ctxt; dd->rcd[ctxt] = rcd; - rcd->numa_id = numa_node_id(); + rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; - spin_lock_init(&rcd->exp_lock); + mutex_init(&rcd->exp_lock); /* * Calculate the context's RcvArray entry starting point. @@ -260,7 +263,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt) /* Validate and initialize Rcv Hdr Q variables */ if (rcvhdrcnt % HDRQ_INCREMENT) { dd_dev_err(dd, - "ctxt%u: header queue count %d must be divisible by %d\n", + "ctxt%u: header queue count %d must be divisible by %lu\n", rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT); goto bail; } @@ -380,7 +383,7 @@ void set_link_ipg(struct hfi1_pportdata *ppd) cc_state = get_cc_state(ppd); - if (cc_state == NULL) + if (!cc_state) /* * This should _never_ happen - rcu_read_lock() is held, * and set_link_ipg() should not be called if cc_state @@ -432,7 +435,7 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) cc_state = get_cc_state(ppd); - if (cc_state == NULL) { + if (!cc_state) { rcu_read_unlock(); return HRTIMER_NORESTART; } @@ -494,14 +497,19 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); INIT_WORK(&ppd->link_down_work, handle_link_down); + INIT_WORK(&ppd->dc_host_req_work, handle_8051_request); INIT_WORK(&ppd->freeze_work, handle_freeze); INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); INIT_WORK(&ppd->sma_message_work, handle_sma_message); INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); + INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); + INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); + mutex_init(&ppd->hls_lock); spin_lock_init(&ppd->sdma_alllock); spin_lock_init(&ppd->qsfp_info.qsfp_lock); + ppd->qsfp_info.ppd = ppd; ppd->sm_trap_qp = 0x0; ppd->sa_qp = 0x1; @@ -583,8 +591,8 @@ static void enable_chip(struct hfi1_devdata *dd) * Enable kernel ctxts' receive and receive interrupt. * Other ctxts done as user opens and initializes them. */ - rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; for (i = 0; i < dd->first_user_ctxt; ++i) { + rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR)) @@ -730,7 +738,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) lastfail = hfi1_setup_eagerbufs(rcd); if (lastfail) dd_dev_err(dd, - "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); } if (lastfail) ret = lastfail; @@ -763,7 +771,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) /* enable chip even if we have an error, so we can debug cause */ enable_chip(dd); - ret = hfi1_cq_init(dd); done: /* * Set status even if port serdes is not initialized @@ -780,20 +787,15 @@ done: for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; - /* initialize the qsfp if it exists - * Requires interrupts to be enabled so we are notified - * when the QSFP completes reset, and has - * to be done before bringing up the SERDES + /* + * start the serdes - must be after interrupts are + * enabled so we are notified when the link goes up */ - init_qsfp(ppd); - - /* start the serdes - must be after interrupts are - enabled so we are notified when the link goes up */ lastfail = bringup_serdes(ppd); if (lastfail) dd_dev_info(dd, - "Failed to bring up port %u\n", - ppd->port); + "Failed to bring up port %u\n", + ppd->port); /* * Set status even if port serdes is not initialized @@ -905,6 +907,8 @@ static void shutdown_device(struct hfi1_devdata *dd) /* disable the send device */ pio_send_control(dd, PSC_GLOBAL_DISABLE); + shutdown_led_override(ppd); + /* * Clear SerdesEnable. * We can't count on interrupts since we are stopping. @@ -962,17 +966,33 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) kfree(rcd->egrbufs.buffers); sc_free(rcd->sc); - vfree(rcd->physshadow); - vfree(rcd->tid_pg_list); vfree(rcd->user_event_mask); vfree(rcd->subctxt_uregbase); vfree(rcd->subctxt_rcvegrbuf); vfree(rcd->subctxt_rcvhdr_base); - kfree(rcd->tidusemap); kfree(rcd->opstats); kfree(rcd); } +/* + * Release our hold on the shared asic data. If we are the last one, + * free the structure. Must be holding hfi1_devs_lock. + */ +static void release_asic_data(struct hfi1_devdata *dd) +{ + int other; + + if (!dd->asic_data) + return; + dd->asic_data->dds[dd->hfi1_id] = NULL; + other = dd->hfi1_id ? 0 : 1; + if (!dd->asic_data->dds[other]) { + /* we are the last holder, free it */ + kfree(dd->asic_data); + } + dd->asic_data = NULL; +} + void hfi1_free_devdata(struct hfi1_devdata *dd) { unsigned long flags; @@ -980,12 +1000,15 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) spin_lock_irqsave(&hfi1_devs_lock, flags); idr_remove(&hfi1_unit_table, dd->unit); list_del(&dd->list); + release_asic_data(dd); spin_unlock_irqrestore(&hfi1_devs_lock, flags); - hfi1_dbg_ibdev_exit(&dd->verbs_dev); + free_platform_config(dd); rcu_barrier(); /* wait for rcu callbacks to complete */ free_percpu(dd->int_counter); free_percpu(dd->rcv_limit); - ib_dealloc_device(&dd->verbs_dev.ibdev); + hfi1_dev_affinity_free(dd); + free_percpu(dd->send_schedule); + ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); } /* @@ -1000,19 +1023,19 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) { unsigned long flags; struct hfi1_devdata *dd; - int ret; + int ret, nports; - dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra); + /* extra is * number of ports */ + nports = extra / sizeof(struct hfi1_pportdata); + + dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); if (!dd) return ERR_PTR(-ENOMEM); - /* extra is * number of ports */ - dd->num_pports = extra / sizeof(struct hfi1_pportdata); + dd->num_pports = nports; dd->pport = (struct hfi1_pportdata *)(dd + 1); INIT_LIST_HEAD(&dd->list); - dd->node = dev_to_node(&pdev->dev); - if (dd->node < 0) - dd->node = 0; idr_preload(GFP_KERNEL); spin_lock_irqsave(&hfi1_devs_lock, flags); @@ -1042,9 +1065,9 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) spin_lock_init(&dd->sc_init_lock); spin_lock_init(&dd->dc8051_lock); spin_lock_init(&dd->dc8051_memlock); - mutex_init(&dd->qsfp_i2c_mutex); seqlock_init(&dd->sc2vl_lock); spin_lock_init(&dd->sde_map_lock); + spin_lock_init(&dd->pio_map_lock); init_waitqueue_head(&dd->event_queue); dd->int_counter = alloc_percpu(u64); @@ -1063,6 +1086,14 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) goto bail; } + dd->send_schedule = alloc_percpu(u64); + if (!dd->send_schedule) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + if (!hfi1_cpulist_count) { u32 count = num_online_cpus(); @@ -1075,13 +1106,12 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) &pdev->dev, "Could not alloc cpulist info, cpu affinity might be wrong\n"); } - hfi1_dbg_ibdev_init(&dd->verbs_dev); return dd; bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.ibdev); + ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); return ERR_PTR(ret); } @@ -1174,8 +1204,10 @@ static int __init hfi1_mod_init(void) user_credit_return_threshold = 100; compute_krcvqs(); - /* sanitize receive interrupt count, time must wait until after - the hardware type is known */ + /* + * sanitize receive interrupt count, time must wait until after + * the hardware type is known + */ if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; /* reject invalid combinations */ @@ -1210,6 +1242,9 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); + ret = hfi1_wss_init(); + if (ret < 0) + goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1218,6 +1253,8 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: + hfi1_wss_exit(); +bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1233,6 +1270,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + hfi1_wss_exit(); hfi1_dbg_exit(); hfi1_cpulist_count = 0; kfree(hfi1_cpulist); @@ -1304,16 +1342,18 @@ static void cleanup_device_data(struct hfi1_devdata *dd) } } kfree(tmp); + free_pio_map(dd); /* must follow rcv context free - need to remove rcv's hooks */ for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) sc_free(dd->send_contexts[ctxt].sc); dd->num_send_contexts = 0; kfree(dd->send_contexts); dd->send_contexts = NULL; + kfree(dd->hw_to_sw); + dd->hw_to_sw = NULL; kfree(dd->boardname); vfree(dd->events); vfree(dd->status); - hfi1_cq_exit(dd); } /* @@ -1347,6 +1387,13 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) ret = -EINVAL; goto bail; } + if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { + hfi1_early_err(&pdev->dev, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); + ret = -EINVAL; + goto bail; + } /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", @@ -1423,8 +1470,11 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * we still create devices, so diags, etc. can be used * to determine cause of problem. */ - if (!initfail && !ret) + if (!initfail && !ret) { dd->flags |= HFI1_INITTED; + /* create debufs files after init and ib register */ + hfi1_dbg_ibdev_init(&dd->verbs_dev); + } j = hfi1_device_create(dd); if (j) @@ -1465,6 +1515,8 @@ static void remove_one(struct pci_dev *pdev) { struct hfi1_devdata *dd = pci_get_drvdata(pdev); + /* close debugfs files before ib unregister */ + hfi1_dbg_ibdev_exit(&dd->verbs_dev); /* unregister from IB core */ hfi1_unregister_ib_device(dd); @@ -1517,18 +1569,11 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) if (!rcd->rcvhdrq) { dd_dev_err(dd, - "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", - amt, rcd->ctxt); + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); goto bail; } - /* Event mask is per device now and is in hfi1_devdata */ - /*if (rcd->ctxt >= dd->first_user_ctxt) { - rcd->user_event_mask = vmalloc_user(PAGE_SIZE); - if (!rcd->user_event_mask) - goto bail_free_hdrq; - }*/ - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, @@ -1569,8 +1614,8 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) bail_free: dd_dev_err(dd, - "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", - rcd->ctxt); + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); vfree(rcd->user_event_mask); rcd->user_event_mask = NULL; dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, @@ -1660,7 +1705,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) if (rcd->egrbufs.rcvtid_size == round_mtu || !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", - rcd->ctxt); + rcd->ctxt); goto bail_rcvegrbuf_phys; } @@ -1695,8 +1740,9 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) rcd->egrbufs.buffers[j].len)) { j++; offset = 0; - } else + } else { offset += new_size; + } } rcd->egrbufs.rcvtid_size = new_size; } @@ -1709,7 +1755,6 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size, rcd->egrbufs.size); - /* * Set the contexts rcv array head update threshold to the closest * power of 2 (so we can use a mask instead of modulo) below half @@ -1743,14 +1788,14 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, - rcd->egrbufs.rcvtids[idx].phys, order); + rcd->egrbufs.rcvtids[idx].phys, order); cond_resched(); } goto bail; bail_rcvegrbuf_phys: for (idx = 0; idx < rcd->egrbufs.alloced && - rcd->egrbufs.buffers[idx].addr; + rcd->egrbufs.buffers[idx].addr; idx++) { dma_free_coherent(&dd->pcidev->dev, rcd->egrbufs.buffers[idx].len, diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c index 426582b9ab65..65348d16ab2f 100644 --- a/drivers/staging/rdma/hfi1/intr.c +++ b/drivers/staging/rdma/hfi1/intr.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -98,7 +95,7 @@ static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) */ if (!(dd->flags & HFI1_INITTED)) return; - event.device = &dd->verbs_dev.ibdev; + event.device = &dd->verbs_dev.rdi.ibdev; event.element.port_num = ppd->port; event.event = ev; ib_dispatch_event(&event); @@ -131,28 +128,26 @@ void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) * NOTE: This uses this device's vAU, vCU, and vl15_init for * the remote values. Both sides must be using the values. */ - if (quick_linkup - || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { set_up_vl15(dd, dd->vau, dd->vl15_init); assign_remote_cm_au_table(dd, dd->vcu); ppd->neighbor_guid = - read_csr(dd, - DC_DC8051_STS_REMOTE_GUID); + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); ppd->neighbor_type = read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & - DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; - dd_dev_info(dd, - "Neighbor GUID: %llx Neighbor type %d\n", - ppd->neighbor_guid, - ppd->neighbor_type); + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n", + ppd->neighbor_guid, + ppd->neighbor_type); } /* physical link went up */ ppd->linkup = 1; - ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); /* link widths are not available until the link is fully up */ get_linkup_link_widths(ppd); @@ -165,7 +160,7 @@ void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) reset_link_credits(dd); /* freeze after a link down to guarantee a clean egress */ - start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN); + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN); ev = IB_EVENT_PORT_ERR; @@ -177,8 +172,6 @@ void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) /* notify IB of the link change */ signal_ib_event(ppd, ev); } - - } /* diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h index e8ba5606d08d..2ec6ef38d389 100644 --- a/drivers/staging/rdma/hfi1/iowait.h +++ b/drivers/staging/rdma/hfi1/iowait.h @@ -1,14 +1,13 @@ #ifndef _HFI1_IOWAIT_H #define _HFI1_IOWAIT_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -54,6 +51,8 @@ #include <linux/workqueue.h> #include <linux/sched.h> +#include "sdma_txreq.h" + /* * typedef (*restart_t)() - restart callback * @work: pointer to work structure @@ -67,9 +66,11 @@ struct sdma_engine; * @list: used to add/insert into QP/PQ wait lists * @tx_head: overflow list of sdma_txreq's * @sleep: no space callback - * @wakeup: space callback + * @wakeup: space callback wakeup + * @sdma_drained: sdma count drained * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 + * @wait_pio: wait for pio_busy == 0 * @sdma_busy: # of packets in flight * @count: total number of descriptors in tx_head'ed list * @tx_limit: limit for overflow queuing @@ -101,9 +102,12 @@ struct iowait { struct sdma_txreq *tx, unsigned seq); void (*wakeup)(struct iowait *wait, int reason); + void (*sdma_drained)(struct iowait *wait); struct work_struct iowork; wait_queue_head_t wait_dma; + wait_queue_head_t wait_pio; atomic_t sdma_busy; + atomic_t pio_busy; u32 count; u32 tx_limit; u32 tx_count; @@ -117,7 +121,7 @@ struct iowait { * @tx_limit: limit for overflow queuing * @func: restart function for workqueue * @sleep: sleep function for no space - * @wakeup: wakeup function for no space + * @resume: wakeup function for no space * * This function initializes the iowait * structure embedded in the QP or PQ. @@ -133,17 +137,21 @@ static inline void iowait_init( struct iowait *wait, struct sdma_txreq *tx, unsigned seq), - void (*wakeup)(struct iowait *wait, int reason)) + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) { wait->count = 0; INIT_LIST_HEAD(&wait->list); INIT_LIST_HEAD(&wait->tx_head); INIT_WORK(&wait->iowork, func); init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); wait->tx_limit = tx_limit; wait->sleep = sleep; wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; } /** @@ -174,6 +182,88 @@ static inline void iowait_sdma_drain(struct iowait *wait) } /** + * iowait_sdma_pending() - return sdma pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_sdma_pending(struct iowait *wait) +{ + return atomic_read(&wait->sdma_busy); +} + +/** + * iowait_sdma_inc - note sdma io pending + * @wait: iowait structure + */ +static inline void iowait_sdma_inc(struct iowait *wait) +{ + atomic_inc(&wait->sdma_busy); +} + +/** + * iowait_sdma_add - add count to pending + * @wait: iowait structure + */ +static inline void iowait_sdma_add(struct iowait *wait, int count) +{ + atomic_add(count, &wait->sdma_busy); +} + +/** + * iowait_sdma_dec - note sdma complete + * @wait: iowait structure + */ +static inline int iowait_sdma_dec(struct iowait *wait) +{ + return atomic_dec_and_test(&wait->sdma_busy); +} + +/** + * iowait_pio_drain() - wait for pios to drain + * + * @wait: iowait structure + * + * This will delay until the iowait pios have + * completed. + */ +static inline void iowait_pio_drain(struct iowait *wait) +{ + wait_event_timeout(wait->wait_pio, + !atomic_read(&wait->pio_busy), + HZ); +} + +/** + * iowait_pio_pending() - return pio pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_pio_pending(struct iowait *wait) +{ + return atomic_read(&wait->pio_busy); +} + +/** + * iowait_pio_inc - note pio pending + * @wait: iowait structure + */ +static inline void iowait_pio_inc(struct iowait *wait) +{ + atomic_inc(&wait->pio_busy); +} + +/** + * iowait_sdma_dec - note pio complete + * @wait: iowait structure + */ +static inline int iowait_pio_dec(struct iowait *wait) +{ + return atomic_dec_and_test(&wait->pio_busy); +} + +/** * iowait_drain_wakeup() - trigger iowait_drain() waiter * * @wait: iowait structure @@ -183,6 +273,28 @@ static inline void iowait_sdma_drain(struct iowait *wait) static inline void iowait_drain_wakeup(struct iowait *wait) { wake_up(&wait->wait_dma); + wake_up(&wait->wait_pio); + if (wait->sdma_drained) + wait->sdma_drained(wait); +} + +/** + * iowait_get_txhead() - get packet off of iowait list + * + * @wait wait struture + */ +static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&wait->tx_head)) { + tx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + } + return tx; } #endif diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c deleted file mode 100644 index cb4e6087dfdb..000000000000 --- a/drivers/staging/rdma/hfi1/keys.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include "hfi.h" - -/** - * hfi1_alloc_lkey - allocate an lkey - * @mr: memory region that this lkey protects - * @dma_region: 0->normal key, 1->restricted DMA key - * - * Returns 0 if successful, otherwise returns -errno. - * - * Increments mr reference count as required. - * - * Sets the lkey field mr for non-dma regions. - * - */ - -int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region) -{ - unsigned long flags; - u32 r; - u32 n; - int ret = 0; - struct hfi1_ibdev *dev = to_idev(mr->pd->device); - struct hfi1_lkey_table *rkt = &dev->lk_table; - - hfi1_get_mr(mr); - spin_lock_irqsave(&rkt->lock, flags); - - /* special case for dma_mr lkey == 0 */ - if (dma_region) { - struct hfi1_mregion *tmr; - - tmr = rcu_access_pointer(dev->dma_mr); - if (!tmr) { - rcu_assign_pointer(dev->dma_mr, mr); - mr->lkey_published = 1; - } else { - hfi1_put_mr(mr); - } - goto success; - } - - /* Find the next available LKEY */ - r = rkt->next; - n = r; - for (;;) { - if (!rcu_access_pointer(rkt->table[r])) - break; - r = (r + 1) & (rkt->max - 1); - if (r == n) - goto bail; - } - rkt->next = (r + 1) & (rkt->max - 1); - /* - * Make sure lkey is never zero which is reserved to indicate an - * unrestricted LKEY. - */ - rkt->gen++; - /* - * bits are capped in verbs.c to ensure enough bits for - * generation number - */ - mr->lkey = (r << (32 - hfi1_lkey_table_size)) | - ((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen) - << 8); - if (mr->lkey == 0) { - mr->lkey |= 1 << 8; - rkt->gen++; - } - rcu_assign_pointer(rkt->table[r], mr); - mr->lkey_published = 1; -success: - spin_unlock_irqrestore(&rkt->lock, flags); -out: - return ret; -bail: - hfi1_put_mr(mr); - spin_unlock_irqrestore(&rkt->lock, flags); - ret = -ENOMEM; - goto out; -} - -/** - * hfi1_free_lkey - free an lkey - * @mr: mr to free from tables - */ -void hfi1_free_lkey(struct hfi1_mregion *mr) -{ - unsigned long flags; - u32 lkey = mr->lkey; - u32 r; - struct hfi1_ibdev *dev = to_idev(mr->pd->device); - struct hfi1_lkey_table *rkt = &dev->lk_table; - int freed = 0; - - spin_lock_irqsave(&rkt->lock, flags); - if (!mr->lkey_published) - goto out; - if (lkey == 0) - RCU_INIT_POINTER(dev->dma_mr, NULL); - else { - r = lkey >> (32 - hfi1_lkey_table_size); - RCU_INIT_POINTER(rkt->table[r], NULL); - } - mr->lkey_published = 0; - freed++; -out: - spin_unlock_irqrestore(&rkt->lock, flags); - if (freed) { - synchronize_rcu(); - hfi1_put_mr(mr); - } -} - -/** - * hfi1_lkey_ok - check IB SGE for validity and initialize - * @rkt: table containing lkey to check SGE against - * @pd: protection domain - * @isge: outgoing internal SGE - * @sge: SGE to check - * @acc: access flags - * - * Return 1 if valid and successful, otherwise returns 0. - * - * increments the reference count upon success - * - * Check the IB SGE for validity and initialize our internal version - * of it. - */ -int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd, - struct hfi1_sge *isge, struct ib_sge *sge, int acc) -{ - struct hfi1_mregion *mr; - unsigned n, m; - size_t off; - - /* - * We use LKEY == zero for kernel virtual addresses - * (see hfi1_get_dma_mr and dma.c). - */ - rcu_read_lock(); - if (sge->lkey == 0) { - struct hfi1_ibdev *dev = to_idev(pd->ibpd.device); - - if (pd->user) - goto bail; - mr = rcu_dereference(dev->dma_mr); - if (!mr) - goto bail; - atomic_inc(&mr->refcount); - rcu_read_unlock(); - - isge->mr = mr; - isge->vaddr = (void *) sge->addr; - isge->length = sge->length; - isge->sge_length = sge->length; - isge->m = 0; - isge->n = 0; - goto ok; - } - mr = rcu_dereference( - rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]); - if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) - goto bail; - - off = sge->addr - mr->user_base; - if (unlikely(sge->addr < mr->user_base || - off + sge->length > mr->length || - (mr->access_flags & acc) != acc)) - goto bail; - atomic_inc(&mr->refcount); - rcu_read_unlock(); - - off += mr->offset; - if (mr->page_shift) { - /* - page sizes are uniform power of 2 so no loop is necessary - entries_spanned_by_off is the number of times the loop below - would have executed. - */ - size_t entries_spanned_by_off; - - entries_spanned_by_off = off >> mr->page_shift; - off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off / HFI1_SEGSZ; - n = entries_spanned_by_off % HFI1_SEGSZ; - } else { - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= HFI1_SEGSZ) { - m++; - n = 0; - } - } - } - isge->mr = mr; - isge->vaddr = mr->map[m]->segs[n].vaddr + off; - isge->length = mr->map[m]->segs[n].length - off; - isge->sge_length = sge->length; - isge->m = m; - isge->n = n; -ok: - return 1; -bail: - rcu_read_unlock(); - return 0; -} - -/** - * hfi1_rkey_ok - check the IB virtual address, length, and RKEY - * @qp: qp for validation - * @sge: SGE state - * @len: length of data - * @vaddr: virtual address to place data - * @rkey: rkey to check - * @acc: access flags - * - * Return 1 if successful, otherwise 0. - * - * increments the reference count upon success - */ -int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge, - u32 len, u64 vaddr, u32 rkey, int acc) -{ - struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; - struct hfi1_mregion *mr; - unsigned n, m; - size_t off; - - /* - * We use RKEY == zero for kernel virtual addresses - * (see hfi1_get_dma_mr and dma.c). - */ - rcu_read_lock(); - if (rkey == 0) { - struct hfi1_pd *pd = to_ipd(qp->ibqp.pd); - struct hfi1_ibdev *dev = to_idev(pd->ibpd.device); - - if (pd->user) - goto bail; - mr = rcu_dereference(dev->dma_mr); - if (!mr) - goto bail; - atomic_inc(&mr->refcount); - rcu_read_unlock(); - - sge->mr = mr; - sge->vaddr = (void *) vaddr; - sge->length = len; - sge->sge_length = len; - sge->m = 0; - sge->n = 0; - goto ok; - } - - mr = rcu_dereference( - rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) - goto bail; - - off = vaddr - mr->iova; - if (unlikely(vaddr < mr->iova || off + len > mr->length || - (mr->access_flags & acc) == 0)) - goto bail; - atomic_inc(&mr->refcount); - rcu_read_unlock(); - - off += mr->offset; - if (mr->page_shift) { - /* - page sizes are uniform power of 2 so no loop is necessary - entries_spanned_by_off is the number of times the loop below - would have executed. - */ - size_t entries_spanned_by_off; - - entries_spanned_by_off = off >> mr->page_shift; - off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off / HFI1_SEGSZ; - n = entries_spanned_by_off % HFI1_SEGSZ; - } else { - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= HFI1_SEGSZ) { - m++; - n = 0; - } - } - } - sge->mr = mr; - sge->vaddr = mr->map[m]->segs[n].vaddr + off; - sge->length = mr->map[m]->segs[n].length - off; - sge->sge_length = len; - sge->m = m; - sge->n = n; -ok: - return 1; -bail: - rcu_read_unlock(); - return 0; -} diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c index 4f5dbd14b5de..0ec748e7e7b6 100644 --- a/drivers/staging/rdma/hfi1/mad.c +++ b/drivers/staging/rdma/hfi1/mad.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -55,6 +52,7 @@ #include "hfi.h" #include "mad.h" #include "trace.h" +#include "qp.h" /* the reset value from the FM is supposed to be 0xffff, handle both */ #define OPA_LINK_WIDTH_RESET_OLD 0x0fff @@ -91,7 +89,7 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) int pkey_idx; u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; - agent = ibp->send_agent; + agent = ibp->rvp.send_agent; if (!agent) return; @@ -100,7 +98,8 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) return; /* o14-2 */ - if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout)) + if (ibp->rvp.trap_timeout && time_before(jiffies, + ibp->rvp.trap_timeout)) return; pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); @@ -121,42 +120,43 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; smp->class_version = OPA_SMI_CLASS_VERSION; smp->method = IB_MGMT_METHOD_TRAP; - ibp->tid++; - smp->tid = cpu_to_be64(ibp->tid); + ibp->rvp.tid++; + smp->tid = cpu_to_be64(ibp->rvp.tid); smp->attr_id = IB_SMP_ATTR_NOTICE; /* o14-1: smp->mkey = 0; */ memcpy(smp->route.lid.data, data, len); - spin_lock_irqsave(&ibp->lock, flags); - if (!ibp->sm_ah) { - if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { struct ib_ah *ah; - ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid); - if (IS_ERR(ah)) + ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); + if (IS_ERR(ah)) { ret = PTR_ERR(ah); - else { + } else { send_buf->ah = ah; - ibp->sm_ah = to_iah(ah); + ibp->rvp.sm_ah = ibah_to_rvtah(ah); ret = 0; } - } else + } else { ret = -EINVAL; + } } else { - send_buf->ah = &ibp->sm_ah->ibah; + send_buf->ah = &ibp->rvp.sm_ah->ibah; ret = 0; } - spin_unlock_irqrestore(&ibp->lock, flags); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); if (!ret) ret = ib_post_send_mad(send_buf, NULL); if (!ret) { /* 4.096 usec. */ - timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000; - ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout); + timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; + ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); } else { ib_free_send_mad(send_buf); - ibp->trap_timeout = 0; + ibp->rvp.trap_timeout = 0; } } @@ -174,10 +174,10 @@ void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, memset(&data, 0, sizeof(data)); if (trap_num == OPA_TRAP_BAD_P_KEY) - ibp->pkey_violations++; + ibp->rvp.pkey_violations++; else - ibp->qkey_violations++; - ibp->n_pkt_drops++; + ibp->rvp.qkey_violations++; + ibp->rvp.n_pkt_drops++; /* Send violation trap */ data.generic_type = IB_NOTICE_TYPE_SECURITY; @@ -233,9 +233,12 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, /* * Send a Port Capability Mask Changed trap (ch. 14.3.11). */ -void hfi1_cap_mask_chg(struct hfi1_ibport *ibp) +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) { struct opa_mad_notice_attr data; + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; u32 lid = ppd_from_ibp(ibp)->lid; memset(&data, 0, sizeof(data)); @@ -245,7 +248,7 @@ void hfi1_cap_mask_chg(struct hfi1_ibport *ibp) data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; data.issuer_lid = cpu_to_be32(lid); data.ntc_144.lid = data.issuer_lid; - data.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags); + data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); send_trap(ibp, &data, sizeof(data)); } @@ -407,37 +410,38 @@ static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, int ret = 0; /* Is the mkey in the process of expiring? */ - if (ibp->mkey_lease_timeout && - time_after_eq(jiffies, ibp->mkey_lease_timeout)) { + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { /* Clear timeout and mkey protection field. */ - ibp->mkey_lease_timeout = 0; - ibp->mkeyprot = 0; + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; } - if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 || - ibp->mkey == mkey) + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == mkey) valid_mkey = 1; /* Unset lease timeout on any valid Get/Set/TrapRepress */ - if (valid_mkey && ibp->mkey_lease_timeout && + if (valid_mkey && ibp->rvp.mkey_lease_timeout && (mad->method == IB_MGMT_METHOD_GET || mad->method == IB_MGMT_METHOD_SET || mad->method == IB_MGMT_METHOD_TRAP_REPRESS)) - ibp->mkey_lease_timeout = 0; + ibp->rvp.mkey_lease_timeout = 0; if (!valid_mkey) { switch (mad->method) { case IB_MGMT_METHOD_GET: /* Bad mkey not a violation below level 2 */ - if (ibp->mkeyprot < 2) + if (ibp->rvp.mkeyprot < 2) break; case IB_MGMT_METHOD_SET: case IB_MGMT_METHOD_TRAP_REPRESS: - if (ibp->mkey_violations != 0xFFFF) - ++ibp->mkey_violations; - if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period) - ibp->mkey_lease_timeout = jiffies + - ibp->mkey_lease_period * HZ; + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; /* Generate a trap notice. */ bad_mkey(ibp, mad, mkey, dr_slid, return_path, hop_cnt); @@ -501,16 +505,6 @@ void read_ltp_rtt(struct hfi1_devdata *dd) write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg); } -static u8 __opa_porttype(struct hfi1_pportdata *ppd) -{ - if (qsfp_mod_present(ppd)) { - if (ppd->qsfp_info.cache_valid) - return OPA_PORT_TYPE_STANDARD; - return OPA_PORT_TYPE_DISCONNECTED; - } - return OPA_PORT_TYPE_UNKNOWN; -} - static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, u32 *resp_len) @@ -522,6 +516,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct opa_port_info *pi = (struct opa_port_info *)data; u8 mtu; u8 credit_rate; + u8 is_beaconing_active; u32 state; u32 num_ports = OPA_AM_NPORT(am); u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); @@ -538,8 +533,8 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ppd = dd->pport + (port - 1); ibp = &ppd->ibport_data; - if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || - ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -548,14 +543,14 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* Only return the mkey if the protection field allows it. */ if (!(smp->method == IB_MGMT_METHOD_GET && - ibp->mkey != smp->mkey && - ibp->mkeyprot == 1)) - pi->mkey = ibp->mkey; - - pi->subnet_prefix = ibp->gid_prefix; - pi->sm_lid = cpu_to_be32(ibp->sm_lid); - pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags); - pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period); + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pi->mkey = ibp->rvp.mkey; + + pi->subnet_prefix = ibp->rvp.gid_prefix; + pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid); + pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp); pi->sa_qp = cpu_to_be32(ppd->sa_qp); @@ -581,38 +576,45 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, if (start_of_sm_config && (state == IB_PORT_INIT)) ppd->is_sm_config_started = 1; - pi->port_phys_conf = __opa_porttype(ppd) & 0xf; + pi->port_phys_conf = (ppd->port_type & 0xf); #if PI_LED_ENABLE_SUP pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; pi->port_states.ledenable_offlinereason |= ppd->is_sm_config_started << 5; + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; pi->port_states.ledenable_offlinereason |= - ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON; + ppd->offline_disabled_reason; #else pi->port_states.offline_reason = ppd->neighbor_normal << 4; pi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - pi->port_states.offline_reason |= ppd->offline_disabled_reason & - OPA_PI_MASK_OFFLINE_REASON; + pi->port_states.offline_reason |= ppd->offline_disabled_reason; #endif /* PI_LED_ENABLE_SUP */ pi->port_states.portphysstate_portstate = (hfi1_ibphys_portstate(ppd) << 4) | state; - pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc; + pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu)); for (i = 0; i < ppd->vls_supported; i++) { mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU); if ((i % 2) == 0) - pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4); + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4); else - pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu; + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu; } /* don't forget VL 15 */ mtu = mtu_to_enum(dd->vld[15].mtu, 2048); - pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu; - pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL; + pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu; + pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL; pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS); pi->partenforce_filterraw |= (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON); @@ -620,17 +622,17 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN; if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT) pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT; - pi->mkey_violations = cpu_to_be16(ibp->mkey_violations); + pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); /* P_KeyViolations are counted by hardware. */ - pi->pkey_violations = cpu_to_be16(ibp->pkey_violations); - pi->qkey_violations = cpu_to_be16(ibp->qkey_violations); + pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); pi->vl.cap = ppd->vls_supported; - pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit); + pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit); pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP); pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP); - pi->clientrereg_subnettimeout = ibp->subnet_timeout; + pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout; pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 | OPA_PORT_LINK_MODE_OPA << 5 | @@ -701,8 +703,10 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp); - /* this counter is 16 bits wide, but the replay_depth.wire - * variable is only 8 bits */ + /* + * this counter is 16 bits wide, but the replay_depth.wire + * variable is only 8 bits + */ if (tmp > 0xff) tmp = 0xff; pi->replay_depth.wire = tmp; @@ -749,7 +753,7 @@ static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1; + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); @@ -763,7 +767,7 @@ static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - p = (__be16 *) data; + p = (__be16 *)data; q = (u16 *)data; /* get the real pkeys if we are requesting the first block */ if (start_block == 0) { @@ -772,9 +776,9 @@ static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, p[i] = cpu_to_be16(q[i]); if (resp_len) *resp_len += size; - } else + } else { smp->status |= IB_SMP_INVALID_FIELD; - + } return reply((struct ib_mad_hdr *)smp); } @@ -901,8 +905,8 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, u32 logical_old = driver_logical_state(ppd); int ret, logical_allowed, physical_allowed; - logical_allowed = ret = - logical_transition_allowed(logical_old, logical_new); + ret = logical_transition_allowed(logical_old, logical_new); + logical_allowed = ret; if (ret == HFI_TRANSITION_DISALLOWED || ret == HFI_TRANSITION_UNDEFINED) { @@ -912,8 +916,8 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, return ret; } - physical_allowed = ret = - physical_transition_allowed(physical_old, physical_new); + ret = physical_transition_allowed(physical_old, physical_new); + physical_allowed = ret; if (ret == HFI_TRANSITION_DISALLOWED || ret == HFI_TRANSITION_UNDEFINED) { @@ -928,6 +932,14 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, return HFI_TRANSITION_IGNORED; /* + * A change request of Physical Port State from + * 'Offline' to 'Polling' should be ignored. + */ + if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) && + (physical_new == IB_PORTPHYSSTATE_POLLING)) + return HFI_TRANSITION_IGNORED; + + /* * Either physical_allowed or logical_allowed is * HFI_TRANSITION_ALLOWED. */ @@ -972,16 +984,15 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, break; /* FALLTHROUGH */ case IB_PORT_DOWN: - if (phys_state == IB_PORTPHYSSTATE_NOP) + if (phys_state == IB_PORTPHYSSTATE_NOP) { link_state = HLS_DN_DOWNDEF; - else if (phys_state == IB_PORTPHYSSTATE_POLLING) { + } else if (phys_state == IB_PORTPHYSSTATE_POLLING) { link_state = HLS_DN_POLL; - set_link_down_reason(ppd, - OPA_LINKDOWN_REASON_FM_BOUNCE, 0, - OPA_LINKDOWN_REASON_FM_BOUNCE); - } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE, + 0, OPA_LINKDOWN_REASON_FM_BOUNCE); + } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) { link_state = HLS_DN_DISABLE; - else { + } else { pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n", phys_state); smp->status |= IB_SMP_INVALID_FIELD; @@ -991,11 +1002,11 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, set_link_state(ppd, link_state); if (link_state == HLS_DN_DISABLE && (ppd->offline_disabled_reason > - OPA_LINKDOWN_REASON_SMA_DISABLED || + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || ppd->offline_disabled_reason == - OPA_LINKDOWN_REASON_NONE)) + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) ppd->offline_disabled_reason = - OPA_LINKDOWN_REASON_SMA_DISABLED; + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); /* * Don't send a reply if the response would be sent * through the disabled port. @@ -1091,13 +1102,13 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ls_old = driver_lstate(ppd); - ibp->mkey = pi->mkey; - ibp->gid_prefix = pi->subnet_prefix; - ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); + ibp->rvp.mkey = pi->mkey; + ibp->rvp.gid_prefix = pi->subnet_prefix; + ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); /* Must be a valid unicast LID address. */ if ((lid == 0 && ls_old > IB_PORT_INIT) || - lid >= HFI1_MULTICAST_LID_BASE) { + lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", lid); @@ -1130,23 +1141,23 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* Must be a valid unicast LID address. */ if ((smlid == 0 && ls_old > IB_PORT_INIT) || - smlid >= HFI1_MULTICAST_LID_BASE) { + smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); - } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { + } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid); - spin_lock_irqsave(&ibp->lock, flags); - if (ibp->sm_ah) { - if (smlid != ibp->sm_lid) - ibp->sm_ah->attr.dlid = smlid; - if (msl != ibp->sm_sl) - ibp->sm_ah->attr.sl = msl; + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_ah->attr.dlid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_ah->attr.sl = msl; } - spin_unlock_irqrestore(&ibp->lock, flags); - if (smlid != ibp->sm_lid) - ibp->sm_lid = smlid; - if (msl != ibp->sm_sl) - ibp->sm_sl = msl; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; event.event = IB_EVENT_SM_CHANGE; ib_dispatch_event(&event); } @@ -1167,8 +1178,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ppd->port_error_action = be32_to_cpu(pi->port_error_action); lwe = be16_to_cpu(pi->link_width.enabled); if (lwe) { - if (lwe == OPA_LINK_WIDTH_RESET - || lwe == OPA_LINK_WIDTH_RESET_OLD) + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) set_link_width_enabled(ppd, ppd->link_width_supported); else if ((lwe & ~ppd->link_width_supported) == 0) set_link_width_enabled(ppd, lwe); @@ -1177,19 +1188,21 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, } lwe = be16_to_cpu(pi->link_width_downgrade.enabled); /* LWD.E is always applied - 0 means "disabled" */ - if (lwe == OPA_LINK_WIDTH_RESET - || lwe == OPA_LINK_WIDTH_RESET_OLD) { + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) { set_link_width_downgrade_enabled(ppd, - ppd->link_width_downgrade_supported); + ppd-> + link_width_downgrade_supported + ); } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) { /* only set and apply if something changed */ if (lwe != ppd->link_width_downgrade_enabled) { set_link_width_downgrade_enabled(ppd, lwe); call_link_downgrade_policy = 1; } - } else + } else { smp->status |= IB_SMP_INVALID_FIELD; - + } lse = be16_to_cpu(pi->link_speed.enabled); if (lse) { if (lse & be16_to_cpu(pi->link_speed.supported)) @@ -1198,22 +1211,24 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, smp->status |= IB_SMP_INVALID_FIELD; } - ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; - ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; + ibp->rvp.mkeyprot = + (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; + ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT, - ibp->vl_high_limit); + ibp->rvp.vl_high_limit); - if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || - ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } for (i = 0; i < ppd->vls_supported; i++) { if ((i % 2) == 0) - mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4) - & 0xF); + mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >> + 4) & 0xF); else - mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF); + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] & + 0xF); if (mtu == 0xffff) { pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n", mtu, @@ -1223,8 +1238,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, } if (dd->vld[i].mtu != mtu) { dd_dev_info(dd, - "MTU change on vl %d from %d to %d\n", - i, dd->vld[i].mtu, mtu); + "MTU change on vl %d from %d to %d\n", + i, dd->vld[i].mtu, mtu); dd->vld[i].mtu = mtu; call_set_mtu++; } @@ -1232,13 +1247,13 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* As per OPAV1 spec: VL15 must support and be configured * for operation with a 2048 or larger MTU. */ - mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF); + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF); if (mtu < 2048 || mtu == 0xffff) mtu = 2048; if (dd->vld[15].mtu != mtu) { dd_dev_info(dd, - "MTU change on vl 15 from %d to %d\n", - dd->vld[15].mtu, mtu); + "MTU change on vl 15 from %d to %d\n", + dd->vld[15].mtu, mtu); dd->vld[15].mtu = mtu; call_set_mtu++; } @@ -1254,21 +1269,21 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, smp->status |= IB_SMP_INVALID_FIELD; } else { if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS, - vls) == -EINVAL) + vls) == -EINVAL) smp->status |= IB_SMP_INVALID_FIELD; } } if (pi->mkey_violations == 0) - ibp->mkey_violations = 0; + ibp->rvp.mkey_violations = 0; if (pi->pkey_violations == 0) - ibp->pkey_violations = 0; + ibp->rvp.pkey_violations = 0; if (pi->qkey_violations == 0) - ibp->qkey_violations = 0; + ibp->rvp.qkey_violations = 0; - ibp->subnet_timeout = + ibp->rvp.subnet_timeout = pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT; crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode); @@ -1388,7 +1403,7 @@ static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); event.event = IB_EVENT_PKEY_CHANGE; - event.device = &dd->verbs_dev.ibdev; + event.device = &dd->verbs_dev.rdi.ibdev; event.element.port_num = port; ib_dispatch_event(&event); } @@ -1402,7 +1417,7 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u32 n_blocks_sent = OPA_AM_NBLK(am); u32 start_block = am & 0x7ff; - u16 *p = (u16 *) data; + u16 *p = (u16 *)data; __be16 *q = (__be16 *)data; int i; u16 n_blocks_avail; @@ -1415,7 +1430,7 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1; + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; if (start_block + n_blocks_sent > n_blocks_avail || n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { @@ -1514,14 +1529,22 @@ static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_ibport *ibp = to_iport(ibdev, port); u8 *p = data; int i; + u8 sc; if (am) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } - for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) - ibp->sl_to_sc[i] = *p++; + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) { + sc = *p++; + if (ibp->sl_to_sc[i] != sc) { + ibp->sl_to_sc[i] = sc; + + /* Put all stale qps into error state */ + hfi1_error_port_qps(ibp, i); + } + } return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len); } @@ -1574,7 +1597,7 @@ static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, { u32 n_blocks = OPA_AM_NBLK(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - void *vp = (void *) data; + void *vp = (void *)data; size_t size = 4 * sizeof(u64); if (n_blocks != 1) { @@ -1597,7 +1620,7 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, u32 n_blocks = OPA_AM_NBLK(am); int async_update = OPA_AM_ASYNC(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - void *vp = (void *) data; + void *vp = (void *)data; struct hfi1_pportdata *ppd; int lstate; @@ -1609,8 +1632,10 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, /* IB numbers ports from 1, hw from 0 */ ppd = dd->pport + (port - 1); lstate = driver_lstate(ppd); - /* it's known that async_update is 0 by this point, but include - * the explicit check for clarity */ + /* + * it's known that async_update is 0 by this point, but include + * the explicit check for clarity + */ if (!async_update && (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) { smp->status |= IB_SMP_INVALID_FIELD; @@ -1629,7 +1654,7 @@ static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, u32 n_blocks = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; - void *vp = (void *) data; + void *vp = (void *)data; int size; if (n_blocks != 1) { @@ -1654,7 +1679,7 @@ static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, u32 n_blocks = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; - void *vp = (void *) data; + void *vp = (void *)data; int lstate; if (n_blocks != 1) { @@ -1687,7 +1712,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, u32 lstate; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; - struct opa_port_state_info *psi = (struct opa_port_state_info *) data; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; if (nports != 1) { smp->status |= IB_SMP_INVALID_FIELD; @@ -1707,12 +1732,11 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, psi->port_states.ledenable_offlinereason |= ppd->is_sm_config_started << 5; psi->port_states.ledenable_offlinereason |= - ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON; + ppd->offline_disabled_reason; #else psi->port_states.offline_reason = ppd->neighbor_normal << 4; psi->port_states.offline_reason |= ppd->is_sm_config_started << 5; - psi->port_states.offline_reason |= ppd->offline_disabled_reason & - OPA_PI_MASK_OFFLINE_REASON; + psi->port_states.offline_reason |= ppd->offline_disabled_reason; #endif /* PI_LED_ENABLE_SUP */ psi->port_states.portphysstate_portstate = @@ -1737,7 +1761,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, u8 ls_new, ps_new; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; - struct opa_port_state_info *psi = (struct opa_port_state_info *) data; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; int ret, invalid = 0; if (nports != 1) { @@ -1782,14 +1806,16 @@ static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, u32 len = OPA_AM_CI_LEN(am) + 1; int ret; -#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */ +#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */ #define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1) #define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK) - /* check that addr is within spec, and - * addr and (addr + len - 1) are on the same "page" */ + /* + * check that addr is within spec, and + * addr and (addr + len - 1) are on the same "page" + */ if (addr >= 4096 || - (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { + (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } @@ -1823,7 +1849,7 @@ static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, u32 num_ports = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; - struct buffer_control *p = (struct buffer_control *) data; + struct buffer_control *p = (struct buffer_control *)data; int size; if (num_ports != 1) { @@ -1846,7 +1872,7 @@ static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, u32 num_ports = OPA_AM_NPORT(am); struct hfi1_devdata *dd = dd_from_ibdev(ibdev); struct hfi1_pportdata *ppd; - struct buffer_control *p = (struct buffer_control *) data; + struct buffer_control *p = (struct buffer_control *)data; if (num_ports != 1) { smp->status |= IB_SMP_INVALID_FIELD; @@ -1919,13 +1945,15 @@ static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, switch (section) { case OPA_VLARB_LOW_ELEMENTS: - (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); + (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); break; case OPA_VLARB_HIGH_ELEMENTS: - (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p); + (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p); break; - /* neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX - * can be changed from the default values */ + /* + * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX + * can be changed from the default values + */ case OPA_VLARB_PREEMPT_ELEMENTS: /* FALLTHROUGH */ case OPA_VLARB_PREEMPT_MATRIX: @@ -2137,8 +2165,10 @@ struct opa_port_data_counters_msg { }; struct opa_port_error_counters64_msg { - /* Request contains first two fields, response contains the - * whole magilla */ + /* + * Request contains first two fields, response contains the + * whole magilla + */ __be64 port_select_mask[4]; __be32 vl_select_mask; @@ -2172,7 +2202,6 @@ struct opa_port_error_info_msg { __be32 error_info_select_mask; __be32 reserved1; struct _port_ei { - u8 port_number; u8 reserved2[7]; @@ -2251,7 +2280,7 @@ enum error_info_selects { }; static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u32 *resp_len) + struct ib_device *ibdev, u32 *resp_len) { struct opa_class_port_info *p = (struct opa_class_port_info *)pmp->data; @@ -2279,23 +2308,29 @@ static void a0_portstatus(struct hfi1_pportdata *ppd, { if (!is_bx(ppd->dd)) { unsigned long vl; - u64 max_vl_xmit_wait = 0, tmp; + u64 sum_vl_xmit_wait = 0; u32 vl_all_mask = VL_MASK_ALL; for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), 8 * sizeof(vl_all_mask)) { - tmp = read_port_cntr(ppd, C_TX_WAIT_VL, - idx_from_vl(vl)); - if (tmp > max_vl_xmit_wait) - max_vl_xmit_wait = tmp; + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; } - rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait); + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); } } - static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { struct opa_port_status_req *req = (struct opa_port_status_req *)pmp->data; @@ -2320,8 +2355,8 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - if (nports != 1 || (port_num && port_num != port) - || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { + if (nports != 1 || (port_num && port_num != port) || + num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } @@ -2351,7 +2386,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, CNTR_INVALID_VL)); rsp->port_multicast_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, - CNTR_INVALID_VL)); + CNTR_INVALID_VL)); rsp->port_multicast_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL)); @@ -2380,7 +2415,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, } tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, - CNTR_INVALID_VL); + CNTR_INVALID_VL); if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { /* overflow/wrapped */ rsp->link_error_recovery = cpu_to_be32(~0); @@ -2395,13 +2430,13 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL)); rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, - CNTR_INVALID_VL)); + CNTR_INVALID_VL)); /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; - vlinfo = &(rsp->vls[0]); + vlinfo = &rsp->vls[0]; vfi = 0; /* The vl_select_mask has been checked above, and we know * that it contains only entries which represent valid VLs. @@ -2417,27 +2452,27 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, rsp->vls[vfi].port_vl_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_xmit_data = cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_xmit_pkts = cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_xmit_wait = cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_becn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); vlinfo++; vfi++; @@ -2467,7 +2502,7 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL); error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, - CNTR_INVALID_VL); + CNTR_INVALID_VL); /* local link integrity must be right-shifted by the lli resolution */ tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); @@ -2477,10 +2512,10 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); error_counter_summary += (tmp >> res_ler); error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR, - CNTR_INVALID_VL); + CNTR_INVALID_VL); error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR, - CNTR_INVALID_VL); + CNTR_INVALID_VL); /* ppd->link_downed is a 32-bit value */ error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL); @@ -2491,21 +2526,22 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, return error_counter_summary; } -static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp, +static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp, u32 vl_select_mask) { - if (!is_bx(dd)) { + if (!is_bx(ppd->dd)) { unsigned long vl; - int vfi = 0; u64 sum_vl_xmit_wait = 0; + u32 vl_all_mask = VL_MASK_ALL; - for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), - 8 * sizeof(vl_select_mask)) { + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { u64 tmp = sum_vl_xmit_wait + - be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait); + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); if (tmp < sum_vl_xmit_wait) { /* we wrapped */ - sum_vl_xmit_wait = (u64) ~0; + sum_vl_xmit_wait = (u64)~0; break; } sum_vl_xmit_wait = tmp; @@ -2515,8 +2551,30 @@ static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp, } } +static void pma_get_opa_port_dctrs(struct ib_device *ibdev, + struct _port_dctrs *rsp) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); +} + static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { struct opa_port_data_counters_msg *req = (struct opa_port_data_counters_msg *)pmp->data; @@ -2572,7 +2630,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } - rsp = (struct _port_dctrs *)&(req->port[0]); + rsp = (struct _port_dctrs *)&req->port[0]; memset(rsp, 0, sizeof(*rsp)); rsp->port_number = port; @@ -2583,39 +2641,19 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, */ hfi1_read_link_quality(dd, &lq); rsp->link_quality_indicator = cpu_to_be32((u32)lq); + pma_get_opa_port_dctrs(ibdev, rsp); - /* rsp->sw_port_congestion is 0 for HFIs */ - /* rsp->port_xmit_time_cong is 0 for HFIs */ - /* rsp->port_xmit_wasted_bw ??? */ - /* rsp->port_xmit_wait_data ??? */ - /* rsp->port_mark_fecn is 0 for HFIs */ - - rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, - CNTR_INVALID_VL)); - rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, - CNTR_INVALID_VL)); - rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, - CNTR_INVALID_VL)); - rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, - CNTR_INVALID_VL)); - rsp->port_multicast_xmit_pkts = - cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, - CNTR_INVALID_VL)); - rsp->port_multicast_rcv_pkts = - cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, - CNTR_INVALID_VL)); rsp->port_xmit_wait = cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); rsp->port_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); rsp->port_rcv_becn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); - rsp->port_error_counter_summary = cpu_to_be64(get_error_counter_summary(ibdev, port, res_lli, res_ler)); - vlinfo = &(rsp->vls[0]); + vlinfo = &rsp->vls[0]; vfi = 0; /* The vl_select_mask has been checked above, and we know * that it contains only entries which represent valid VLs. @@ -2623,49 +2661,50 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, * any additional checks for vl. */ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), - 8 * sizeof(req->vl_select_mask)) { + 8 * sizeof(req->vl_select_mask)) { memset(vlinfo, 0, sizeof(*vlinfo)); rsp->vls[vfi].port_vl_xmit_data = cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_xmit_pkts = cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_xmit_wait = cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_becn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, - idx_from_vl(vl))); + idx_from_vl(vl))); /* rsp->port_vl_xmit_time_cong is 0 for HFIs */ /* rsp->port_vl_xmit_wasted_bw ??? */ /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? - * does this differ from rsp->vls[vfi].port_vl_xmit_wait */ + * does this differ from rsp->vls[vfi].port_vl_xmit_wait + */ /*rsp->vls[vfi].port_vl_mark_fecn = - cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT - + offset)); - */ + * cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + * + offset)); + */ vlinfo++; vfi++; } - a0_datacounters(dd, rsp, vl_select_mask); + a0_datacounters(ppd, rsp, vl_select_mask); if (resp_len) *resp_len += response_data_size; @@ -2673,12 +2712,88 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } +static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *) + pmp->data; + struct _port_dctrs rsp; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_dctrs(ibdev, &rsp); + + p->port_xmit_data = rsp.port_xmit_data; + p->port_rcv_data = rsp.port_rcv_data; + p->port_xmit_packets = rsp.port_xmit_pkts; + p->port_rcv_packets = rsp.port_rcv_pkts; + p->port_unicast_xmit_packets = 0; + p->port_unicast_rcv_packets = 0; + p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts; + p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts; + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static void pma_get_opa_port_ectrs(struct ib_device *ibdev, + struct _port_ectrs *rsp, u8 port) +{ + u64 tmp, tmp2; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_switch_relay_errors = 0; + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + if (tmp2 < tmp) { + /* overflow/wrapped */ + rsp->local_link_integrity_errors = cpu_to_be64(~0); + } else { + rsp->local_link_integrity_errors = cpu_to_be64(tmp2); + } + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); +} + static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { size_t response_data_size; struct _port_ectrs *rsp; - unsigned long port_num; + u8 port_num; struct opa_port_error_counters64_msg *req; struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u32 num_ports; @@ -2688,7 +2803,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, struct hfi1_pportdata *ppd; struct _vls_ectrs *vlinfo; unsigned long vl; - u64 port_mask, tmp, tmp2; + u64 port_mask, tmp; u32 vl_select_mask; int vfi; @@ -2717,62 +2832,34 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, */ port_mask = be64_to_cpu(req->port_select_mask[3]); port_num = find_first_bit((unsigned long *)&port_mask, - sizeof(port_mask)); + sizeof(port_mask)); - if ((u8)port_num != port) { + if (port_num != port) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } - rsp = (struct _port_ectrs *)&(req->port[0]); + rsp = (struct _port_ectrs *)&req->port[0]; ibp = to_iport(ibdev, port_num); ppd = ppd_from_ibp(ibp); memset(rsp, 0, sizeof(*rsp)); - rsp->port_number = (u8)port_num; + rsp->port_number = port_num; + + pma_get_opa_port_ectrs(ibdev, rsp, port_num); - rsp->port_rcv_constraint_errors = - cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, - CNTR_INVALID_VL)); - /* port_rcv_switch_relay_errors is 0 for HFIs */ - rsp->port_xmit_discards = - cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, - CNTR_INVALID_VL)); rsp->port_rcv_remote_physical_errors = cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, - CNTR_INVALID_VL)); - tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); - if (tmp2 < tmp) { - /* overflow/wrapped */ - rsp->local_link_integrity_errors = cpu_to_be64(~0); - } else { - rsp->local_link_integrity_errors = cpu_to_be64(tmp2); - } - tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); - tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, - CNTR_INVALID_VL); - if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { - /* overflow/wrapped */ - rsp->link_error_recovery = cpu_to_be32(~0); - } else { - rsp->link_error_recovery = cpu_to_be32(tmp2); - } - rsp->port_xmit_constraint_errors = - cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, - CNTR_INVALID_VL)); - rsp->excessive_buffer_overruns = - cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + CNTR_INVALID_VL)); rsp->fm_config_errors = cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, - CNTR_INVALID_VL)); - rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, - CNTR_INVALID_VL)); + CNTR_INVALID_VL)); tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; - vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]); + vlinfo = (struct _vls_ectrs *)&rsp->vls[0]; vfi = 0; vl_select_mask = be32_to_cpu(req->vl_select_mask); for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), @@ -2789,8 +2876,94 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, return reply((struct ib_mad_hdr *)pmp); } +static int pma_get_ib_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct _port_ectrs rsp; + u64 temp_link_overrun_errors; + u64 temp_64; + u32 temp_32; + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_ectrs(ibdev, &rsp, port); + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + p->symbol_error_counter = 0; /* N/A for OPA */ + + temp_32 = be32_to_cpu(rsp.link_error_recovery); + if (temp_32 > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = (u8)temp_32; + + temp_32 = be32_to_cpu(rsp.link_downed); + if (temp_32 > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)temp_32; + + temp_64 = be64_to_cpu(rsp.port_rcv_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors); + p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_discards); + if (temp_64 > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_xmit_constraint_errors = 0xFF; + else + p->port_xmit_constraint_errors = (u8)temp_64; + + temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_rcv_constraint_errors = 0xFFUL; + else + p->port_rcv_constraint_errors = (u8)temp_64; + + /* LocalLink: 7:4, BufferOverrun: 3:0 */ + temp_64 = be64_to_cpu(rsp.local_link_integrity_errors); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + + temp_link_overrun_errors = temp_64 << 4; + + temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + temp_link_overrun_errors |= temp_64; + + p->link_overrun_errors = (u8)temp_link_overrun_errors; + + p->vl15_dropped = 0; /* N/A for OPA */ + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { size_t response_data_size; struct _port_ei *rsp; @@ -2798,12 +2971,12 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u64 port_mask; u32 num_ports; - unsigned long port_num; + u8 port_num; u8 num_pslm; u64 reg; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = (struct _port_ei *)&(req->port[0]); + rsp = (struct _port_ei *)&req->port[0]; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); @@ -2831,7 +3004,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, port_num = find_first_bit((unsigned long *)&port_mask, sizeof(port_mask)); - if ((u8)port_num != port) { + if (port_num != port) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } @@ -2840,15 +3013,17 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, rsp->port_rcv_ei.status_and_code = dd->err_info_rcvport.status_and_code; memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1, - &dd->err_info_rcvport.packet_flit1, sizeof(u64)); + &dd->err_info_rcvport.packet_flit1, sizeof(u64)); memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2, - &dd->err_info_rcvport.packet_flit2, sizeof(u64)); + &dd->err_info_rcvport.packet_flit2, sizeof(u64)); /* ExcessiverBufferOverrunInfo */ reg = read_csr(dd, RCV_ERR_INFO); if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) { - /* if the RcvExcessBufferOverrun bit is set, save SC of - * first pkt that encountered an excess buffer overrun */ + /* + * if the RcvExcessBufferOverrun bit is set, save SC of + * first pkt that encountered an excess buffer overrun + */ u8 tmp = (u8)reg; tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK; @@ -2885,7 +3060,8 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, } static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { struct opa_clear_port_status *req = (struct opa_clear_port_status *)pmp->data; @@ -2944,8 +3120,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0); /* Only applicable for switch */ - /*if (counter_select & CS_PORT_MARK_FECN) - write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/ + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0); + */ if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS) write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0); @@ -2968,7 +3145,7 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_LINK_ERROR_RECOVERY) { write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, - CNTR_INVALID_VL, 0); + CNTR_INVALID_VL, 0); } if (counter_select & CS_PORT_RCV_ERRORS) @@ -2990,7 +3167,6 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), 8 * sizeof(vl_select_mask)) { - if (counter_select & CS_PORT_XMIT_DATA) write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0); @@ -3019,9 +3195,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_PORT_RCV_BUBBLE) write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0); - /*if (counter_select & CS_PORT_MARK_FECN) - write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); - */ + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); + */ /* port_vl_xmit_discards ??? */ } @@ -3032,19 +3208,20 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, } static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, - struct ib_device *ibdev, u8 port, u32 *resp_len) + struct ib_device *ibdev, + u8 port, u32 *resp_len) { struct _port_ei *rsp; struct opa_port_error_info_msg *req; struct hfi1_devdata *dd = dd_from_ibdev(ibdev); u64 port_mask; u32 num_ports; - unsigned long port_num; + u8 port_num; u8 num_pslm; u32 error_info_select; req = (struct opa_port_error_info_msg *)pmp->data; - rsp = (struct _port_ei *)&(req->port[0]); + rsp = (struct _port_ei *)&req->port[0]; num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); @@ -3064,7 +3241,7 @@ static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, port_num = find_first_bit((unsigned long *)&port_mask, sizeof(port_mask)); - if ((u8)port_num != port) { + if (port_num != port) { pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } @@ -3078,8 +3255,10 @@ static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, /* ExcessiverBufferOverrunInfo */ if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO) - /* status bit is essentially kept in the h/w - bit 5 of - * RCV_ERR_INFO */ + /* + * status bit is essentially kept in the h/w - bit 5 of + * RCV_ERR_INFO + */ write_csr(dd, RCV_ERR_INFO, RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); @@ -3131,13 +3310,12 @@ static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, } static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, - u8 *data, - struct ib_device *ibdev, - u8 port, u32 *resp_len) + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) { int i; struct opa_congestion_setting_attr *p = - (struct opa_congestion_setting_attr *) data; + (struct opa_congestion_setting_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct opa_congestion_setting_entry_shadow *entries; @@ -3147,7 +3325,7 @@ static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, cc_state = get_cc_state(ppd); - if (cc_state == NULL) { + if (!cc_state) { rcu_read_unlock(); return reply((struct ib_mad_hdr *)smp); } @@ -3176,7 +3354,7 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, u32 *resp_len) { struct opa_congestion_setting_attr *p = - (struct opa_congestion_setting_attr *) data; + (struct opa_congestion_setting_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct opa_congestion_setting_entry_shadow *entries; @@ -3238,7 +3416,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, continue; memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3); memcpy(cong_log->events[i].remote_qp_number_cn_entry, - &cce->rqpn, 3); + &cce->rqpn, 3); cong_log->events[i].sl_svc_type_cn_entry = ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7); cong_log->events[i].remote_lid_cn_entry = @@ -3268,7 +3446,7 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, u32 *resp_len) { struct ib_cc_table_attr *cc_table_attr = - (struct ib_cc_table_attr *) data; + (struct ib_cc_table_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); u32 start_block = OPA_AM_START_BLK(am); @@ -3289,7 +3467,7 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, cc_state = get_cc_state(ppd); - if (cc_state == NULL) { + if (!cc_state) { rcu_read_unlock(); return reply((struct ib_mad_hdr *)smp); } @@ -3309,7 +3487,7 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, rcu_read_unlock(); if (resp_len) - *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1); + *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); return reply((struct ib_mad_hdr *)smp); } @@ -3325,7 +3503,7 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, u32 *resp_len) { - struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data; + struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data; struct hfi1_ibport *ibp = to_iport(ibdev, port); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); u32 start_block = OPA_AM_START_BLK(am); @@ -3355,14 +3533,14 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, } new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); - if (new_cc_state == NULL) + if (!new_cc_state) goto getit; spin_lock(&ppd->cc_state_lock); old_cc_state = get_cc_state(ppd); - if (old_cc_state == NULL) { + if (!old_cc_state) { spin_unlock(&ppd->cc_state_lock); kfree(new_cc_state); return reply((struct ib_mad_hdr *)smp); @@ -3402,26 +3580,31 @@ struct opa_led_info { }; #define OPA_LED_SHIFT 31 -#define OPA_LED_MASK (1 << OPA_LED_SHIFT) +#define OPA_LED_MASK BIT(OPA_LED_SHIFT) static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, u32 *resp_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - struct opa_led_info *p = (struct opa_led_info *) data; + struct hfi1_pportdata *ppd = dd->pport; + struct opa_led_info *p = (struct opa_led_info *)data; u32 nport = OPA_AM_NPORT(am); - u64 reg; + u32 is_beaconing_active; if (nport != 1) { smp->status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)smp); } - reg = read_csr(dd, DCC_CFG_LED_CNTRL); - if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) && - ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf)) - p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK); + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT); if (resp_len) *resp_len += sizeof(struct opa_led_info); @@ -3434,7 +3617,7 @@ static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, u32 *resp_len) { struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - struct opa_led_info *p = (struct opa_led_info *) data; + struct opa_led_info *p = (struct opa_led_info *)data; u32 nport = OPA_AM_NPORT(am); int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); @@ -3443,7 +3626,10 @@ static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - setextled(dd, on); + if (on) + hfi1_start_led_override(dd->pport, 2000, 1500); + else + shutdown_led_override(dd->pport); return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len); } @@ -3486,7 +3672,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, break; case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, - resp_len); + resp_len); break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_get_opa_psi(smp, am, data, ibdev, port, @@ -3525,9 +3711,9 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, resp_len); break; case IB_SMP_ATTR_SM_INFO: - if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; - if (ibp->port_cap_flags & IB_PORT_SM) + if (ibp->rvp.port_cap_flags & IB_PORT_SM) return IB_MAD_RESULT_SUCCESS; /* FALLTHROUGH */ default: @@ -3568,7 +3754,7 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, break; case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, - resp_len); + resp_len); break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_set_opa_psi(smp, am, data, ibdev, port, @@ -3595,9 +3781,9 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, resp_len); break; case IB_SMP_ATTR_SM_INFO: - if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; - if (ibp->port_cap_flags & IB_PORT_SM) + if (ibp->rvp.port_cap_flags & IB_PORT_SM) return IB_MAD_RESULT_SUCCESS; /* FALLTHROUGH */ default: @@ -3647,14 +3833,13 @@ static int subn_get_opa_aggregate(struct opa_smp *smp, /* zero the payload for this segment */ memset(next_smp + sizeof(*agg), 0, agg_data_len); - (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data, + (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data, ibdev, port, NULL); if (smp->status & ~IB_SMP_DIRECTION) { set_aggr_error(agg); return reply((struct ib_mad_hdr *)smp); } next_smp += agg_size; - } return reply((struct ib_mad_hdr *)smp); @@ -3691,14 +3876,13 @@ static int subn_set_opa_aggregate(struct opa_smp *smp, return reply((struct ib_mad_hdr *)smp); } - (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data, + (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, ibdev, port, NULL); if (smp->status & ~IB_SMP_DIRECTION) { set_aggr_error(agg); return reply((struct ib_mad_hdr *)smp); } next_smp += agg_size; - } return reply((struct ib_mad_hdr *)smp); @@ -3816,7 +4000,7 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, if (smp->class_version != OPA_SMI_CLASS_VERSION) { smp->status |= IB_SMP_UNSUP_VERSION; ret = reply((struct ib_mad_hdr *)smp); - goto bail; + return ret; } ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey, smp->route.dr.dr_slid, smp->route.dr.return_path, @@ -3836,13 +4020,13 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, smp->method == IB_MGMT_METHOD_SET) && port_num && port_num <= ibdev->phys_port_cnt && port != port_num) - (void) check_mkey(to_iport(ibdev, port_num), + (void)check_mkey(to_iport(ibdev, port_num), (struct ib_mad_hdr *)smp, 0, smp->mkey, smp->route.dr.dr_slid, smp->route.dr.return_path, smp->hop_cnt); ret = IB_MAD_RESULT_FAILURE; - goto bail; + return ret; } *resp_len = opa_get_smp_header_size(smp); @@ -3854,23 +4038,25 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, clear_opa_smp_data(smp); ret = subn_get_opa_sma(attr_id, smp, am, data, ibdev, port, resp_len); - goto bail; + break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_get_opa_aggregate(smp, ibdev, port, resp_len); - goto bail; + break; } + break; case IB_MGMT_METHOD_SET: switch (attr_id) { default: ret = subn_set_opa_sma(attr_id, smp, am, data, ibdev, port, resp_len); - goto bail; + break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_set_opa_aggregate(smp, ibdev, port, resp_len); - goto bail; + break; } + break; case IB_MGMT_METHOD_TRAP: case IB_MGMT_METHOD_REPORT: case IB_MGMT_METHOD_REPORT_RESP: @@ -3881,13 +4067,13 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, * Just tell the caller to process it normally. */ ret = IB_MAD_RESULT_SUCCESS; - goto bail; + break; default: smp->status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_mad_hdr *)smp); + break; } -bail: return ret; } @@ -3903,7 +4089,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, if (smp->class_version != 1) { smp->status |= IB_SMP_UNSUP_VERSION; ret = reply((struct ib_mad_hdr *)smp); - goto bail; + return ret; } ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, @@ -3924,13 +4110,13 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, smp->method == IB_MGMT_METHOD_SET) && port_num && port_num <= ibdev->phys_port_cnt && port != port_num) - (void) check_mkey(to_iport(ibdev, port_num), - (struct ib_mad_hdr *)smp, 0, - smp->mkey, - (__force __be32)smp->dr_slid, - smp->return_path, smp->hop_cnt); + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, + (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); ret = IB_MAD_RESULT_FAILURE; - goto bail; + return ret; } switch (smp->method) { @@ -3938,15 +4124,77 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, switch (smp->attr_id) { case IB_SMP_ATTR_NODE_INFO: ret = subn_get_nodeinfo(smp, ibdev, port); - goto bail; + break; default: smp->status |= IB_SMP_UNSUP_METH_ATTR; ret = reply((struct ib_mad_hdr *)smp); - goto bail; + break; } + break; + } + + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + struct ib_class_port_info *cpi = (struct ib_class_port_info *) + &pmp->data; + int ret = IB_MAD_RESULT_FAILURE; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)pmp); + return ret; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_COUNTERS: + ret = pma_get_ib_portcounters(pmp, ibdev, port); + break; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_ib_portcounters_ext(pmp, ibdev, port); + break; + case IB_PMA_CLASS_PORT_INFO: + cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + ret = reply((struct ib_mad_hdr *)pmp); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + if (pmp->mad_hdr.attr_id) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; } -bail: return ret; } @@ -3971,44 +4219,46 @@ static int process_perf_opa(struct ib_device *ibdev, u8 port, switch (pmp->mad_hdr.attr_id) { case IB_PMA_CLASS_PORT_INFO: ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len); - goto bail; + break; case OPA_PM_ATTRIB_ID_PORT_STATUS: ret = pma_get_opa_portstatus(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS: ret = pma_get_opa_datacounters(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS: ret = pma_get_opa_porterrors(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; case OPA_PM_ATTRIB_ID_ERROR_INFO: ret = pma_get_opa_errorinfo(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; default: pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; ret = reply((struct ib_mad_hdr *)pmp); - goto bail; + break; } + break; case IB_MGMT_METHOD_SET: switch (pmp->mad_hdr.attr_id) { case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS: ret = pma_set_opa_portstatus(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; case OPA_PM_ATTRIB_ID_ERROR_INFO: ret = pma_set_opa_errorinfo(pmp, ibdev, port, - resp_len); - goto bail; + resp_len); + break; default: pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; ret = reply((struct ib_mad_hdr *)pmp); - goto bail; + break; } + break; case IB_MGMT_METHOD_TRAP: case IB_MGMT_METHOD_GET_RESP: @@ -4018,14 +4268,14 @@ static int process_perf_opa(struct ib_device *ibdev, u8 port, * Just tell the caller to process it normally. */ ret = IB_MAD_RESULT_SUCCESS; - goto bail; + break; default: pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_mad_hdr *)pmp); + break; } -bail: return ret; } @@ -4090,12 +4340,15 @@ static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port, case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: case IB_MGMT_CLASS_SUBN_LID_ROUTED: ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); - goto bail; + break; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + break; default: ret = IB_MAD_RESULT_SUCCESS; + break; } -bail: return ret; } @@ -4147,66 +4400,3 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, return IB_MAD_RESULT_FAILURE; } - -static void send_handler(struct ib_mad_agent *agent, - struct ib_mad_send_wc *mad_send_wc) -{ - ib_free_send_mad(mad_send_wc->send_buf); -} - -int hfi1_create_agents(struct hfi1_ibdev *dev) -{ - struct hfi1_devdata *dd = dd_from_dev(dev); - struct ib_mad_agent *agent; - struct hfi1_ibport *ibp; - int p; - int ret; - - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, - NULL, 0, send_handler, - NULL, NULL, 0); - if (IS_ERR(agent)) { - ret = PTR_ERR(agent); - goto err; - } - - ibp->send_agent = agent; - } - - return 0; - -err: - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - if (ibp->send_agent) { - agent = ibp->send_agent; - ibp->send_agent = NULL; - ib_unregister_mad_agent(agent); - } - } - - return ret; -} - -void hfi1_free_agents(struct hfi1_ibdev *dev) -{ - struct hfi1_devdata *dd = dd_from_dev(dev); - struct ib_mad_agent *agent; - struct hfi1_ibport *ibp; - int p; - - for (p = 0; p < dd->num_pports; p++) { - ibp = &dd->pport[p].ibport_data; - if (ibp->send_agent) { - agent = ibp->send_agent; - ibp->send_agent = NULL; - ib_unregister_mad_agent(agent); - } - if (ibp->sm_ah) { - ib_destroy_ah(&ibp->sm_ah->ibah); - ibp->sm_ah = NULL; - } - } -} diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h index f0317750e2fc..55ee08675333 100644 --- a/drivers/staging/rdma/hfi1/mad.h +++ b/drivers/staging/rdma/hfi1/mad.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -51,8 +48,10 @@ #define _HFI1_MAD_H #include <rdma/ib_pma.h> -#define USE_PI_LED_ENABLE 1 /* use led enabled bit in struct - * opa_port_states, if available */ +#define USE_PI_LED_ENABLE 1 /* + * use led enabled bit in struct + * opa_port_states, if available + */ #include <rdma/opa_smi.h> #include <rdma/opa_port_info.h> #ifndef PI_LED_ENABLE_SUP @@ -235,7 +234,6 @@ struct ib_pma_portcounters_cong { #define IB_CC_SVCTYPE_RD 0x2 #define IB_CC_SVCTYPE_UD 0x3 - /* * There should be an equivalent IB #define for the following, but * I cannot find it. @@ -267,7 +265,7 @@ struct opa_hfi1_cong_log { u8 congestion_flags; __be16 threshold_event_counter; __be32 current_time_stamp; - u8 threshold_cong_event_map[OPA_MAX_SLS/8]; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS]; } __packed; diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c new file mode 100644 index 000000000000..c7ad0164ea9a --- /dev/null +++ b/drivers/staging/rdma/hfi1/mmu_rb.c @@ -0,0 +1,292 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/list.h> +#include <linux/mmu_notifier.h> +#include <linux/interval_tree_generic.h> + +#include "mmu_rb.h" +#include "trace.h" + +struct mmu_rb_handler { + struct list_head list; + struct mmu_notifier mn; + struct rb_root *root; + spinlock_t lock; /* protect the RB tree */ + struct mmu_rb_ops *ops; +}; + +static LIST_HEAD(mmu_rb_handlers); +static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */ + +static unsigned long mmu_node_start(struct mmu_rb_node *); +static unsigned long mmu_node_last(struct mmu_rb_node *); +static struct mmu_rb_handler *find_mmu_handler(struct rb_root *); +static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, + unsigned long); +static inline void mmu_notifier_range_start(struct mmu_notifier *, + struct mm_struct *, + unsigned long, unsigned long); +static void mmu_notifier_mem_invalidate(struct mmu_notifier *, + unsigned long, unsigned long); +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, + unsigned long, unsigned long); + +static struct mmu_notifier_ops mn_opts = { + .invalidate_page = mmu_notifier_page, + .invalidate_range_start = mmu_notifier_range_start, +}; + +INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last, + mmu_node_start, mmu_node_last, static, __mmu_int_rb); + +static unsigned long mmu_node_start(struct mmu_rb_node *node) +{ + return node->addr & PAGE_MASK; +} + +static unsigned long mmu_node_last(struct mmu_rb_node *node) +{ + return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1; +} + +int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) +{ + struct mmu_rb_handler *handlr; + unsigned long flags; + + if (!ops->invalidate) + return -EINVAL; + + handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); + if (!handlr) + return -ENOMEM; + + handlr->root = root; + handlr->ops = ops; + INIT_HLIST_NODE(&handlr->mn.hlist); + spin_lock_init(&handlr->lock); + handlr->mn.ops = &mn_opts; + spin_lock_irqsave(&mmu_rb_lock, flags); + list_add_tail(&handlr->list, &mmu_rb_handlers); + spin_unlock_irqrestore(&mmu_rb_lock, flags); + + return mmu_notifier_register(&handlr->mn, current->mm); +} + +void hfi1_mmu_rb_unregister(struct rb_root *root) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + unsigned long flags; + + if (!handler) + return; + + spin_lock_irqsave(&mmu_rb_lock, flags); + list_del(&handler->list); + spin_unlock_irqrestore(&mmu_rb_lock, flags); + + if (!RB_EMPTY_ROOT(root)) { + struct rb_node *node; + struct mmu_rb_node *rbnode; + + while ((node = rb_first(root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase(node, root); + if (handler->ops->remove) + handler->ops->remove(root, rbnode, false); + } + } + + if (current->mm) + mmu_notifier_unregister(&handler->mn, current->mm); + kfree(handler); +} + +int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + int ret = 0; + + if (!handler) + return -EINVAL; + + spin_lock_irqsave(&handler->lock, flags); + hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr, + mnode->len); + node = __mmu_rb_search(handler, mnode->addr, mnode->len); + if (node) { + ret = -EINVAL; + goto unlock; + } + __mmu_int_rb_insert(mnode, root); + + if (handler->ops->insert) { + ret = handler->ops->insert(root, mnode); + if (ret) + __mmu_int_rb_remove(mnode, root); + } +unlock: + spin_unlock_irqrestore(&handler->lock, flags); + return ret; +} + +/* Caller must host handler lock */ +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, + unsigned long addr, + unsigned long len) +{ + struct mmu_rb_node *node = NULL; + + hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len); + if (!handler->ops->filter) { + node = __mmu_int_rb_iter_first(handler->root, addr, + (addr + len) - 1); + } else { + for (node = __mmu_int_rb_iter_first(handler->root, addr, + (addr + len) - 1); + node; + node = __mmu_int_rb_iter_next(node, addr, + (addr + len) - 1)) { + if (handler->ops->filter(node, addr, len)) + return node; + } + } + return node; +} + +static void __mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *node, bool arg) +{ + /* Validity of handler and node pointers has been checked by caller. */ + hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, + node->len); + __mmu_int_rb_remove(node, handler->root); + if (handler->ops->remove) + handler->ops->remove(handler->root, node, arg); +} + +struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, + unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + +void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + unsigned long flags; + + if (!handler || !node) + return; + + spin_lock_irqsave(&handler->lock, flags); + __mmu_rb_remove(handler, node, false); + spin_unlock_irqrestore(&handler->lock, flags); +} + +static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) +{ + struct mmu_rb_handler *handler; + unsigned long flags; + + spin_lock_irqsave(&mmu_rb_lock, flags); + list_for_each_entry(handler, &mmu_rb_handlers, list) { + if (handler->root == root) + goto unlock; + } + handler = NULL; +unlock: + spin_unlock_irqrestore(&mmu_rb_lock, flags); + return handler; +} + +static inline void mmu_notifier_page(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long addr) +{ + mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE); +} + +static inline void mmu_notifier_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + mmu_notifier_mem_invalidate(mn, start, end); +} + +static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, + unsigned long start, unsigned long end) +{ + struct mmu_rb_handler *handler = + container_of(mn, struct mmu_rb_handler, mn); + struct rb_root *root = handler->root; + struct mmu_rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + for (node = __mmu_int_rb_iter_first(root, start, end - 1); node; + node = __mmu_int_rb_iter_next(node, start, end - 1)) { + hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", + node->addr, node->len); + if (handler->ops->invalidate(root, node)) + __mmu_rb_remove(handler, node, true); + } + spin_unlock_irqrestore(&handler->lock, flags); +} diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h new file mode 100644 index 000000000000..f8523fdb8a18 --- /dev/null +++ b/drivers/staging/rdma/hfi1/mmu_rb.h @@ -0,0 +1,73 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MMU_RB_H +#define _HFI1_MMU_RB_H + +#include "hfi.h" + +struct mmu_rb_node { + unsigned long addr; + unsigned long len; + unsigned long __last; + struct rb_node node; +}; + +struct mmu_rb_ops { + bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long); + int (*insert)(struct rb_root *, struct mmu_rb_node *); + void (*remove)(struct rb_root *, struct mmu_rb_node *, bool); + int (*invalidate)(struct rb_root *, struct mmu_rb_node *); +}; + +int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); +void hfi1_mmu_rb_unregister(struct rb_root *); +int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); +void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); +struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, + unsigned long); + +#endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c deleted file mode 100644 index a3f8b884fdd6..000000000000 --- a/drivers/staging/rdma/hfi1/mr.c +++ /dev/null @@ -1,473 +0,0 @@ -/* - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <rdma/ib_umem.h> -#include <rdma/ib_smi.h> - -#include "hfi.h" - -/* Fast memory region */ -struct hfi1_fmr { - struct ib_fmr ibfmr; - struct hfi1_mregion mr; /* must be last */ -}; - -static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct hfi1_fmr, ibfmr); -} - -static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd, - int count) -{ - int m, i = 0; - int rval = 0; - - m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ; - for (; i < m; i++) { - mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); - if (!mr->map[i]) - goto bail; - } - mr->mapsz = m; - init_completion(&mr->comp); - /* count returning the ptr to user */ - atomic_set(&mr->refcount, 1); - mr->pd = pd; - mr->max_segs = count; -out: - return rval; -bail: - while (i) - kfree(mr->map[--i]); - rval = -ENOMEM; - goto out; -} - -static void deinit_mregion(struct hfi1_mregion *mr) -{ - int i = mr->mapsz; - - mr->mapsz = 0; - while (i) - kfree(mr->map[--i]); -} - - -/** - * hfi1_get_dma_mr - get a DMA memory region - * @pd: protection domain for this memory region - * @acc: access flags - * - * Returns the memory region on success, otherwise returns an errno. - * Note that all DMA addresses should be created via the - * struct ib_dma_mapping_ops functions (see dma.c). - */ -struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct hfi1_mr *mr = NULL; - struct ib_mr *ret; - int rval; - - if (to_ipd(pd)->user) { - ret = ERR_PTR(-EPERM); - goto bail; - } - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - rval = init_mregion(&mr->mr, pd, 0); - if (rval) { - ret = ERR_PTR(rval); - goto bail; - } - - - rval = hfi1_alloc_lkey(&mr->mr, 1); - if (rval) { - ret = ERR_PTR(rval); - goto bail_mregion; - } - - mr->mr.access_flags = acc; - ret = &mr->ibmr; -done: - return ret; - -bail_mregion: - deinit_mregion(&mr->mr); -bail: - kfree(mr); - goto done; -} - -static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd) -{ - struct hfi1_mr *mr; - int rval = -ENOMEM; - int m; - - /* Allocate struct plus pointers to first level page tables. */ - m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ; - mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); - if (!mr) - goto bail; - - rval = init_mregion(&mr->mr, pd, count); - if (rval) - goto bail; - - rval = hfi1_alloc_lkey(&mr->mr, 0); - if (rval) - goto bail_mregion; - mr->ibmr.lkey = mr->mr.lkey; - mr->ibmr.rkey = mr->mr.lkey; -done: - return mr; - -bail_mregion: - deinit_mregion(&mr->mr); -bail: - kfree(mr); - mr = ERR_PTR(rval); - goto done; -} - -/** - * hfi1_reg_user_mr - register a userspace memory region - * @pd: protection domain for this memory region - * @start: starting userspace address - * @length: length of region to register - * @mr_access_flags: access flags for this memory region - * @udata: unused by the driver - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt_addr, int mr_access_flags, - struct ib_udata *udata) -{ - struct hfi1_mr *mr; - struct ib_umem *umem; - struct scatterlist *sg; - int n, m, entry; - struct ib_mr *ret; - - if (length == 0) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - - umem = ib_umem_get(pd->uobject->context, start, length, - mr_access_flags, 0); - if (IS_ERR(umem)) - return (void *) umem; - - n = umem->nmap; - - mr = alloc_mr(n, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - ib_umem_release(umem); - goto bail; - } - - mr->mr.user_base = start; - mr->mr.iova = virt_addr; - mr->mr.length = length; - mr->mr.offset = ib_umem_offset(umem); - mr->mr.access_flags = mr_access_flags; - mr->umem = umem; - - if (is_power_of_2(umem->page_size)) - mr->mr.page_shift = ilog2(umem->page_size); - m = 0; - n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - void *vaddr; - - vaddr = page_address(sg_page(sg)); - if (!vaddr) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - mr->mr.map[m]->segs[n].vaddr = vaddr; - mr->mr.map[m]->segs[n].length = umem->page_size; - n++; - if (n == HFI1_SEGSZ) { - m++; - n = 0; - } - } - ret = &mr->ibmr; - -bail: - return ret; -} - -/** - * hfi1_dereg_mr - unregister and free a memory region - * @ibmr: the memory region to free - * - * Returns 0 on success. - * - * Note that this is called to free MRs created by hfi1_get_dma_mr() - * or hfi1_reg_user_mr(). - */ -int hfi1_dereg_mr(struct ib_mr *ibmr) -{ - struct hfi1_mr *mr = to_imr(ibmr); - int ret = 0; - unsigned long timeout; - - hfi1_free_lkey(&mr->mr); - - hfi1_put_mr(&mr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&mr->mr.comp, - 5 * HZ); - if (!timeout) { - dd_dev_err( - dd_from_ibdev(mr->mr.pd->device), - "hfi1_dereg_mr timeout mr %p pd %p refcount %u\n", - mr, mr->mr.pd, atomic_read(&mr->mr.refcount)); - hfi1_get_mr(&mr->mr); - ret = -EBUSY; - goto out; - } - deinit_mregion(&mr->mr); - if (mr->umem) - ib_umem_release(mr->umem); - kfree(mr); -out: - return ret; -} - -/* - * Allocate a memory region usable with the - * IB_WR_REG_MR send work request. - * - * Return the memory region on success, otherwise return an errno. - * FIXME: IB_WR_REG_MR is not supported - */ -struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, - u32 max_num_sg) -{ - struct hfi1_mr *mr; - - if (mr_type != IB_MR_TYPE_MEM_REG) - return ERR_PTR(-EINVAL); - - mr = alloc_mr(max_num_sg, pd); - if (IS_ERR(mr)) - return (struct ib_mr *)mr; - - return &mr->ibmr; -} - -/** - * hfi1_alloc_fmr - allocate a fast memory region - * @pd: the protection domain for this memory region - * @mr_access_flags: access flags for this memory region - * @fmr_attr: fast memory region attributes - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct hfi1_fmr *fmr; - int m; - struct ib_fmr *ret; - int rval = -ENOMEM; - - /* Allocate struct plus pointers to first level page tables. */ - m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ; - fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); - if (!fmr) - goto bail; - - rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages); - if (rval) - goto bail; - - /* - * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & - * rkey. - */ - rval = hfi1_alloc_lkey(&fmr->mr, 0); - if (rval) - goto bail_mregion; - fmr->ibfmr.rkey = fmr->mr.lkey; - fmr->ibfmr.lkey = fmr->mr.lkey; - /* - * Resources are allocated but no valid mapping (RKEY can't be - * used). - */ - fmr->mr.access_flags = mr_access_flags; - fmr->mr.max_segs = fmr_attr->max_pages; - fmr->mr.page_shift = fmr_attr->page_shift; - - ret = &fmr->ibfmr; -done: - return ret; - -bail_mregion: - deinit_mregion(&fmr->mr); -bail: - kfree(fmr); - ret = ERR_PTR(rval); - goto done; -} - -/** - * hfi1_map_phys_fmr - set up a fast memory region - * @ibmfr: the fast memory region to set up - * @page_list: the list of pages to associate with the fast memory region - * @list_len: the number of pages to associate with the fast memory region - * @iova: the virtual address of the start of the fast memory region - * - * This may be called from interrupt context. - */ - -int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) -{ - struct hfi1_fmr *fmr = to_ifmr(ibfmr); - struct hfi1_lkey_table *rkt; - unsigned long flags; - int m, n, i; - u32 ps; - int ret; - - i = atomic_read(&fmr->mr.refcount); - if (i > 2) - return -EBUSY; - - if (list_len > fmr->mr.max_segs) { - ret = -EINVAL; - goto bail; - } - rkt = &to_idev(ibfmr->device)->lk_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = iova; - fmr->mr.iova = iova; - ps = 1 << fmr->mr.page_shift; - fmr->mr.length = list_len * ps; - m = 0; - n = 0; - for (i = 0; i < list_len; i++) { - fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; - fmr->mr.map[m]->segs[n].length = ps; - if (++n == HFI1_SEGSZ) { - m++; - n = 0; - } - } - spin_unlock_irqrestore(&rkt->lock, flags); - ret = 0; - -bail: - return ret; -} - -/** - * hfi1_unmap_fmr - unmap fast memory regions - * @fmr_list: the list of fast memory regions to unmap - * - * Returns 0 on success. - */ -int hfi1_unmap_fmr(struct list_head *fmr_list) -{ - struct hfi1_fmr *fmr; - struct hfi1_lkey_table *rkt; - unsigned long flags; - - list_for_each_entry(fmr, fmr_list, ibfmr.list) { - rkt = &to_idev(fmr->ibfmr.device)->lk_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = 0; - fmr->mr.iova = 0; - fmr->mr.length = 0; - spin_unlock_irqrestore(&rkt->lock, flags); - } - return 0; -} - -/** - * hfi1_dealloc_fmr - deallocate a fast memory region - * @ibfmr: the fast memory region to deallocate - * - * Returns 0 on success. - */ -int hfi1_dealloc_fmr(struct ib_fmr *ibfmr) -{ - struct hfi1_fmr *fmr = to_ifmr(ibfmr); - int ret = 0; - unsigned long timeout; - - hfi1_free_lkey(&fmr->mr); - hfi1_put_mr(&fmr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&fmr->mr.comp, - 5 * HZ); - if (!timeout) { - hfi1_get_mr(&fmr->mr); - ret = -EBUSY; - goto out; - } - deinit_mregion(&fmr->mr); - kfree(fmr); -out: - return ret; -} diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h index f64eec1c2951..6ef3c1cbdcd7 100644 --- a/drivers/staging/rdma/hfi1/opa_compat.h +++ b/drivers/staging/rdma/hfi1/opa_compat.h @@ -1,14 +1,13 @@ #ifndef _LINUX_H #define _LINUX_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -111,19 +108,4 @@ enum opa_port_phys_state { /* values 12-15 are reserved/ignored */ }; -/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */ -#define OPA_PORT_TYPE_UNKNOWN 0 -#define OPA_PORT_TYPE_DISCONNECTED 1 -/* port is not currently usable, CableInfo not available */ -#define OPA_PORT_TYPE_FIXED 2 -/* A fixed backplane port in a director class switch. All OPA ASICS */ -#define OPA_PORT_TYPE_VARIABLE 3 -/* A backplane port in a blade system, possibly mixed configuration */ -#define OPA_PORT_TYPE_STANDARD 4 -/* implies a SFF-8636 defined format for CableInfo (QSFP) */ -#define OPA_PORT_TYPE_SI_PHOTONICS 5 -/* A silicon photonics module implies TBD defined format for CableInfo - * as defined by Intel SFO group */ -/* 6 - 15 are reserved */ - #endif /* _LINUX_H */ diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c index 8317b07d722a..42a409f16449 100644 --- a/drivers/staging/rdma/hfi1/pcie.c +++ b/drivers/staging/rdma/hfi1/pcie.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -57,6 +54,7 @@ #include "hfi.h" #include "chip_registers.h" +#include "aspm.h" /* link speed vector for Gen3 speed - not in Linux headers */ #define GEN1_SPEED_VECTOR 0x1 @@ -122,8 +120,9 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) goto bail; } ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - } else + } else { ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } if (ret) { hfi1_early_err(&pdev->dev, "Unable to set DMA consistent mask: %d\n", ret); @@ -131,13 +130,7 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - ret = pci_enable_pcie_error_reporting(pdev); - if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to enable pcie error reporting: %d\n", - ret); - ret = 0; - } + (void)pci_enable_pcie_error_reporting(pdev); goto done; bail: @@ -222,10 +215,9 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev, pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl); pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl); pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, - &dd->pcie_devctl2); + &dd->pcie_devctl2); pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); - pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, - &dd->pci_lnkctl3); + pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3); pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); return 0; @@ -238,7 +230,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev, */ void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) { - u64 __iomem *base = (void __iomem *) dd->kregbase; + u64 __iomem *base = (void __iomem *)dd->kregbase; dd->flags &= ~HFI1_PRESENT; dd->kregbase = NULL; @@ -276,7 +268,7 @@ void hfi1_pcie_flr(struct hfi1_devdata *dd) clear: pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL, - PCI_EXP_DEVCTL_BCR_FLR); + PCI_EXP_DEVCTL_BCR_FLR); /* PCIe spec requires the function to be back within 100ms */ msleep(100); } @@ -289,9 +281,11 @@ static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt, struct msix_entry *msix_entry; int i; - /* We can't pass hfi1_msix_entry array to msix_setup + /* + * We can't pass hfi1_msix_entry array to msix_setup * so use a dummy msix_entry array and copy the allocated - * irq back to the hfi1_msix_entry array. */ + * irq back to the hfi1_msix_entry array. + */ msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL); if (!msix_entry) { ret = -ENOMEM; @@ -321,7 +315,6 @@ do_intx: nvec, ret); *msixcnt = 0; hfi1_enable_intx(dd->pcidev); - } /* return the PCIe link speed from the given link status */ @@ -369,6 +362,7 @@ static void update_lbus_info(struct hfi1_devdata *dd) int pcie_speeds(struct hfi1_devdata *dd) { u32 linkcap; + struct pci_dev *parent = dd->pcidev->bus->self; if (!pci_is_pcie(dd->pcidev)) { dd_dev_err(dd, "Can't find PCI Express capability!\n"); @@ -381,15 +375,15 @@ int pcie_speeds(struct hfi1_devdata *dd) pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) { dd_dev_info(dd, - "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", - linkcap & PCI_EXP_LNKCAP_SLS); + "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", + linkcap & PCI_EXP_LNKCAP_SLS); dd->link_gen3_capable = 0; } /* * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed */ - if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { + if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); dd->link_gen3_capable = 0; } @@ -397,9 +391,7 @@ int pcie_speeds(struct hfi1_devdata *dd) /* obtain the link width and current speed */ update_lbus_info(dd); - /* check against expected pcie width and complain if "wrong" */ - if (dd->lbus_width < 16) - dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width); + dd_dev_info(dd, "%s\n", dd->lbus_info); return 0; } @@ -438,23 +430,18 @@ void hfi1_enable_intx(struct pci_dev *pdev) void restore_pci_variables(struct hfi1_devdata *dd) { pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); - pci_write_config_dword(dd->pcidev, - PCI_BASE_ADDRESS_0, dd->pcibar0); - pci_write_config_dword(dd->pcidev, - PCI_BASE_ADDRESS_1, dd->pcibar1); - pci_write_config_dword(dd->pcidev, - PCI_ROM_ADDRESS, dd->pci_rom); + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0); + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1); + pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl); pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl); pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, - dd->pcie_devctl2); + dd->pcie_devctl2); pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); - pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, - dd->pci_lnkctl3); + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3); pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); } - /* * BIOS may not set PCIe bus-utilization parameters for best performance. * Check and optionally adjust them to maximize our throughput. @@ -463,6 +450,10 @@ static int hfi1_pcie_caps; module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); +uint aspm_mode = ASPM_MODE_DISABLED; +module_param_named(aspm, aspm_mode, uint, S_IRUGO); +MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); + static void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; @@ -481,6 +472,12 @@ static void tune_pcie_caps(struct hfi1_devdata *dd) } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; + /* + * The driver cannot perform the tuning if it does not have + * access to the upstream component. + */ + if (!parent) + return; if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); return; @@ -534,6 +531,7 @@ static void tune_pcie_caps(struct hfi1_devdata *dd) pcie_set_readrq(dd->pcidev, ep_mrrs); } } + /* End of PCIe capability tuning */ /* @@ -748,21 +746,22 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div); c_plus1 = eq[i][POST] / div; pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, - eq_value(c_minus1, c0, c_plus1)); + eq_value(c_minus1, c0, c_plus1)); /* check if these coefficients violate EQ rules */ pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105, - &violation); + &violation); if (violation & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ if (hit_error == 0) { dd_dev_err(dd, - "Gen3 EQ Table Coefficient rule violations\n"); + "Gen3 EQ Table Coefficient rule violations\n"); dd_dev_err(dd, " prec attn post\n"); } dd_dev_err(dd, " p%02d: %02x %02x %02x\n", - i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]); + i, (u32)eq[i][0], (u32)eq[i][1], + (u32)eq[i][2]); dd_dev_err(dd, " %02x %02x %02x\n", - (u32)c_minus1, (u32)c0, (u32)c_plus1); + (u32)c_minus1, (u32)c0, (u32)c_plus1); hit_error = 1; } } @@ -774,7 +773,7 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, /* * Steps to be done after the PCIe firmware is downloaded and * before the SBR for the Pcie Gen3. - * The hardware mutex is already being held. + * The SBus resource is already being held. */ static void pcie_post_steps(struct hfi1_devdata *dd) { @@ -817,8 +816,8 @@ static int trigger_sbr(struct hfi1_devdata *dd) list_for_each_entry(pdev, &dev->bus->devices, bus_list) if (pdev != dev) { dd_dev_err(dd, - "%s: another device is on the same bus\n", - __func__); + "%s: another device is on the same bus\n", + __func__); return -ENOTTY; } @@ -842,8 +841,8 @@ static void write_gasket_interrupt(struct hfi1_devdata *dd, int index, u16 code, u16 data) { write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), - (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) - |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); + (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) | + ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); } /* @@ -853,25 +852,101 @@ static void arm_gasket_logic(struct hfi1_devdata *dd) { u64 reg; - reg = (((u64)1 << dd->hfi1_id) - << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) - | ((u64)pcie_serdes_broadcast[dd->hfi1_id] - << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT - | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK - | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) - << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT - ); + reg = (((u64)1 << dd->hfi1_id) << + ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) | + ((u64)pcie_serdes_broadcast[dd->hfi1_id] << + ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT | + ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK | + ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) << + ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT); write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg); /* read back to push the write */ read_csr(dd, ASIC_PCIE_SD_HOST_CMD); } /* + * CCE_PCIE_CTRL long name helpers + * We redefine these shorter macros to use in the code while leaving + * chip_registers.h to be autogenerated from the hardware spec. + */ +#define LANE_BUNDLE_MASK CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK +#define LANE_BUNDLE_SHIFT CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT +#define LANE_DELAY_MASK CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK +#define LANE_DELAY_SHIFT CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT +#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT +#define MARGIN_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_SHIFT +#define MARGIN_G1_G2_OVERWRITE_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK +#define MARGIN_G1_G2_OVERWRITE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT +#define MARGIN_GEN1_GEN2_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK +#define MARGIN_GEN1_GEN2_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT + + /* + * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C). + */ +static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname) +{ + u64 pcie_ctrl; + u64 xmt_margin; + u64 xmt_margin_oe; + u64 lane_delay; + u64 lane_bundle; + + pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL); + + /* + * For Discrete, use full-swing. + * - PCIe TX defaults to full-swing. + * Leave this register as default. + * For Integrated, use half-swing + * - Copy xmt_margin and xmt_margin_oe + * from Gen1/Gen2 to Gen3. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */ + /* extract initial fields */ + xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT) + & MARGIN_GEN1_GEN2_MASK; + xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT) + & MARGIN_G1_G2_OVERWRITE_MASK; + lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK; + lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT) + & LANE_BUNDLE_MASK; + + /* + * For A0, EFUSE values are not set. Override with the + * correct values. + */ + if (is_ax(dd)) { + /* + * xmt_margin and OverwiteEnabel should be the + * same for Gen1/Gen2 and Gen3 + */ + xmt_margin = 0x5; + xmt_margin_oe = 0x1; + lane_delay = 0xF; /* Delay 240ns. */ + lane_bundle = 0x0; /* Set to 1 lane. */ + } + + /* overwrite existing values */ + pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT) + | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT) + | (xmt_margin << MARGIN_SHIFT) + | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT) + | (lane_delay << LANE_DELAY_SHIFT) + | (lane_bundle << LANE_BUNDLE_SHIFT); + + write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl); + } + + dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n", + fname, pcie_ctrl); +} + +/* * Do all the steps needed to transition the PCIe link to Gen3 speed. */ int do_pcie_gen3_transition(struct hfi1_devdata *dd) { - struct pci_dev *parent; + struct pci_dev *parent = dd->pcidev->bus->self; u64 fw_ctrl; u64 reg, therm; u32 reg32, fs, lf; @@ -880,8 +955,7 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) int do_retry, retry_count = 0; uint default_pset; u16 target_vector, target_speed; - u16 lnkctl, lnkctl2, vendor; - u8 nsbr = 1; + u16 lnkctl2, vendor; u8 div; const u8 (*eq)[3]; int return_error = 0; @@ -908,17 +982,21 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) /* if already at target speed, done (unless forced) */ if (dd->lbus_speed == target_speed) { dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__, - pcie_target, - pcie_force ? "re-doing anyway" : "skipping"); + pcie_target, + pcie_force ? "re-doing anyway" : "skipping"); if (!pcie_force) return 0; } /* - * A0 needs an additional SBR + * The driver cannot do the transition if it has no access to the + * upstream component */ - if (is_ax(dd)) - nsbr++; + if (!parent) { + dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n", + __func__); + return 0; + } /* * Do the Gen3 transition. Steps are those of the PCIe Gen3 @@ -934,10 +1012,13 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) goto done_no_mutex; } - /* hold the HW mutex across the firmware download and SBR */ - ret = acquire_hw_mutex(dd); - if (ret) + /* hold the SBus resource across the firmware download and SBR */ + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "%s: unable to acquire SBus resource\n", + __func__); return ret; + } /* make sure thermal polling is not causing interrupts */ therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN); @@ -955,8 +1036,11 @@ retry: /* step 4: download PCIe Gen3 SerDes firmware */ dd_dev_info(dd, "%s: downloading firmware\n", __func__); ret = load_pcie_firmware(dd); - if (ret) + if (ret) { + /* do not proceed if the firmware cannot be downloaded */ + return_error = 1; goto done; + } /* step 5: set up device parameter settings */ dd_dev_info(dd, "%s: setting PCIe registers\n", __func__); @@ -986,7 +1070,7 @@ retry: * PcieCfgRegPl100 - Gen3 Control * * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl - * turn on PcieCfgRegPl100.EqEieosCnt (erratum) + * turn on PcieCfgRegPl100.EqEieosCnt * Everything else zero. */ reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK; @@ -1016,8 +1100,10 @@ retry: default_pset = DEFAULT_MCP_PSET; } pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, - (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) - | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); + (fs << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) | + (lf << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); ret = load_eq_table(dd, eq, fs, div); if (ret) goto done; @@ -1031,15 +1117,15 @@ retry: pcie_pset = default_pset; if (pcie_pset > 10) { /* valid range is 0-10, inclusive */ dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n", - __func__, pcie_pset, default_pset); + __func__, pcie_pset, default_pset); pcie_pset = default_pset; } dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset); pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106, - ((1 << pcie_pset) - << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) - | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK - | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); + ((1 << pcie_pset) << + PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) | + PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK | + PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); /* * step 5b: Do post firmware download steps via SBus @@ -1064,17 +1150,15 @@ retry: /* * step 5d: program XMT margin - * Right now, leave the default alone. To change, do a - * read-modify-write of: - * CcePcieCtrl.XmtMargin - * CcePcieCtrl.XmitMarginOverwriteEnable */ + write_xmt_margin(dd, __func__); - /* step 5e: disable active state power management (ASPM) */ + /* + * step 5e: disable active state power management (ASPM). It + * will be enabled if required later + */ dd_dev_info(dd, "%s: clearing ASPM\n", __func__); - pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl); - lnkctl &= ~PCI_EXP_LNKCTL_ASPMC; - pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl); + aspm_hw_disable_l1(dd); /* * step 5f: clear DirectSpeedChange @@ -1093,16 +1177,15 @@ retry: * that it is Gen3 capable earlier. */ dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); - parent = dd->pcidev->bus->self; pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, - (u32)lnkctl2); + (u32)lnkctl2); /* only write to parent if target is not as high as ours */ if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) { lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; lnkctl2 |= target_vector; dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); + (u32)lnkctl2); pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2); } else { dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); @@ -1111,17 +1194,17 @@ retry: dd_dev_info(dd, "%s: setting target link speed\n", __func__); pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, - (u32)lnkctl2); + (u32)lnkctl2); lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; lnkctl2 |= target_vector; dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, - (u32)lnkctl2); + (u32)lnkctl2); pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); /* step 5h: arm gasket logic */ /* hold DC in reset across the SBR */ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); - (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ + (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ /* save firmware control across the SBR */ fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL); @@ -1152,8 +1235,8 @@ retry: ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor); if (ret) { dd_dev_info(dd, - "%s: read of VendorID failed after SBR, err %d\n", - __func__, ret); + "%s: read of VendorID failed after SBR, err %d\n", + __func__, ret); return_error = 1; goto done; } @@ -1193,8 +1276,7 @@ retry: write_csr(dd, CCE_DC_CTRL, 0); /* Set the LED off */ - if (is_ax(dd)) - setextled(dd, 0); + setextled(dd, 0); /* check for any per-lane errors */ pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); @@ -1205,8 +1287,8 @@ retry: & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK; if ((status & (1 << dd->hfi1_id)) == 0) { dd_dev_err(dd, - "%s: gasket status 0x%x, expecting 0x%x\n", - __func__, status, 1 << dd->hfi1_id); + "%s: gasket status 0x%x, expecting 0x%x\n", + __func__, status, 1 << dd->hfi1_id); ret = -EIO; goto done; } @@ -1223,13 +1305,13 @@ retry: /* update our link information cache */ update_lbus_info(dd); dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, - dd->lbus_info); + dd->lbus_info); if (dd->lbus_speed != target_speed) { /* not target */ /* maybe retry */ do_retry = retry_count < pcie_retry; dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", - pcie_target, do_retry ? ", retrying" : ""); + pcie_target, do_retry ? ", retrying" : ""); retry_count++; if (do_retry) { msleep(100); /* allow time to settle */ @@ -1245,7 +1327,7 @@ done: dd_dev_info(dd, "%s: Re-enable therm polling\n", __func__); } - release_hw_mutex(dd); + release_chip_resource(dd, CR_SBUS); done_no_mutex: /* return no error if it is OK to be at current speed */ if (ret && !return_error) { diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c index b51a4416312b..c6849ce9e5eb 100644 --- a/drivers/staging/rdma/hfi1/pio.c +++ b/drivers/staging/rdma/hfi1/pio.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -101,7 +98,7 @@ void pio_send_control(struct hfi1_devdata *dd, int op) /* Fall through */ case PSC_DATA_VL_ENABLE: /* Disallow sending on VLs not enabled */ - mask = (((~0ull)<<num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK)<< + mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) << SEND_CTRL_UNSUPPORTED_VL_SHIFT; reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; break; @@ -130,7 +127,7 @@ void pio_send_control(struct hfi1_devdata *dd, int op) if (write) { write_csr(dd, SEND_CTRL, reg); if (flush) - (void) read_csr(dd, SEND_CTRL); /* flush write */ + (void)read_csr(dd, SEND_CTRL); /* flush write */ } spin_unlock_irqrestore(&dd->sendctrl_lock, flags); @@ -177,8 +174,10 @@ static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = { /* memory pool information, used when calculating final sizes */ struct mem_pool_info { - int centipercent; /* 100th of 1% of memory to use, -1 if blocks - already set */ + int centipercent; /* + * 100th of 1% of memory to use, -1 if blocks + * already set + */ int count; /* count of contexts in the pool */ int blocks; /* block size of the pool */ int size; /* context size, in blocks */ @@ -312,7 +311,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) if (i == SC_ACK) { count = dd->n_krcv_queues; } else if (i == SC_KERNEL) { - count = num_vls + 1 /* VL15 */; + count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */; } else if (count == SCC_PER_CPU) { count = dd->num_rcv_contexts - dd->n_krcv_queues; } else if (count < 0) { @@ -509,7 +508,7 @@ static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context) sci = &dd->send_contexts[sw_index]; if (!sci->allocated) { dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n", - __func__, sw_index, hw_context); + __func__, sw_index, hw_context); } sci->allocated = 0; dd->hw_to_sw[hw_context] = INVALID_SCI; @@ -625,7 +624,7 @@ void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) & SC(CREDIT_CTRL_THRESHOLD_MASK)) << SC(CREDIT_CTRL_THRESHOLD_SHIFT)); write_kctxt_csr(sc->dd, sc->hw_context, - SC(CREDIT_CTRL), sc->credit_ctrl); + SC(CREDIT_CTRL), sc->credit_ctrl); /* force a credit return on change to avoid a possible stall */ force_return = 1; @@ -700,7 +699,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, if (dd->flags & HFI1_FROZEN) return NULL; - sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa); + sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa); if (!sc) return NULL; @@ -763,9 +762,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, /* set the default partition key */ write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), - (DEFAULT_PKEY & - SC(CHECK_PARTITION_KEY_VALUE_MASK)) - << SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); + (SC(CHECK_PARTITION_KEY_VALUE_MASK) & + DEFAULT_PKEY) << + SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); /* per context type checks */ if (type == SC_USER) { @@ -778,8 +777,8 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, /* set the send context check opcode mask and value */ write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), - ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | - ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); + ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | + ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); /* set up credit return */ reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK); @@ -797,7 +796,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, thresh = sc_percent_to_threshold(sc, 50); } else if (type == SC_USER) { thresh = sc_percent_to_threshold(sc, - user_credit_return_threshold); + user_credit_return_threshold); } else { /* kernel */ thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize); } @@ -852,7 +851,6 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, sc->credit_ctrl, thresh); - return sc; } @@ -971,11 +969,11 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) if (loop > 500) { /* timed out - bounce the link */ dd_dev_err(dd, - "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", - __func__, sc->sw_index, - sc->hw_context, (u32)reg); + "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sc->sw_index, + sc->hw_context, (u32)reg); queue_work(dd->pport->hfi1_wq, - &dd->pport->link_bounce_work); + &dd->pport->link_bounce_work); break; } loop++; @@ -1021,7 +1019,7 @@ int sc_restart(struct send_context *sc) return -EINVAL; dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index, - sc->hw_context); + sc->hw_context); /* * Step 1: Wait for the context to actually halt. @@ -1036,7 +1034,7 @@ int sc_restart(struct send_context *sc) break; if (loop > 100) { dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n", - __func__, sc->sw_index, sc->hw_context); + __func__, sc->sw_index, sc->hw_context); return -ETIME; } loop++; @@ -1062,9 +1060,9 @@ int sc_restart(struct send_context *sc) break; if (loop > 100) { dd_dev_err(dd, - "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", - __func__, sc->sw_index, - sc->hw_context, count); + "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", + __func__, sc->sw_index, + sc->hw_context, count); } loop++; udelay(1); @@ -1177,18 +1175,18 @@ void pio_reset_all(struct hfi1_devdata *dd) if (ret == -EIO) { /* clear the error */ write_csr(dd, SEND_PIO_ERR_CLEAR, - SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); + SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); } /* reset init all */ write_csr(dd, SEND_PIO_INIT_CTXT, - SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); + SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); udelay(2); ret = pio_init_wait_progress(dd); if (ret < 0) { dd_dev_err(dd, - "PIO send context init %s while initializing all PIO blocks\n", - ret == -ETIMEDOUT ? "is stuck" : "had an error"); + "PIO send context init %s while initializing all PIO blocks\n", + ret == -ETIMEDOUT ? "is stuck" : "had an error"); } } @@ -1236,8 +1234,7 @@ int sc_enable(struct send_context *sc) */ reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS)); if (reg) - write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), - reg); + write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg); /* * The HW PIO initialization engine can handle only one init @@ -1295,7 +1292,7 @@ void sc_return_credits(struct send_context *sc) /* a 0->1 transition schedules a credit return */ write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), - SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); + SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); /* * Ensure that the write is flushed and the credit return is * scheduled. We care more about the 0 -> 1 transition. @@ -1321,7 +1318,7 @@ void sc_drop(struct send_context *sc) return; dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", - __func__, sc->sw_index, sc->hw_context); + __func__, sc->sw_index, sc->hw_context); } /* @@ -1346,7 +1343,7 @@ void sc_stop(struct send_context *sc, int flag) wake_up(&sc->halt_wait); } -#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32)) +#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32)) #define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS) /* @@ -1430,8 +1427,10 @@ retry: next = head + 1; if (next >= sc->sr_size) next = 0; - /* update the head - must be last! - the releaser can look at fields - in pbuf once we move the head */ + /* + * update the head - must be last! - the releaser can look at fields + * in pbuf once we move the head + */ smp_wmb(); sc->sr_head = next; spin_unlock_irqrestore(&sc->alloc_lock, flags); @@ -1469,7 +1468,7 @@ void sc_add_credit_return_intr(struct send_context *sc) if (sc->credit_intr_count == 0) { sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK); write_kctxt_csr(sc->dd, sc->hw_context, - SC(CREDIT_CTRL), sc->credit_ctrl); + SC(CREDIT_CTRL), sc->credit_ctrl); } sc->credit_intr_count++; spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); @@ -1491,7 +1490,7 @@ void sc_del_credit_return_intr(struct send_context *sc) if (sc->credit_intr_count == 0) { sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK); write_kctxt_csr(sc->dd, sc->hw_context, - SC(CREDIT_CTRL), sc->credit_ctrl); + SC(CREDIT_CTRL), sc->credit_ctrl); } spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); } @@ -1526,8 +1525,9 @@ static void sc_piobufavail(struct send_context *sc) struct hfi1_devdata *dd = sc->dd; struct hfi1_ibdev *dev = &dd->verbs_dev; struct list_head *list; - struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE]; - struct hfi1_qp *qp; + struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE]; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; unsigned long flags; unsigned i, n = 0; @@ -1545,24 +1545,28 @@ static void sc_piobufavail(struct send_context *sc) struct iowait *wait; if (n == ARRAY_SIZE(qps)) - goto full; + break; wait = list_first_entry(list, struct iowait, list); - qp = container_of(wait, struct hfi1_qp, s_iowait); - list_del_init(&qp->s_iowait.list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); /* refcount held until actual wake up */ qps[n++] = qp; } /* - * Counting: only call wantpiobuf_intr() if there were waiters and they - * are now all gone. + * If there had been waiters and there are more + * insure that we redo the force to avoid a potential hang. */ - if (n) + if (n) { hfi1_sc_wantpiobuf_intr(sc, 0); -full: + if (!list_empty(list)) + hfi1_sc_wantpiobuf_intr(sc, 1); + } write_sequnlock_irqrestore(&dev->iowait_lock, flags); for (i = 0; i < n; i++) - hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO); + hfi1_qp_wakeup(qps[i], + RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); } /* translate a send credit update to a bit code of reasons */ @@ -1661,7 +1665,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) sw_index = dd->hw_to_sw[hw_context]; if (unlikely(sw_index >= dd->num_send_contexts)) { dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n", - __func__, hw_context, sw_index); + __func__, hw_context, sw_index); goto done; } sc = dd->send_contexts[sw_index].sc; @@ -1674,8 +1678,8 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) sw_index = dd->hw_to_sw[gc]; if (unlikely(sw_index >= dd->num_send_contexts)) { dd_dev_err(dd, - "%s: invalid hw (%u) to sw (%u) mapping\n", - __func__, hw_context, sw_index); + "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); continue; } sc_release_update(dd->send_contexts[sw_index].sc); @@ -1684,11 +1688,217 @@ done: spin_unlock(&dd->sc_lock); } +/* + * pio_select_send_context_vl() - select send context + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns a send context based on the selector and a vl. + * The mapping fields are protected by RCU + */ +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct pio_vl_map *m; + struct pio_map_elem *e; + struct send_context *rval; + + /* + * NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return VL0's send context below + */ + if (unlikely(vl >= num_vls)) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->pio_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return dd->vld[0].sc; + } + e = m->map[vl & m->mask]; + rval = e->ksc[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? dd->vld[0].sc : rval; + return rval; +} + +/* + * pio_select_send_context_sc() - select send context + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * This function returns an send context based on the selector and an sc + */ +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return pio_select_send_context_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void pio_map_free(struct pio_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void pio_map_rcu_callback(struct rcu_head *list) +{ + struct pio_vl_map *m = container_of(list, struct pio_vl_map, list); + + pio_map_free(m); +} + +/* + * pio_map_init - called when #vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_scontexts: per vl send context mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_scontexts is used to specify a non-uniform vl/send context + * loading. NULL implies auto computing the loading and giving each + * VL an uniform distribution of send contexts per VL. + * + * The auto algorithm computers the sc_per_vl and the number of extra + * send contexts. Any extra send contexts are added from the last VL + * on down + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_send_contexts are non-power of 2, the + * array sizes in the struct pio_vl_map and the struct pio_map_elem are + * rounded up to the next highest power of 2 and the first entry is + * reused in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is not + * chaged. + * + */ +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) +{ + int i, j; + int extra, sc_per_vl; + int scontext = 1; + int num_kernel_send_contexts = 0; + u8 lvl_scontexts[OPA_MAX_VLS]; + struct pio_vl_map *oldmap, *newmap; + + if (!vl_scontexts) { + /* send context 0 reserved for VL15 */ + for (i = 1; i < dd->num_send_contexts; i++) + if (dd->send_contexts[i].type == SC_KERNEL) + num_kernel_send_contexts++; + /* truncate divide */ + sc_per_vl = num_kernel_send_contexts / num_vls; + /* extras */ + extra = num_kernel_send_contexts % num_vls; + vl_scontexts = lvl_scontexts; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc(sizeof(*newmap) + + roundup_pow_of_two(num_vls) * + sizeof(struct pio_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_scontext = scontext; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_scontexts[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) + + sz * sizeof(struct + send_context *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* assign send contexts */ + for (j = 0; j < sz; j++) { + if (dd->kernel_send_context[scontext]) + newmap->map[i]->ksc[j] = + dd->kernel_send_context[scontext]; + if (++scontext >= first_scontext + + vl_scontexts[i]) + /* wrap back to first send context */ + scontext = first_scontext; + } + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + scontext = first_scontext + vl_scontexts[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->pio_map_lock); + oldmap = rcu_dereference_protected(dd->pio_map, + lockdep_is_held(&dd->pio_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->pio_map, newmap); + + spin_unlock_irq(&dd->pio_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, pio_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + pio_map_free(newmap); + return -ENOMEM; +} + +void free_pio_map(struct hfi1_devdata *dd) +{ + /* Free PIO map if allocated */ + if (rcu_access_pointer(dd->pio_map)) { + spin_lock_irq(&dd->pio_map_lock); + pio_map_free(rcu_access_pointer(dd->pio_map)); + RCU_INIT_POINTER(dd->pio_map, NULL); + spin_unlock_irq(&dd->pio_map_lock); + synchronize_rcu(); + } + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; +} + int init_pervl_scs(struct hfi1_devdata *dd) { int i; - u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */ + u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */ + u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */ u32 ctxt; + struct hfi1_pportdata *ppd = dd->pport; dd->vld[15].sc = sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node); @@ -1696,6 +1906,12 @@ int init_pervl_scs(struct hfi1_devdata *dd) goto nomem; hfi1_init_ctxt(dd->vld[15].sc); dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); + + dd->kernel_send_context = kmalloc_node(dd->num_send_contexts * + sizeof(struct send_context *), + GFP_KERNEL, dd->node); + dd->kernel_send_context[0] = dd->vld[15].sc; + for (i = 0; i < num_vls; i++) { /* * Since this function does not deal with a specific @@ -1708,12 +1924,19 @@ int init_pervl_scs(struct hfi1_devdata *dd) dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[i].sc) goto nomem; - + dd->kernel_send_context[i + 1] = dd->vld[i].sc; hfi1_init_ctxt(dd->vld[i].sc); - /* non VL15 start with the max MTU */ dd->vld[i].mtu = hfi1_max_mtu; } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + dd->kernel_send_context[i + 1] = + sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->kernel_send_context[i + 1]) + goto nomem; + hfi1_init_ctxt(dd->kernel_send_context[i + 1]); + } + sc_enable(dd->vld[15].sc); ctxt = dd->vld[15].sc->hw_context; mask = all_vl_mask & ~(1LL << 15); @@ -1721,17 +1944,29 @@ int init_pervl_scs(struct hfi1_devdata *dd) dd_dev_info(dd, "Using send context %u(%u) for VL15\n", dd->vld[15].sc->sw_index, ctxt); + for (i = 0; i < num_vls; i++) { sc_enable(dd->vld[i].sc); ctxt = dd->vld[i].sc->hw_context; - mask = all_vl_mask & ~(1LL << i); + mask = all_vl_mask & ~(data_vls_mask); write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + sc_enable(dd->kernel_send_context[i + 1]); + ctxt = dd->kernel_send_context[i + 1]->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + + if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) + goto nomem; return 0; nomem: sc_free(dd->vld[15].sc); for (i = 0; i < num_vls; i++) sc_free(dd->vld[i].sc); + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) + sc_free(dd->kernel_send_context[i + 1]); return -ENOMEM; } @@ -1769,11 +2004,11 @@ int init_credit_return(struct hfi1_devdata *dd) bytes, &dd->cr_base[i].pa, GFP_KERNEL); - if (dd->cr_base[i].va == NULL) { + if (!dd->cr_base[i].va) { set_dev_node(&dd->pcidev->dev, dd->node); dd_dev_err(dd, - "Unable to allocate credit return DMA range for NUMA %d\n", - i); + "Unable to allocate credit return DMA range for NUMA %d\n", + i); ret = -ENOMEM; goto done; } @@ -1797,10 +2032,10 @@ void free_credit_return(struct hfi1_devdata *dd) for (i = 0; i < num_numa; i++) { if (dd->cr_base[i].va) { dma_free_coherent(&dd->pcidev->dev, - TXE_NUM_CONTEXTS - * sizeof(struct credit_return), - dd->cr_base[i].va, - dd->cr_base[i].pa); + TXE_NUM_CONTEXTS * + sizeof(struct credit_return), + dd->cr_base[i].va, + dd->cr_base[i].pa); } } kfree(dd->cr_base); diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h index 53d3e0a79375..0026976ce4f6 100644 --- a/drivers/staging/rdma/hfi1/pio.h +++ b/drivers/staging/rdma/hfi1/pio.h @@ -1,14 +1,13 @@ #ifndef _PIO_H #define _PIO_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -50,7 +47,6 @@ * */ - /* send context types */ #define SC_KERNEL 0 #define SC_ACK 1 @@ -106,6 +102,7 @@ struct send_context { struct hfi1_devdata *dd; /* device */ void __iomem *base_addr; /* start of PIO memory */ union pio_shadow_ring *sr; /* shadow ring */ + volatile __le64 *hw_free; /* HW free counter */ struct work_struct halt_work; /* halted context work queue entry */ unsigned long flags; /* flags */ @@ -165,6 +162,112 @@ struct sc_config_sizes { short int count; }; +/* + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform send contexts per vl, the + * number of send contexts for a vl is either the vl_scontexts[vl] or + * a computation based on num_kernel_send_contexts/num_vls: + * + * For example: + * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_kernel_send_contexts/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the send contexts are assigned + * in a round robin fashion wrapping back to the first send context + * for a particular vl. + * + * dd->pio_map + * | pio_map_elem[0] + * | +--------------------+ + * v | mask | + * pio_vl_map |--------------------| + * +--------------------------+ | ksc[0] -> sc 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| ksc[1] -> sc 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | ksc[n] -> sc n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | ksc[0] -> sc 1+n | + * | * | \---- |--------------------| + * | * | \->| ksc[1] -> sc 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | ksc[m] -> sc m+n | + * \ +--------------------+ + * \- + * \ + * \- +--------------------+ + * \- | mask | + * \ |--------------------| + * \- | ksc[0] -> sc 1+m+n | + * \- |--------------------| + * >| ksc[1] -> sc 2+m+n | + * |--------------------| + * | * | + * |--------------------| + * | ksc[o] -> sc o+m+n | + * +--------------------+ + * + */ + +/* Initial number of send contexts per VL */ +#define INIT_SC_PER_VL 2 + +/* + * struct pio_map_elem - mapping for a vl + * @mask - selector mask + * @ksc - array of kernel send contexts for this vl + * + * The mask is used to "mod" the selector to + * produce index into the trailing array of + * kscs + */ +struct pio_map_elem { + u32 mask; + struct send_context *ksc[0]; +}; + +/* + * struct pio_vl_map - mapping for a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - numbers of vls rounded to next power of 2 + * @map - array of pio_map_elem entries + * + * This is the parent mapping structure. The trailing members of the + * struct point to pio_map_elem entries, which in turn point to an + * array of kscs for that vl. + */ +struct pio_vl_map { + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct pio_map_elem *map[0]; +}; + +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, + u8 *vl_scontexts); +void free_pio_map(struct hfi1_devdata *dd); +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl); +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5); + /* send context functions */ int init_credit_return(struct hfi1_devdata *dd); void free_credit_return(struct hfi1_devdata *dd); @@ -183,7 +286,7 @@ void sc_flush(struct send_context *sc); void sc_drop(struct send_context *sc); void sc_stop(struct send_context *sc, int bit); struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, - pio_release_cb cb, void *arg); + pio_release_cb cb, void *arg); void sc_release_update(struct send_context *sc); void sc_return_credits(struct send_context *sc); void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); @@ -212,12 +315,11 @@ void pio_kernel_unfreeze(struct hfi1_devdata *dd); void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl); void pio_send_control(struct hfi1_devdata *dd, int op); - /* PIO copy routines */ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, const void *from, size_t count); void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, - const void *from, size_t nbytes); + const void *from, size_t nbytes); void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes); void seg_pio_copy_end(struct pio_buf *pbuf); diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c index ebb0bafc68cb..228e9fb76e08 100644 --- a/drivers/staging/rdma/hfi1/pio_copy.c +++ b/drivers/staging/rdma/hfi1/pio_copy.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -52,9 +49,9 @@ /* additive distance between non-SOP and SOP space */ #define SOP_DISTANCE (TXE_PIO_SIZE / 2) -#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1) +#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1) /* number of QUADWORDs in a block */ -#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64)) +#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64)) /** * pio_copy - copy data block to MMIO space @@ -83,11 +80,13 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, dest += sizeof(u64); /* calculate where the QWORD data ends - in SOP=1 space */ - dend = dest + ((count>>1) * sizeof(u64)); + dend = dest + ((count >> 1) * sizeof(u64)); if (dend < send) { - /* all QWORD data is within the SOP block, does *not* - reach the end of the SOP block */ + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ while (dest < dend) { writeq(*(u64 *)from, dest); @@ -152,8 +151,10 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, writeq(val.val64, dest); dest += sizeof(u64); } - /* fill in rest of block, no need to check pbuf->end - as we only wrap on a block boundary */ + /* + * fill in rest of block, no need to check pbuf->end + * as we only wrap on a block boundary + */ while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { writeq(0, dest); dest += sizeof(u64); @@ -177,7 +178,7 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, * "zero" shift - bit shift used to zero out upper bytes. Input is * the count of LSB bytes to preserve. */ -#define zshift(x) (8 * (8-(x))) +#define zshift(x) (8 * (8 - (x))) /* * "merge" shift - bit shift used to merge with carry bytes. Input is @@ -196,7 +197,7 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, * o nbytes must not span a QW boundary */ static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, - unsigned int nbytes) + unsigned int nbytes) { unsigned long off; @@ -223,7 +224,7 @@ static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, * o nbytes may span a QW boundary */ static inline void read_extra_bytes(struct pio_buf *pbuf, - const void *from, unsigned int nbytes) + const void *from, unsigned int nbytes) { unsigned long off = (unsigned long)from & 0x7; unsigned int room, xbytes; @@ -244,7 +245,7 @@ static inline void read_extra_bytes(struct pio_buf *pbuf, pbuf->carry.val64 |= (((*(u64 *)from) >> mshift(off)) << zshift(xbytes)) - >> zshift(xbytes+pbuf->carry_bytes); + >> zshift(xbytes + pbuf->carry_bytes); off = 0; pbuf->carry_bytes += xbytes; nbytes -= xbytes; @@ -362,7 +363,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n) * o from may _not_ be u64 aligned. */ static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, - unsigned int nbytes) + unsigned int nbytes) { jcopy(&pbuf->carry.val8[0], from, nbytes); pbuf->carry_bytes = nbytes; @@ -377,7 +378,7 @@ static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, * o nbytes may span a QW boundary */ static inline void read_extra_bytes(struct pio_buf *pbuf, - const void *from, unsigned int nbytes) + const void *from, unsigned int nbytes) { jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes); pbuf->carry_bytes += nbytes; @@ -411,7 +412,7 @@ static inline void merge_write8( jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder); writeq(pbuf->carry.val64, dest); - jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes); + jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes); } /* @@ -433,7 +434,7 @@ static inline int carry_write8(struct pio_buf *pbuf, void *dest) u64 zero = 0; jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero, - 8 - pbuf->carry_bytes); + 8 - pbuf->carry_bytes); writeq(pbuf->carry.val64, dest); return 1; } @@ -453,7 +454,7 @@ static inline int carry_write8(struct pio_buf *pbuf, void *dest) * @nbytes: bytes to copy */ void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, - const void *from, size_t nbytes) + const void *from, size_t nbytes) { void __iomem *dest = pbuf->start + SOP_DISTANCE; void __iomem *send = dest + PIO_BLOCK_SIZE; @@ -463,11 +464,13 @@ void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, dest += sizeof(u64); /* calculate where the QWORD data ends - in SOP=1 space */ - dend = dest + ((nbytes>>3) * sizeof(u64)); + dend = dest + ((nbytes >> 3) * sizeof(u64)); if (dend < send) { - /* all QWORD data is within the SOP block, does *not* - reach the end of the SOP block */ + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ while (dest < dend) { writeq(*(u64 *)from, dest); @@ -562,8 +565,10 @@ static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) void __iomem *send; /* SOP end */ void __iomem *xend; - /* calculate the end of data or end of block, whichever - comes first */ + /* + * calculate the end of data or end of block, whichever + * comes first + */ send = pbuf->start + PIO_BLOCK_SIZE; xend = send < dend ? send : dend; @@ -639,13 +644,13 @@ static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) * Must handle nbytes < 8. */ static void mid_copy_straight(struct pio_buf *pbuf, - const void *from, size_t nbytes) + const void *from, size_t nbytes) { void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); void __iomem *dend; /* 8-byte data end */ /* calculate 8-byte data end */ - dend = dest + ((nbytes>>3) * sizeof(u64)); + dend = dest + ((nbytes >> 3) * sizeof(u64)); if (pbuf->qw_written < PIO_BLOCK_QWS) { /* @@ -656,8 +661,10 @@ static void mid_copy_straight(struct pio_buf *pbuf, void __iomem *send; /* SOP end */ void __iomem *xend; - /* calculate the end of data or end of block, whichever - comes first */ + /* + * calculate the end of data or end of block, whichever + * comes first + */ send = pbuf->start + PIO_BLOCK_SIZE; xend = send < dend ? send : dend; @@ -713,7 +720,7 @@ static void mid_copy_straight(struct pio_buf *pbuf, /* we know carry_bytes was zero on entry to this routine */ read_low_bytes(pbuf, from, nbytes & 0x7); - pbuf->qw_written += nbytes>>3; + pbuf->qw_written += nbytes >> 3; } /* diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c new file mode 100644 index 000000000000..0a1d074583e4 --- /dev/null +++ b/drivers/staging/rdma/hfi1/platform.c @@ -0,0 +1,893 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "efivar.h" + +void get_platform_config(struct hfi1_devdata *dd) +{ + int ret = 0; + unsigned long size = 0; + u8 *temp_platform_config = NULL; + + ret = read_hfi1_efi_var(dd, "configuration", &size, + (void **)&temp_platform_config); + if (ret) { + dd_dev_info(dd, + "%s: Failed to get platform config from UEFI, falling back to request firmware\n", + __func__); + /* fall back to request firmware */ + platform_config_load = 1; + goto bail; + } + + dd->platform_config.data = temp_platform_config; + dd->platform_config.size = size; + +bail: + /* exit */; +} + +void free_platform_config(struct hfi1_devdata *dd) +{ + if (!platform_config_load) { + /* + * was loaded from EFI, release memory + * allocated by read_efi_var + */ + kfree(dd->platform_config.data); + } + /* + * else do nothing, dispose_firmware will release + * struct firmware platform_config on driver exit + */ +} + +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on) +{ + u8 tx_ctrl_byte = on ? 0x0 : 0xF; + int ret = 0; + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS, + &tx_ctrl_byte, 1); + /* we expected 1, so consider 0 an error */ + if (ret == 0) + ret = -EIO; + else if (ret == 1) + ret = 0; + return ret; +} + +static int qual_power(struct hfi1_pportdata *ppd) +{ + u32 cable_power_class = 0, power_class_max = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret = 0; + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4); + if (ret) + return ret; + + if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) + cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); + else + cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); + else if (cable_power_class > 4 && cable_power_class > (power_class_max)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); + /* + * cable_power_class will never have value 4 as this simply + * means the high power settings are unused + */ + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { + dd_dev_info( + ppd->dd, + "%s: Port disabled due to system power restrictions\n", + __func__); + ret = -EPERM; + } + return ret; +} + +static int qual_bitrate(struct hfi1_pportdata *ppd) +{ + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) && + cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) && + cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) { + dd_dev_info( + ppd->dd, + "%s: Cable failed bitrate check, disabling port\n", + __func__); + return -EPERM; + } + return 0; +} + +static int set_qsfp_high_power(struct hfi1_pportdata *ppd) +{ + u8 cable_power_class = 0, power_ctrl_byte = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret; + + if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) + cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); + else + cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class) { + power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; + + power_ctrl_byte |= 1; + power_ctrl_byte &= ~(0x2); + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + + if (cable_power_class > 3) { + /* > power class 4*/ + power_ctrl_byte |= (1 << 2); + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + } + + /* SFF 8679 rev 1.7 LPMode Deassert time */ + msleep(300); + } + return 0; +} + +static void apply_rx_cdr(struct hfi1_pportdata *ppd, + u32 rx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 rx_preset; + u8 *cache = ppd->qsfp_info.cache; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && + (cache[QSFP_CDR_INFO_OFFS] & 0x40))) + return; + + /* rx_preset preset to zero to catch error */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR, + &rx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + rx_preset = (rx_preset | (rx_preset << 1) | + (rx_preset << 2) | (rx_preset << 3)); + + if (rx_preset) { + *cdr_ctrl_byte |= rx_preset; + } else { + *cdr_ctrl_byte &= rx_preset; + /* Preserve current TX CDR status */ + *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0); + } +} + +static void apply_tx_cdr(struct hfi1_pportdata *ppd, + u32 tx_preset_index, + u8 *ctr_ctrl_byte) +{ + u32 tx_preset; + u8 *cache = ppd->qsfp_info.cache; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && + (cache[QSFP_CDR_INFO_OFFS] & 0x80))) + return; + + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4); + + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + tx_preset = (tx_preset | (tx_preset << 1) | + (tx_preset << 2) | (tx_preset << 3)); + + if (tx_preset) + *ctr_ctrl_byte |= (tx_preset << 4); + else + /* Preserve current/determined RX CDR status */ + *ctr_ctrl_byte &= ((tx_preset << 4) | 0xF); +} + +static void apply_cdr_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + + apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte); + + apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte); + + qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, + &cdr_ctrl_byte, 1); +} + +static void apply_tx_eq_auto(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8)) + return; + /* Disable adaptive TX EQ if present */ + tx_eq = cache[(128 * 3) + 241]; + tx_eq &= 0xF0; + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1); +} + +static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4)) + return; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + &tx_preset, 4); + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_EQ_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ, + &tx_preset, 4); + + if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX EQ %x unsupported\n", + __func__, tx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying EQ %x\n", + __func__, cache[608] & 0xF0); + + tx_preset = (cache[608] & 0xF0) >> 4; + } + + tx_eq = tx_preset | (tx_preset << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1); +} + +static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index) +{ + u32 rx_preset; + u8 rx_eq, *cache = ppd->qsfp_info.cache; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2)) + return; + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_EMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP, + &rx_preset, 4); + + if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) { + dd_dev_info( + ppd->dd, + "%s: Requested RX EMP %x\n", + __func__, rx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying supported EMP %x\n", + __func__, cache[608] & 0xF); + + rx_preset = cache[608] & 0xF; + } + + rx_eq = rx_preset | (rx_preset << 4); + + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1); +} + +static void apply_eq_settings(struct hfi1_pportdata *ppd, + u32 rx_preset_index, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + + apply_tx_eq_auto(ppd); + + apply_tx_eq_prog(ppd, tx_preset_index); + + apply_rx_eq_emp(ppd, rx_preset_index); +} + +static void apply_rx_amplitude_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u32 rx_preset; + u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP, + &rx_preset, 4); + + dd_dev_info(ppd->dd, + "%s: Requested RX AMP %x\n", + __func__, + rx_preset); + + for (i = 0; i < 4; i++) { + if (cache[(128 * 3) + 225] & (1 << i)) { + preferred = i; + if (preferred == rx_preset) + break; + } + } + + /* + * Verify that preferred RX amplitude is not just a + * fall through of the default + */ + if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) { + dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n"); + return; + } + + dd_dev_info(ppd->dd, + "%s: Applying RX AMP %x\n", __func__, preferred); + + rx_amp = preferred | (preferred << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1); +} + +#define OPA_INVALID_INDEX 0xFFF + +static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id, + u32 config_data, const char *message) +{ + u8 i; + int ret = HCMD_SUCCESS; + + for (i = 0; i < 4; i++) { + ret = load_8051_config(ppd->dd, field_id, i, config_data); + if (ret != HCMD_SUCCESS) { + dd_dev_err( + ppd->dd, + "%s: %s for lane %u failed\n", + message, __func__, i); + } + } +} + +static void apply_tunings( + struct hfi1_pportdata *ppd, u32 tx_preset_index, + u8 tuning_method, u32 total_atten, u8 limiting_active) +{ + int ret = 0; + u32 config_data = 0, tx_preset = 0; + u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; + u8 *cache = ppd->qsfp_info.cache; + + /* Enable external device config if channel is limiting active */ + read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, + GENERAL_CONFIG, &config_data); + config_data |= limiting_active; + ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err( + ppd->dd, + "%s: Failed to set enable external device config\n", + __func__); + + config_data = 0; /* re-init */ + /* Pass tuning method to 8051 */ + read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + &config_data); + config_data |= tuning_method; + ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n", + __func__); + + /* Set same channel loss for both TX and RX */ + config_data = 0 | (total_atten << 16) | (total_atten << 24); + apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data, + "Setting channel loss"); + + /* Inform 8051 of cable capabilities */ + if (ppd->qsfp_info.cache_valid) { + external_device_config = + ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) | + ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) | + ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) | + (cache[QSFP_EQ_INFO_OFFS] & 0x4); + ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, &config_data); + /* Clear, then set the external device config field */ + config_data &= ~(0xFF << 24); + config_data |= (external_device_config << 24); + ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_info(ppd->dd, + "%s: Failed set ext device config params\n", + __func__); + } + + if (tx_preset_index == OPA_INVALID_INDEX) { + if (ppd->port_type == PORT_TYPE_QSFP && limiting_active) + dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n", + __func__); + return; + } + + /* Following for limiting active channels only */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_PRECUR, &tx_preset, 4); + precur = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4); + attn = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4); + postcur = tx_preset; + + config_data = precur | (attn << 8) | (postcur << 16); + + apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data, + "Applying TX settings"); +} + +static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, + u32 *ptr_rx_preset, u32 *ptr_total_atten) +{ + int ret; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + return ret; + } + + ppd->qsfp_info.limiting_active = 1; + + ret = set_qsfp_tx(ppd, 0); + if (ret) + goto bail_unlock; + + ret = qual_power(ppd); + if (ret) + goto bail_unlock; + + ret = qual_bitrate(ppd); + if (ret) + goto bail_unlock; + + if (ppd->qsfp_info.reset_needed) { + reset_qsfp(ppd); + ppd->qsfp_info.reset_needed = 0; + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + } else { + ppd->qsfp_info.reset_needed = 1; + } + + ret = set_qsfp_high_power(ppd); + if (ret) + goto bail_unlock; + + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + goto bail_unlock; + } + } else { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + goto bail_unlock; + } + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); + if (ret) { + *ptr_rx_preset = OPA_INVALID_INDEX; + goto bail_unlock; + } + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4); + else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4); + + apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + ret = set_qsfp_tx(ppd, 1); + +bail_unlock: + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + return ret; +} + +static int tune_qsfp(struct hfi1_pportdata *ppd, + u32 *ptr_tx_preset, u32 *ptr_rx_preset, + u8 *ptr_tuning_method, u32 *ptr_total_atten) +{ + u32 cable_atten = 0, remote_atten = 0, platform_atten = 0; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + int ret = 0; + u8 *cache = ppd->qsfp_info.cache; + + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0xA ... 0xB: + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + if (ret) + return ret; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS]; + else if ((lss & OPA_LINK_SPEED_12_5G) && + (lse & OPA_LINK_SPEED_12_5G)) + cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS]; + + /* Fallback to configured attenuation if cable memory is bad */ + if (cable_atten == 0 || cable_atten > 36) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + &cable_atten, 4); + if (ret) + return ret; + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + if (ret) + return ret; + + *ptr_total_atten = platform_atten + cable_atten + remote_atten; + + *ptr_tuning_method = OPA_PASSIVE_TUNING; + break; + case 0x0 ... 0x9: /* fallthrough */ + case 0xC: /* fallthrough */ + case 0xE: + ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset, + ptr_total_atten); + if (ret) + return ret; + + *ptr_tuning_method = OPA_ACTIVE_TUNING; + break; + case 0xD: /* fallthrough */ + case 0xF: + default: + dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n", + __func__); + break; + } + return ret; +} + +/* + * This function communicates its success or failure via ppd->driver_link_ready + * Thus, it depends on its association with start_link(...) which checks + * driver_link_ready before proceeding with the link negotiation and + * initialization process. + */ +void tune_serdes(struct hfi1_pportdata *ppd) +{ + int ret = 0; + u32 total_atten = 0; + u32 remote_atten = 0, platform_atten = 0; + u32 rx_preset_index, tx_preset_index; + u8 tuning_method = 0, limiting_active = 0; + struct hfi1_devdata *dd = ppd->dd; + + rx_preset_index = OPA_INVALID_INDEX; + tx_preset_index = OPA_INVALID_INDEX; + + /* the link defaults to enabled */ + ppd->link_enabled = 1; + /* the driver link ready state defaults to not ready */ + ppd->driver_link_ready = 0; + ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* Skip the tuning for testing (loopback != none) and simulations */ + if (loopback != LOOPBACK_NONE || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + ppd->driver_link_ready = 1; + return; + } + + ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_PORT_TYPE, &ppd->port_type, + 4); + if (ret) + ppd->port_type = PORT_TYPE_UNKNOWN; + + switch (ppd->port_type) { + case PORT_TYPE_DISCONNECTED: + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED); + dd_dev_info(dd, "%s: Port disconnected, disabling port\n", + __func__); + goto bail; + case PORT_TYPE_FIXED: + /* platform_atten, remote_atten pre-zeroed to catch error */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + break; + case PORT_TYPE_VARIABLE: + if (qsfp_mod_present(ppd)) { + /* + * platform_atten, remote_atten pre-zeroed to + * catch error + */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, + &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + } else + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + break; + case PORT_TYPE_QSFP: + if (qsfp_mod_present(ppd)) { + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + + if (ppd->qsfp_info.cache_valid) { + ret = tune_qsfp(ppd, + &tx_preset_index, + &rx_preset_index, + &tuning_method, + &total_atten); + + /* + * We may have modified the QSFP memory, so + * update the cache to reflect the changes + */ + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + if (ret) + goto bail; + + limiting_active = + ppd->qsfp_info.limiting_active; + } else { + dd_dev_err(dd, + "%s: Reading QSFP memory failed\n", + __func__); + goto bail; + } + } else + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + break; + default: + dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__); + ppd->port_type = PORT_TYPE_UNKNOWN; + tuning_method = OPA_UNKNOWN_TUNING; + total_atten = 0; + limiting_active = 0; + tx_preset_index = OPA_INVALID_INDEX; + break; + } + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + apply_tunings(ppd, tx_preset_index, tuning_method, + total_atten, limiting_active); + + if (!ret) + ppd->driver_link_ready = 1; + + return; +bail: + ppd->driver_link_ready = 0; +} diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform.h index 8a94a8342052..19620cf546d5 100644 --- a/drivers/staging/rdma/hfi1/platform_config.h +++ b/drivers/staging/rdma/hfi1/platform.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -47,8 +44,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ -#ifndef __PLATFORM_CONFIG_H -#define __PLATFORM_CONFIG_H +#ifndef __PLATFORM_H +#define __PLATFORM_H #define METADATA_TABLE_FIELD_START_SHIFT 0 #define METADATA_TABLE_FIELD_START_LEN_BITS 15 @@ -94,17 +91,18 @@ enum platform_config_system_table_fields { enum platform_config_port_table_fields { PORT_TABLE_RESERVED, PORT_TABLE_PORT_TYPE, - PORT_TABLE_ATTENUATION_12G, - PORT_TABLE_ATTENUATION_25G, + PORT_TABLE_LOCAL_ATTEN_12G, + PORT_TABLE_LOCAL_ATTEN_25G, PORT_TABLE_LINK_SPEED_SUPPORTED, PORT_TABLE_LINK_WIDTH_SUPPORTED, + PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, + PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, PORT_TABLE_VL_CAP, PORT_TABLE_MTU_CAP, PORT_TABLE_TX_LANE_ENABLE_MASK, PORT_TABLE_LOCAL_MAX_TIMEOUT, - PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, - PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, - PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU, + PORT_TABLE_REMOTE_ATTEN_12G, + PORT_TABLE_REMOTE_ATTEN_25G, PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, PORT_TABLE_RX_PRESET_IDX, @@ -115,10 +113,10 @@ enum platform_config_port_table_fields { enum platform_config_rx_preset_table_fields { RX_PRESET_TABLE_RESERVED, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, - RX_PRESET_TABLE_QSFP_RX_EQ_APPLY, + RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, RX_PRESET_TABLE_QSFP_RX_CDR, - RX_PRESET_TABLE_QSFP_RX_EQ, + RX_PRESET_TABLE_QSFP_RX_EMP, RX_PRESET_TABLE_QSFP_RX_AMP, RX_PRESET_TABLE_MAX }; @@ -149,6 +147,11 @@ enum platform_config_variable_settings_table_fields { VARIABLE_SETTINGS_TABLE_MAX }; +struct platform_config { + size_t size; + const u8 *data; +}; + struct platform_config_data { u32 *table; u32 *table_metadata; @@ -179,9 +182,11 @@ static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = { * fields defined for each table above */ -/*===================================================== +/* + * ===================================================== * System table encodings - *====================================================*/ + * ===================================================== + */ #define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041 #define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4 @@ -199,12 +204,13 @@ enum platform_config_qsfp_power_class_encoding { QSFP_POWER_CLASS_7 }; - -/*===================================================== +/* + * ==================================================== * Port table encodings - *==================================================== */ + * ==================================================== + */ enum platform_config_port_type_encoding { - PORT_TYPE_RESERVED, + PORT_TYPE_UNKNOWN, PORT_TYPE_DISCONNECTED, PORT_TYPE_FIXED, PORT_TYPE_VARIABLE, @@ -283,4 +289,16 @@ enum platform_config_local_max_timeout_encoding { LOCAL_MAX_TIMEOUT_1000_S }; -#endif /*__PLATFORM_CONFIG_H*/ +enum link_tuning_encoding { + OPA_PASSIVE_TUNING, + OPA_ACTIVE_TUNING, + OPA_UNKNOWN_TUNING +}; + +/* platform.c */ +void get_platform_config(struct hfi1_devdata *dd); +void free_platform_config(struct hfi1_devdata *dd); +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on); +void tune_serdes(struct hfi1_pportdata *ppd); + +#endif /*__PLATFORM_H*/ diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c index ce036810d576..29a5ad28019b 100644 --- a/drivers/staging/rdma/hfi1/qp.c +++ b/drivers/staging/rdma/hfi1/qp.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -54,31 +51,32 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/seq_file.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> #include "hfi.h" #include "qp.h" #include "trace.h" -#include "sdma.h" +#include "verbs_txreq.h" -#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - -static unsigned int hfi1_qp_table_size = 256; +unsigned int hfi1_qp_table_size = 256; module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO); MODULE_PARM_DESC(qp_table_size, "QP table size"); -static void flush_tx_list(struct hfi1_qp *qp); +static void flush_tx_list(struct rvt_qp *qp); static int iowait_sleep( struct sdma_engine *sde, struct iowait *wait, struct sdma_txreq *stx, unsigned seq); static void iowait_wakeup(struct iowait *wait, int reason); +static void iowait_sdma_drained(struct iowait *wait); +static void qp_pio_drain(struct rvt_qp *qp); -static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt, - struct qpn_map *map, unsigned off) +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) { - return (map - qpt->map) * BITS_PER_PAGE + off; + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; } /* @@ -118,437 +116,15 @@ static const u16 credit_table[31] = { 32768 /* 1E */ }; -static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map) -{ - unsigned long page = get_zeroed_page(GFP_KERNEL); - - /* - * Free the page if someone raced with us installing it. - */ - - spin_lock(&qpt->lock); - if (map->page) - free_page(page); - else - map->page = (void *)page; - spin_unlock(&qpt->lock); -} - -/* - * Allocate the next available QPN or - * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. - */ -static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt, - enum ib_qp_type type, u8 port) -{ - u32 i, offset, max_scan, qpn; - struct qpn_map *map; - u32 ret; - - if (type == IB_QPT_SMI || type == IB_QPT_GSI) { - unsigned n; - - ret = type == IB_QPT_GSI; - n = 1 << (ret + 2 * (port - 1)); - spin_lock(&qpt->lock); - if (qpt->flags & n) - ret = -EINVAL; - else - qpt->flags |= n; - spin_unlock(&qpt->lock); - goto bail; - } - - qpn = qpt->last + qpt->incr; - if (qpn >= QPN_MAX) - qpn = qpt->incr | ((qpt->last & 1) ^ 1); - /* offset carries bit 0 */ - offset = qpn & BITS_PER_PAGE_MASK; - map = &qpt->map[qpn / BITS_PER_PAGE]; - max_scan = qpt->nmaps - !offset; - for (i = 0;;) { - if (unlikely(!map->page)) { - get_map_page(qpt, map); - if (unlikely(!map->page)) - break; - } - do { - if (!test_and_set_bit(offset, map->page)) { - qpt->last = qpn; - ret = qpn; - goto bail; - } - offset += qpt->incr; - /* - * This qpn might be bogus if offset >= BITS_PER_PAGE. - * That is OK. It gets re-assigned below - */ - qpn = mk_qpn(qpt, map, offset); - } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); - /* - * In order to keep the number of pages allocated to a - * minimum, we scan the all existing pages before increasing - * the size of the bitmap table. - */ - if (++i > max_scan) { - if (qpt->nmaps == QPNMAP_ENTRIES) - break; - map = &qpt->map[qpt->nmaps++]; - /* start at incr with current bit 0 */ - offset = qpt->incr | (offset & 1); - } else if (map < &qpt->map[qpt->nmaps]) { - ++map; - /* start at incr with current bit 0 */ - offset = qpt->incr | (offset & 1); - } else { - map = &qpt->map[0]; - /* wrap to first map page, invert bit 0 */ - offset = qpt->incr | ((offset & 1) ^ 1); - } - /* there can be no bits at shift and below */ - WARN_ON(offset & (dd->qos_shift - 1)); - qpn = mk_qpn(qpt, map, offset); - } - - ret = -ENOMEM; - -bail: - return ret; -} - -static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn) -{ - struct qpn_map *map; - - map = qpt->map + qpn / BITS_PER_PAGE; - if (map->page) - clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); -} - -/* - * Put the QP into the hash table. - * The hash table holds a reference to the QP. - */ -static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp) -{ - struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - unsigned long flags; - - atomic_inc(&qp->refcount); - spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); - - if (qp->ibqp.qp_num <= 1) { - rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp); - } else { - u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num); - - qp->next = dev->qp_dev->qp_table[n]; - rcu_assign_pointer(dev->qp_dev->qp_table[n], qp); - trace_hfi1_qpinsert(qp, n); - } - - spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); -} - -/* - * Remove the QP from the table so it can't be found asynchronously by - * the receive interrupt routine. - */ -static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp) -{ - struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num); - unsigned long flags; - int removed = 1; - - spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); - - if (rcu_dereference_protected(ibp->qp[0], - lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) { - RCU_INIT_POINTER(ibp->qp[0], NULL); - } else if (rcu_dereference_protected(ibp->qp[1], - lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) { - RCU_INIT_POINTER(ibp->qp[1], NULL); - } else { - struct hfi1_qp *q; - struct hfi1_qp __rcu **qpp; - - removed = 0; - qpp = &dev->qp_dev->qp_table[n]; - for (; (q = rcu_dereference_protected(*qpp, - lockdep_is_held(&dev->qp_dev->qpt_lock))) - != NULL; - qpp = &q->next) - if (q == qp) { - RCU_INIT_POINTER(*qpp, - rcu_dereference_protected(qp->next, - lockdep_is_held(&dev->qp_dev->qpt_lock))); - removed = 1; - trace_hfi1_qpremove(qp, n); - break; - } - } - - spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); - if (removed) { - synchronize_rcu(); - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); - } -} - -/** - * free_all_qps - check for QPs still in use - * @qpt: the QP table to empty - * - * There should not be any QPs still in use. - * Free memory for table. - */ -static unsigned free_all_qps(struct hfi1_devdata *dd) -{ - struct hfi1_ibdev *dev = &dd->verbs_dev; - unsigned long flags; - struct hfi1_qp *qp; - unsigned n, qp_inuse = 0; - - for (n = 0; n < dd->num_pports; n++) { - struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; - - if (!hfi1_mcast_tree_empty(ibp)) - qp_inuse++; - rcu_read_lock(); - if (rcu_dereference(ibp->qp[0])) - qp_inuse++; - if (rcu_dereference(ibp->qp[1])) - qp_inuse++; - rcu_read_unlock(); - } - - if (!dev->qp_dev) - goto bail; - spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); - for (n = 0; n < dev->qp_dev->qp_table_size; n++) { - qp = rcu_dereference_protected(dev->qp_dev->qp_table[n], - lockdep_is_held(&dev->qp_dev->qpt_lock)); - RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL); - - for (; qp; qp = rcu_dereference_protected(qp->next, - lockdep_is_held(&dev->qp_dev->qpt_lock))) - qp_inuse++; - } - spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); - synchronize_rcu(); -bail: - return qp_inuse; -} - -/** - * reset_qp - initialize the QP state to the reset state - * @qp: the QP to reset - * @type: the QP type - */ -static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type) -{ - qp->remote_qpn = 0; - qp->qkey = 0; - qp->qp_access_flags = 0; - iowait_init( - &qp->s_iowait, - 1, - hfi1_do_send, - iowait_sleep, - iowait_wakeup); - qp->s_flags &= HFI1_S_SIGNAL_REQ_WR; - qp->s_hdrwords = 0; - qp->s_wqe = NULL; - qp->s_draining = 0; - qp->s_next_psn = 0; - qp->s_last_psn = 0; - qp->s_sending_psn = 0; - qp->s_sending_hpsn = 0; - qp->s_psn = 0; - qp->r_psn = 0; - qp->r_msn = 0; - if (type == IB_QPT_RC) { - qp->s_state = IB_OPCODE_RC_SEND_LAST; - qp->r_state = IB_OPCODE_RC_SEND_LAST; - } else { - qp->s_state = IB_OPCODE_UC_SEND_LAST; - qp->r_state = IB_OPCODE_UC_SEND_LAST; - } - qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - qp->r_nak_state = 0; - qp->r_adefered = 0; - qp->r_aflags = 0; - qp->r_flags = 0; - qp->s_head = 0; - qp->s_tail = 0; - qp->s_cur = 0; - qp->s_acked = 0; - qp->s_last = 0; - qp->s_ssn = 1; - qp->s_lsn = 0; - clear_ahg(qp); - qp->s_mig_state = IB_MIG_MIGRATED; - memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); - qp->r_head_ack_queue = 0; - qp->s_tail_ack_queue = 0; - qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } - qp->r_sge.num_sge = 0; -} - -static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends) -{ - unsigned n; - - if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) - hfi1_put_ss(&qp->s_rdma_read_sge); - - hfi1_put_ss(&qp->r_sge); - - if (clr_sends) { - while (qp->s_last != qp->s_head) { - struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - unsigned i; - - for (i = 0; i < wqe->wr.num_sge; i++) { - struct hfi1_sge *sge = &wqe->sg_list[i]; - - hfi1_put_mr(sge->mr); - } - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&to_iah(wqe->ud_wr.ah)->refcount); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - } - if (qp->s_rdma_mr) { - hfi1_put_mr(qp->s_rdma_mr); - qp->s_rdma_mr = NULL; - } - } - - if (qp->ibqp.qp_type != IB_QPT_RC) - return; - - for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { - struct hfi1_ack_entry *e = &qp->s_ack_queue[n]; - - if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && - e->rdma_sge.mr) { - hfi1_put_mr(e->rdma_sge.mr); - e->rdma_sge.mr = NULL; - } - } -} - -/** - * hfi1_error_qp - put a QP into the error state - * @qp: the QP to put into the error state - * @err: the receive completion error to signal if a RWQE is active - * - * Flushes both send and receive work queues. - * Returns true if last WQE event should be generated. - * The QP r_lock and s_lock should be held and interrupts disabled. - * If we are already in error state, just return. - */ -int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err) +static void flush_tx_list(struct rvt_qp *qp) { - struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); - struct ib_wc wc; - int ret = 0; - - if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) - goto bail; - - qp->state = IB_QPS_ERR; - - if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { - qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); - del_timer(&qp->s_timer); - } - - if (qp->s_flags & HFI1_S_ANY_WAIT_SEND) - qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND; - - write_seqlock(&dev->iowait_lock); - if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) { - qp->s_flags &= ~HFI1_S_ANY_WAIT_IO; - list_del_init(&qp->s_iowait.list); - if (atomic_dec_and_test(&qp->refcount)) - wake_up(&qp->wait); - } - write_sequnlock(&dev->iowait_lock); - - if (!(qp->s_flags & HFI1_S_BUSY)) { - qp->s_hdrwords = 0; - if (qp->s_rdma_mr) { - hfi1_put_mr(qp->s_rdma_mr); - qp->s_rdma_mr = NULL; - } - flush_tx_list(qp); - } - - /* Schedule the sending tasklet to drain the send work queue. */ - if (qp->s_last != qp->s_head) - hfi1_schedule_send(qp); - - clear_mr_refs(qp, 0); - - memset(&wc, 0, sizeof(wc)); - wc.qp = &qp->ibqp; - wc.opcode = IB_WC_RECV; - - if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) { - wc.wr_id = qp->r_wr_id; - wc.status = err; - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - } - wc.status = IB_WC_WR_FLUSH_ERR; + struct hfi1_qp_priv *priv = qp->priv; - if (qp->r_rq.wq) { - struct hfi1_rwq *wq; - u32 head; - u32 tail; - - spin_lock(&qp->r_rq.lock); - - /* sanity check pointers before trusting them */ - wq = qp->r_rq.wq; - head = wq->head; - if (head >= qp->r_rq.size) - head = 0; - tail = wq->tail; - if (tail >= qp->r_rq.size) - tail = 0; - while (tail != head) { - wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; - if (++tail >= qp->r_rq.size) - tail = 0; - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - } - wq->tail = tail; - - spin_unlock(&qp->r_rq.lock); - } else if (qp->ibqp.event_handler) - ret = 1; - -bail: - return ret; -} - -static void flush_tx_list(struct hfi1_qp *qp) -{ - while (!list_empty(&qp->s_iowait.tx_head)) { + while (!list_empty(&priv->s_iowait.tx_head)) { struct sdma_txreq *tx; tx = list_first_entry( - &qp->s_iowait.tx_head, + &priv->s_iowait.tx_head, struct sdma_txreq, list); list_del_init(&tx->list); @@ -557,14 +133,15 @@ static void flush_tx_list(struct hfi1_qp *qp) } } -static void flush_iowait(struct hfi1_qp *qp) +static void flush_iowait(struct rvt_qp *qp) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); unsigned long flags; write_seqlock_irqsave(&dev->iowait_lock, flags); - if (!list_empty(&qp->s_iowait.list)) { - list_del_init(&qp->s_iowait.list); + if (!list_empty(&priv->s_iowait.list)) { + list_del_init(&priv->s_iowait.list); if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } @@ -597,362 +174,106 @@ static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) return ib_mtu_enum_to_int(mtu); } - -/** - * hfi1_modify_qp - modify the attributes of a queue pair - * @ibqp: the queue pair who's attributes we're modifying - * @attr: the new attributes - * @attr_mask: the mask of attributes to modify - * @udata: user data for libibverbs.so - * - * Returns 0 on success, otherwise returns an errno. - */ -int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { + struct ib_qp *ibqp = &qp->ibqp; struct hfi1_ibdev *dev = to_idev(ibqp->device); - struct hfi1_qp *qp = to_iqp(ibqp); - enum ib_qp_state cur_state, new_state; - struct ib_event ev; - int lastwqe = 0; - int mig = 0; - int ret; - u32 pmtu = 0; /* for gcc warning only */ struct hfi1_devdata *dd = dd_from_dev(dev); - - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - - cur_state = attr_mask & IB_QP_CUR_STATE ? - attr->cur_qp_state : qp->state; - new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_UNSPECIFIED)) - goto inval; + u8 sc; if (attr_mask & IB_QP_AV) { - u8 sc; - - if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE) - goto inval; - if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr)) - goto inval; sc = ah_to_sc(ibqp->device, &attr->ah_attr); + if (sc == 0xf) + return -EINVAL; + if (!qp_to_sdma_engine(qp, sc) && dd->flags & HFI1_HAS_SEND_DMA) - goto inval; + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; } if (attr_mask & IB_QP_ALT_PATH) { - u8 sc; - - if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE) - goto inval; - if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) - goto inval; - if (attr->alt_pkey_index >= hfi1_get_npkeys(dd)) - goto inval; sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr); + if (sc == 0xf) + return -EINVAL; + if (!qp_to_sdma_engine(qp, sc) && dd->flags & HFI1_HAS_SEND_DMA) - goto inval; - } - - if (attr_mask & IB_QP_PKEY_INDEX) - if (attr->pkey_index >= hfi1_get_npkeys(dd)) - goto inval; - - if (attr_mask & IB_QP_MIN_RNR_TIMER) - if (attr->min_rnr_timer > 31) - goto inval; + return -EINVAL; - if (attr_mask & IB_QP_PORT) - if (qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI || - attr->port_num == 0 || - attr->port_num > ibqp->device->phys_port_cnt) - goto inval; - - if (attr_mask & IB_QP_DEST_QPN) - if (attr->dest_qp_num > HFI1_QPN_MASK) - goto inval; + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } - if (attr_mask & IB_QP_RETRY_CNT) - if (attr->retry_cnt > 7) - goto inval; + return 0; +} - if (attr_mask & IB_QP_RNR_RETRY) - if (attr->rnr_retry > 7) - goto inval; +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; - /* - * Don't allow invalid path_mtu values. OK to set greater - * than the active mtu (or even the max_cap, if we have tuned - * that to a small mtu. We'll set qp->path_mtu - * to the lesser of requested attribute mtu and active, - * for packetizing messages. - * Note that the QP port has to be set in INIT and MTU in RTR. - */ - if (attr_mask & IB_QP_PATH_MTU) { - int mtu, pidx = qp->port_num - 1; - - dd = dd_from_dev(dev); - mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu); - if (mtu == -1) - goto inval; - - if (mtu > dd->pport[pidx].ibmtu) - pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); - else - pmtu = attr->path_mtu; + if (attr_mask & IB_QP_AV) { + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); } - if (attr_mask & IB_QP_PATH_MIG_STATE) { - if (attr->path_mig_state == IB_MIG_REARM) { - if (qp->s_mig_state == IB_MIG_ARMED) - goto inval; - if (new_state != IB_QPS_RTS) - goto inval; - } else if (attr->path_mig_state == IB_MIG_MIGRATED) { - if (qp->s_mig_state == IB_MIG_REARM) - goto inval; - if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) - goto inval; - if (qp->s_mig_state == IB_MIG_ARMED) - mig = 1; - } else - goto inval; + if (attr_mask & IB_QP_PATH_MIG_STATE && + attr->path_mig_state == IB_MIG_MIGRATED && + qp->s_mig_state == IB_MIG_ARMED) { + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); } +} - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC) - goto inval; - - switch (new_state) { - case IB_QPS_RESET: - if (qp->state != IB_QPS_RESET) { - qp->state = IB_QPS_RESET; - flush_iowait(qp); - qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT); - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - /* Stop the sending work queue and retry timer */ - cancel_work_sync(&qp->s_iowait.iowork); - del_timer_sync(&qp->s_timer); - iowait_sdma_drain(&qp->s_iowait); - flush_tx_list(qp); - remove_qp(dev, qp); - wait_event(qp->wait, !atomic_read(&qp->refcount)); - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - clear_mr_refs(qp, 1); - clear_ahg(qp); - reset_qp(qp, ibqp->qp_type); - } - break; - - case IB_QPS_RTR: - /* Allow event to re-trigger if QP set to RTR more than once */ - qp->r_flags &= ~HFI1_R_COMM_EST; - qp->state = new_state; - break; - - case IB_QPS_SQD: - qp->s_draining = qp->s_last != qp->s_cur; - qp->state = new_state; - break; +/** + * hfi1_check_send_wqe - validate wqe + * @qp - The qp + * @wqe - The built wqe + * + * validate wqe. This is called + * prior to inserting the wqe into + * the ring but after the wqe has been + * setup. + * + * Returns 0 on success, -EINVAL on failure + * + */ +int hfi1_check_send_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ah *ah; - case IB_QPS_SQE: - if (qp->ibqp.qp_type == IB_QPT_RC) - goto inval; - qp->state = new_state; + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; break; - - case IB_QPS_ERR: - lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + case IB_QPT_SMI: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; break; - + case IB_QPT_GSI: + case IB_QPT_UD: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + if (ibp->sl_to_sc[ah->attr.sl] == 0xf) + return -EINVAL; default: - qp->state = new_state; break; } - - if (attr_mask & IB_QP_PKEY_INDEX) - qp->s_pkey_index = attr->pkey_index; - - if (attr_mask & IB_QP_PORT) - qp->port_num = attr->port_num; - - if (attr_mask & IB_QP_DEST_QPN) - qp->remote_qpn = attr->dest_qp_num; - - if (attr_mask & IB_QP_SQ_PSN) { - qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK; - qp->s_psn = qp->s_next_psn; - qp->s_sending_psn = qp->s_next_psn; - qp->s_last_psn = qp->s_next_psn - 1; - qp->s_sending_hpsn = qp->s_last_psn; - } - - if (attr_mask & IB_QP_RQ_PSN) - qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK; - - if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->qp_access_flags = attr->qp_access_flags; - - if (attr_mask & IB_QP_AV) { - qp->remote_ah_attr = attr->ah_attr; - qp->s_srate = attr->ah_attr.static_rate; - qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); - qp->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); - qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc); - } - - if (attr_mask & IB_QP_ALT_PATH) { - qp->alt_ah_attr = attr->alt_ah_attr; - qp->s_alt_pkey_index = attr->alt_pkey_index; - } - - if (attr_mask & IB_QP_PATH_MIG_STATE) { - qp->s_mig_state = attr->path_mig_state; - if (mig) { - qp->remote_ah_attr = qp->alt_ah_attr; - qp->port_num = qp->alt_ah_attr.port_num; - qp->s_pkey_index = qp->s_alt_pkey_index; - qp->s_flags |= HFI1_S_AHG_CLEAR; - qp->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); - qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc); - } - } - - if (attr_mask & IB_QP_PATH_MTU) { - struct hfi1_ibport *ibp; - u8 sc, vl; - u32 mtu; - - dd = dd_from_dev(dev); - ibp = &dd->pport[qp->port_num - 1].ibport_data; - - sc = ibp->sl_to_sc[qp->remote_ah_attr.sl]; - vl = sc_to_vlt(dd, sc); - - mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu); - if (vl < PER_VL_SEND_CONTEXTS) - mtu = min_t(u32, mtu, dd->vld[vl].mtu); - pmtu = mtu_to_enum(mtu, OPA_MTU_8192); - - qp->path_mtu = pmtu; - qp->pmtu = mtu; - } - - if (attr_mask & IB_QP_RETRY_CNT) { - qp->s_retry_cnt = attr->retry_cnt; - qp->s_retry = attr->retry_cnt; - } - - if (attr_mask & IB_QP_RNR_RETRY) { - qp->s_rnr_retry_cnt = attr->rnr_retry; - qp->s_rnr_retry = attr->rnr_retry; - } - - if (attr_mask & IB_QP_MIN_RNR_TIMER) - qp->r_min_rnr_timer = attr->min_rnr_timer; - - if (attr_mask & IB_QP_TIMEOUT) { - qp->timeout = attr->timeout; - qp->timeout_jiffies = - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL); - } - - if (attr_mask & IB_QP_QKEY) - qp->qkey = attr->qkey; - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->r_max_rd_atomic = attr->max_dest_rd_atomic; - - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) - qp->s_max_rd_atomic = attr->max_rd_atomic; - - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - insert_qp(dev, qp); - - if (lastwqe) { - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); - } - if (mig) { - ev.device = qp->ibqp.device; - ev.element.qp = &qp->ibqp; - ev.event = IB_EVENT_PATH_MIG; - qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); - } - ret = 0; - goto bail; - -inval: - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - ret = -EINVAL; - -bail: - return ret; -} - -int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_qp_init_attr *init_attr) -{ - struct hfi1_qp *qp = to_iqp(ibqp); - - attr->qp_state = qp->state; - attr->cur_qp_state = attr->qp_state; - attr->path_mtu = qp->path_mtu; - attr->path_mig_state = qp->s_mig_state; - attr->qkey = qp->qkey; - attr->rq_psn = mask_psn(qp->r_psn); - attr->sq_psn = mask_psn(qp->s_next_psn); - attr->dest_qp_num = qp->remote_qpn; - attr->qp_access_flags = qp->qp_access_flags; - attr->cap.max_send_wr = qp->s_size - 1; - attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; - attr->cap.max_send_sge = qp->s_max_sge; - attr->cap.max_recv_sge = qp->r_rq.max_sge; - attr->cap.max_inline_data = 0; - attr->ah_attr = qp->remote_ah_attr; - attr->alt_ah_attr = qp->alt_ah_attr; - attr->pkey_index = qp->s_pkey_index; - attr->alt_pkey_index = qp->s_alt_pkey_index; - attr->en_sqd_async_notify = 0; - attr->sq_draining = qp->s_draining; - attr->max_rd_atomic = qp->s_max_rd_atomic; - attr->max_dest_rd_atomic = qp->r_max_rd_atomic; - attr->min_rnr_timer = qp->r_min_rnr_timer; - attr->port_num = qp->port_num; - attr->timeout = qp->timeout; - attr->retry_cnt = qp->s_retry_cnt; - attr->rnr_retry = qp->s_rnr_retry_cnt; - attr->alt_port_num = qp->alt_ah_attr.port_num; - attr->alt_timeout = qp->alt_timeout; - - init_attr->event_handler = qp->ibqp.event_handler; - init_attr->qp_context = qp->ibqp.qp_context; - init_attr->send_cq = qp->ibqp.send_cq; - init_attr->recv_cq = qp->ibqp.recv_cq; - init_attr->srq = qp->ibqp.srq; - init_attr->cap = attr->cap; - if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR) - init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; - else - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; - init_attr->qp_type = qp->ibqp.qp_type; - init_attr->port_num = qp->port_num; - return 0; + return wqe->length <= piothreshold; } /** @@ -961,7 +282,7 @@ int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * * Returns the AETH. */ -__be32 hfi1_compute_aeth(struct hfi1_qp *qp) +__be32 hfi1_compute_aeth(struct rvt_qp *qp) { u32 aeth = qp->r_msn & HFI1_MSN_MASK; @@ -974,7 +295,7 @@ __be32 hfi1_compute_aeth(struct hfi1_qp *qp) } else { u32 min, max, x; u32 credits; - struct hfi1_rwq *wq = qp->r_rq.wq; + struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; @@ -1004,12 +325,13 @@ __be32 hfi1_compute_aeth(struct hfi1_qp *qp) x = (min + max) / 2; if (credit_table[x] == credits) break; - if (credit_table[x] > credits) + if (credit_table[x] > credits) { max = x; - else if (min == x) - break; - else + } else { + if (min == x) + break; min = x; + } } aeth |= x << HFI1_AETH_CREDIT_SHIFT; } @@ -1017,348 +339,58 @@ __be32 hfi1_compute_aeth(struct hfi1_qp *qp) } /** - * hfi1_create_qp - create a queue pair for a device - * @ibpd: the protection domain who's device we create the queue pair for - * @init_attr: the attributes of the queue pair - * @udata: user data for libibverbs.so - * - * Returns the queue pair on success, otherwise returns an errno. - * - * Called by the ib_create_qp() core verbs function. - */ -struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) -{ - struct hfi1_qp *qp; - int err; - struct hfi1_swqe *swq = NULL; - struct hfi1_ibdev *dev; - struct hfi1_devdata *dd; - size_t sz; - size_t sg_list_sz; - struct ib_qp *ret; - - if (init_attr->cap.max_send_sge > hfi1_max_sges || - init_attr->cap.max_send_wr > hfi1_max_qp_wrs || - init_attr->create_flags) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - - /* Check receive queue parameters if no SRQ is specified. */ - if (!init_attr->srq) { - if (init_attr->cap.max_recv_sge > hfi1_max_sges || - init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - if (init_attr->cap.max_send_sge + - init_attr->cap.max_send_wr + - init_attr->cap.max_recv_sge + - init_attr->cap.max_recv_wr == 0) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - } - - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - if (init_attr->port_num == 0 || - init_attr->port_num > ibpd->device->phys_port_cnt) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - case IB_QPT_UC: - case IB_QPT_RC: - case IB_QPT_UD: - sz = sizeof(struct hfi1_sge) * - init_attr->cap.max_send_sge + - sizeof(struct hfi1_swqe); - swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); - if (swq == NULL) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - sz = sizeof(*qp); - sg_list_sz = 0; - if (init_attr->srq) { - struct hfi1_srq *srq = to_isrq(init_attr->srq); - - if (srq->rq.max_sge > 1) - sg_list_sz = sizeof(*qp->r_sg_list) * - (srq->rq.max_sge - 1); - } else if (init_attr->cap.max_recv_sge > 1) - sg_list_sz = sizeof(*qp->r_sg_list) * - (init_attr->cap.max_recv_sge - 1); - qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); - if (!qp) { - ret = ERR_PTR(-ENOMEM); - goto bail_swq; - } - RCU_INIT_POINTER(qp->next, NULL); - qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); - if (!qp->s_hdr) { - ret = ERR_PTR(-ENOMEM); - goto bail_qp; - } - qp->timeout_jiffies = - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL); - if (init_attr->srq) - sz = 0; - else { - qp->r_rq.size = init_attr->cap.max_recv_wr + 1; - qp->r_rq.max_sge = init_attr->cap.max_recv_sge; - sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + - sizeof(struct hfi1_rwqe); - qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + - qp->r_rq.size * sz); - if (!qp->r_rq.wq) { - ret = ERR_PTR(-ENOMEM); - goto bail_qp; - } - } - - /* - * ib_create_qp() will initialize qp->ibqp - * except for qp->ibqp.qp_num. - */ - spin_lock_init(&qp->r_lock); - spin_lock_init(&qp->s_lock); - spin_lock_init(&qp->r_rq.lock); - atomic_set(&qp->refcount, 0); - init_waitqueue_head(&qp->wait); - init_timer(&qp->s_timer); - qp->s_timer.data = (unsigned long)qp; - INIT_LIST_HEAD(&qp->rspwait); - qp->state = IB_QPS_RESET; - qp->s_wq = swq; - qp->s_size = init_attr->cap.max_send_wr + 1; - qp->s_max_sge = init_attr->cap.max_send_sge; - if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) - qp->s_flags = HFI1_S_SIGNAL_REQ_WR; - dev = to_idev(ibpd->device); - dd = dd_from_dev(dev); - err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type, - init_attr->port_num); - if (err < 0) { - ret = ERR_PTR(err); - vfree(qp->r_rq.wq); - goto bail_qp; - } - qp->ibqp.qp_num = err; - qp->port_num = init_attr->port_num; - reset_qp(qp, init_attr->qp_type); - - break; - - default: - /* Don't support raw QPs */ - ret = ERR_PTR(-ENOSYS); - goto bail; - } - - init_attr->cap.max_inline_data = 0; - - /* - * Return the address of the RWQ as the offset to mmap. - * See hfi1_mmap() for details. - */ - if (udata && udata->outlen >= sizeof(__u64)) { - if (!qp->r_rq.wq) { - __u64 offset = 0; - - err = ib_copy_to_udata(udata, &offset, - sizeof(offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } else { - u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz; - - qp->ip = hfi1_create_mmap_info(dev, s, - ibpd->uobject->context, - qp->r_rq.wq); - if (!qp->ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - err = ib_copy_to_udata(udata, &(qp->ip->offset), - sizeof(qp->ip->offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_ip; - } - } - } - - spin_lock(&dev->n_qps_lock); - if (dev->n_qps_allocated == hfi1_max_qps) { - spin_unlock(&dev->n_qps_lock); - ret = ERR_PTR(-ENOMEM); - goto bail_ip; - } - - dev->n_qps_allocated++; - spin_unlock(&dev->n_qps_lock); - - if (qp->ip) { - spin_lock_irq(&dev->pending_lock); - list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); - spin_unlock_irq(&dev->pending_lock); - } - - ret = &qp->ibqp; - - /* - * We have our QP and its good, now keep track of what types of opcodes - * can be processed on this QP. We do this by keeping track of what the - * 3 high order bits of the opcode are. - */ - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK; - break; - case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK; - break; - case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK; - break; - default: - ret = ERR_PTR(-EINVAL); - goto bail_ip; - } - - goto bail; - -bail_ip: - if (qp->ip) - kref_put(&qp->ip->ref, hfi1_release_mmap_info); - else - vfree(qp->r_rq.wq); - free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num); -bail_qp: - kfree(qp->s_hdr); - kfree(qp); -bail_swq: - vfree(swq); -bail: - return ret; -} - -/** - * hfi1_destroy_qp - destroy a queue pair - * @ibqp: the queue pair to destroy + * _hfi1_schedule_send - schedule progress + * @qp: the QP * - * Returns 0 on success. + * This schedules qp progress w/o regard to the s_flags. * - * Note that this can be called while the QP is actively sending or - * receiving! + * It is only used in the post send, which doesn't hold + * the s_lock. */ -int hfi1_destroy_qp(struct ib_qp *ibqp) +void _hfi1_schedule_send(struct rvt_qp *qp) { - struct hfi1_qp *qp = to_iqp(ibqp); - struct hfi1_ibdev *dev = to_idev(ibqp->device); - - /* Make sure HW and driver activity is stopped. */ - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - if (qp->state != IB_QPS_RESET) { - qp->state = IB_QPS_RESET; - flush_iowait(qp); - qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT); - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - cancel_work_sync(&qp->s_iowait.iowork); - del_timer_sync(&qp->s_timer); - iowait_sdma_drain(&qp->s_iowait); - flush_tx_list(qp); - remove_qp(dev, qp); - wait_event(qp->wait, !atomic_read(&qp->refcount)); - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_lock); - clear_mr_refs(qp, 1); - clear_ahg(qp); - } - spin_unlock(&qp->s_lock); - spin_unlock_irq(&qp->r_lock); - - /* all user's cleaned up, mark it available */ - free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num); - spin_lock(&dev->n_qps_lock); - dev->n_qps_allocated--; - spin_unlock(&dev->n_qps_lock); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - if (qp->ip) - kref_put(&qp->ip->ref, hfi1_release_mmap_info); - else - vfree(qp->r_rq.wq); - vfree(qp->s_wq); - kfree(qp->s_hdr); - kfree(qp); - return 0; + iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); } -/** - * init_qpn_table - initialize the QP number table for a device - * @qpt: the QPN table - */ -static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt) +static void qp_pio_drain(struct rvt_qp *qp) { - u32 offset, qpn, i; - struct qpn_map *map; - int ret = 0; + struct hfi1_ibdev *dev; + struct hfi1_qp_priv *priv = qp->priv; - spin_lock_init(&qpt->lock); - - qpt->last = 0; - qpt->incr = 1 << dd->qos_shift; - - /* insure we don't assign QPs from KDETH 64K window */ - qpn = kdeth_qp << 16; - qpt->nmaps = qpn / BITS_PER_PAGE; - /* This should always be zero */ - offset = qpn & BITS_PER_PAGE_MASK; - map = &qpt->map[qpt->nmaps]; - dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n", - qpn, qpn + 65535); - for (i = 0; i < 65536; i++) { - if (!map->page) { - get_map_page(qpt, map); - if (!map->page) { - ret = -ENOMEM; - break; - } - } - set_bit(offset, map->page); - offset++; - if (offset == BITS_PER_PAGE) { - /* next page */ - qpt->nmaps++; - map++; - offset = 0; - } + if (!priv->s_sendcontext) + return; + dev = to_idev(qp->ibqp.device); + while (iowait_pio_pending(&priv->s_iowait)) { + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1); + write_sequnlock_irq(&dev->iowait_lock); + iowait_pio_drain(&priv->s_iowait); + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0); + write_sequnlock_irq(&dev->iowait_lock); } - return ret; } /** - * free_qpn_table - free the QP number table for a device - * @qpt: the QPN table + * hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress and caller should hold + * the s_lock. */ -static void free_qpn_table(struct hfi1_qpn_table *qpt) +void hfi1_schedule_send(struct rvt_qp *qp) { - int i; - - for (i = 0; i < ARRAY_SIZE(qpt->map); i++) - free_page((unsigned long) qpt->map[i].page); + if (hfi1_send_ok(qp)) + _hfi1_schedule_send(qp); } /** @@ -1368,7 +400,7 @@ static void free_qpn_table(struct hfi1_qpn_table *qpt) * * The QP s_lock should be held. */ -void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth) +void hfi1_get_credit(struct rvt_qp *qp, u32 aeth) { u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK; @@ -1378,27 +410,27 @@ void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth) * honor the credit field. */ if (credit == HFI1_AETH_CREDIT_INVAL) { - if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) { - qp->s_flags |= HFI1_S_UNLIMITED_CREDIT; - if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) { - qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + qp->s_flags |= RVT_S_UNLIMITED_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; hfi1_schedule_send(qp); } } - } else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) { + } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { /* Compute new LSN (i.e., MSN + credit) */ credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK; if (cmp_msn(credit, qp->s_lsn) > 0) { qp->s_lsn = credit; - if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) { - qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; hfi1_schedule_send(qp); } } } } -void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag) +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) { unsigned long flags; @@ -1421,16 +453,17 @@ static int iowait_sleep( unsigned seq) { struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); - struct hfi1_qp *qp; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; unsigned long flags; int ret = 0; struct hfi1_ibdev *dev; qp = tx->qp; + priv = qp->priv; spin_lock_irqsave(&qp->s_lock, flags); - if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { - + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { /* * If we couldn't queue the DMA request, save the info * and try again later rather than destroying the @@ -1442,18 +475,18 @@ static int iowait_sleep( write_seqlock(&dev->iowait_lock); if (sdma_progress(sde, seq, stx)) goto eagain; - if (list_empty(&qp->s_iowait.list)) { + if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - ibp->n_dmawait++; - qp->s_flags |= HFI1_S_WAIT_DMA_DESC; - list_add_tail(&qp->s_iowait.list, &sde->dmawait); - trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC); + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + list_add_tail(&priv->s_iowait.list, &sde->dmawait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC); atomic_inc(&qp->refcount); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~HFI1_S_BUSY; + qp->s_flags &= ~RVT_S_BUSY; spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EBUSY; } else { @@ -1470,61 +503,25 @@ eagain: static void iowait_wakeup(struct iowait *wait, int reason) { - struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait); + struct rvt_qp *qp = iowait_to_qp(wait); WARN_ON(reason != SDMA_AVAIL_REASON); - hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC); + hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC); } -int hfi1_qp_init(struct hfi1_ibdev *dev) +static void iowait_sdma_drained(struct iowait *wait) { - struct hfi1_devdata *dd = dd_from_dev(dev); - int i; - int ret = -ENOMEM; - - /* allocate parent object */ - dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL); - if (!dev->qp_dev) - goto nomem; - /* allocate hash table */ - dev->qp_dev->qp_table_size = hfi1_qp_table_size; - dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size); - dev->qp_dev->qp_table = - kmalloc(dev->qp_dev->qp_table_size * - sizeof(*dev->qp_dev->qp_table), - GFP_KERNEL); - if (!dev->qp_dev->qp_table) - goto nomem; - for (i = 0; i < dev->qp_dev->qp_table_size; i++) - RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL); - spin_lock_init(&dev->qp_dev->qpt_lock); - /* initialize qpn map */ - ret = init_qpn_table(dd, &dev->qp_dev->qpn_table); - if (ret) - goto nomem; - return ret; -nomem: - if (dev->qp_dev) { - kfree(dev->qp_dev->qp_table); - free_qpn_table(&dev->qp_dev->qpn_table); - kfree(dev->qp_dev); - } - return ret; -} + struct rvt_qp *qp = iowait_to_qp(wait); -void hfi1_qp_exit(struct hfi1_ibdev *dev) -{ - struct hfi1_devdata *dd = dd_from_dev(dev); - u32 qps_inuse; - - qps_inuse = free_all_qps(dd); - if (qps_inuse) - dd_dev_err(dd, "QP memory leak! %u still in use\n", - qps_inuse); - if (dev->qp_dev) { - kfree(dev->qp_dev->qp_table); - free_qpn_table(&dev->qp_dev->qpn_table); - kfree(dev->qp_dev); + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. + */ + if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; + hfi1_schedule_send(qp); } } @@ -1537,7 +534,7 @@ void hfi1_qp_exit(struct hfi1_ibdev *dev) * Return: * A send engine for the qp or NULL for SMI type qp. */ -struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5) +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct sdma_engine *sde; @@ -1554,9 +551,33 @@ struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5) return sde; } +/* + * qp_to_send_context - map a qp to a send context + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send context for the qp + */ +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + /* SMA packets to VL15 */ + return dd->vld[15].sc; + default: + break; + } + + return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, + sc5); +} + struct qp_iter { struct hfi1_ibdev *dev; - struct hfi1_qp *qp; + struct rvt_qp *qp; int specials; int n; }; @@ -1570,7 +591,7 @@ struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev) return NULL; iter->dev = dev; - iter->specials = dev->ibdev.phys_port_cnt * 2; + iter->specials = dev->rdi.ibdev.phys_port_cnt * 2; if (qp_iter_next(iter)) { kfree(iter); return NULL; @@ -1584,8 +605,8 @@ int qp_iter_next(struct qp_iter *iter) struct hfi1_ibdev *dev = iter->dev; int n = iter->n; int ret = 1; - struct hfi1_qp *pqp = iter->qp; - struct hfi1_qp *qp; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; /* * The approach is to consider the special qps @@ -1597,11 +618,11 @@ int qp_iter_next(struct qp_iter *iter) * * n = 0..iter->specials is the special qp indices * - * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials are + * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are * the potential hash bucket entries * */ - for (; n < dev->qp_dev->qp_table_size + iter->specials; n++) { + for (; n < dev->rdi.qp_dev->qp_table_size + iter->specials; n++) { if (pqp) { qp = rcu_dereference(pqp->next); } else { @@ -1610,17 +631,17 @@ int qp_iter_next(struct qp_iter *iter) struct hfi1_ibport *ibp; int pidx; - pidx = n % dev->ibdev.phys_port_cnt; + pidx = n % dev->rdi.ibdev.phys_port_cnt; ppd = &dd_from_dev(dev)->pport[pidx]; ibp = &ppd->ibport_data; if (!(n & 1)) - qp = rcu_dereference(ibp->qp[0]); + qp = rcu_dereference(ibp->rvp.qp[0]); else - qp = rcu_dereference(ibp->qp[1]); + qp = rcu_dereference(ibp->rvp.qp[1]); } else { qp = rcu_dereference( - dev->qp_dev->qp_table[ + dev->rdi.qp_dev->qp_table[ (n - iter->specials)]); } } @@ -1638,7 +659,7 @@ static const char * const qp_type_str[] = { "SMI", "GSI", "RC", "UC", "UD", }; -static int qp_idle(struct hfi1_qp *qp) +static int qp_idle(struct rvt_qp *qp) { return qp->s_last == qp->s_acked && @@ -1649,14 +670,17 @@ static int qp_idle(struct hfi1_qp *qp) void qp_iter_print(struct seq_file *s, struct qp_iter *iter) { - struct hfi1_swqe *wqe; - struct hfi1_qp *qp = iter->qp; + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct hfi1_qp_priv *priv = qp->priv; struct sdma_engine *sde; + struct send_context *send_context; - sde = qp_to_sdma_engine(qp, qp->s_sc); - wqe = get_swqe_ptr(qp, qp->s_last); + sde = qp_to_sdma_engine(qp, priv->s_sc); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + send_context = qp_to_send_context(qp, priv->s_sc); seq_printf(s, - "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n", + "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n", iter->n, qp_idle(qp) ? "I" : "B", qp->ibqp.qp_num, @@ -1666,8 +690,9 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter) wqe ? wqe->wr.opcode : 0, qp->s_hdrwords, qp->s_flags, - atomic_read(&qp->s_iowait.sdma_busy), - !list_empty(&qp->s_iowait.list), + iowait_sdma_pending(&priv->s_iowait), + iowait_pio_pending(&priv->s_iowait), + !list_empty(&priv->s_iowait.list), qp->timeout, wqe ? wqe->ssn : 0, qp->s_lsn, @@ -1676,20 +701,26 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter) qp->s_sending_psn, qp->s_sending_hpsn, qp->s_last, qp->s_acked, qp->s_cur, qp->s_tail, qp->s_head, qp->s_size, + qp->s_avail, qp->remote_qpn, qp->remote_ah_attr.dlid, qp->remote_ah_attr.sl, qp->pmtu, + qp->s_retry, qp->s_retry_cnt, - qp->timeout, qp->s_rnr_retry_cnt, sde, - sde ? sde->this_idx : 0); + sde ? sde->this_idx : 0, + send_context, + send_context ? send_context->sw_index : 0, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + qp->pid); } -void qp_comm_est(struct hfi1_qp *qp) +void qp_comm_est(struct rvt_qp *qp) { - qp->r_flags |= HFI1_R_COMM_EST; + qp->r_flags |= RVT_R_COMM_EST; if (qp->ibqp.event_handler) { struct ib_event ev; @@ -1700,24 +731,241 @@ void qp_comm_est(struct hfi1_qp *qp) } } +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp) +{ + struct hfi1_qp_priv *priv; + + priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node); + if (!priv) + return ERR_PTR(-ENOMEM); + + priv->owner = qp; + + priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node); + if (!priv->s_hdr) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp); + qp->s_timer.function = hfi1_rc_timeout; + return priv; +} + +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + kfree(priv->s_hdr); + kfree(priv); +} + +unsigned free_all_qps(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + int n; + unsigned qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; + + rcu_read_lock(); + if (rcu_dereference(ibp->rvp.qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->rvp.qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + + return qp_inuse; +} + +void flush_qp_waiters(struct rvt_qp *qp) +{ + flush_iowait(qp); + hfi1_stop_rc_timers(qp); +} + +void stop_send_queue(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + cancel_work_sync(&priv->s_iowait.iowork); + hfi1_del_timers_sync(qp); +} + +void quiesce_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_sdma_drain(&priv->s_iowait); + qp_pio_drain(qp); + flush_tx_list(qp); +} + +void notify_qp_reset(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_init( + &priv->s_iowait, + 1, + _hfi1_do_send, + iowait_sleep, + iowait_wakeup, + iowait_sdma_drained); + priv->r_adefered = 0; + clear_ahg(qp); +} + /* * Switch to alternate path. * The QP s_lock should be held and interrupts disabled. */ -void hfi1_migrate_qp(struct hfi1_qp *qp) +void hfi1_migrate_qp(struct rvt_qp *qp) { + struct hfi1_qp_priv *priv = qp->priv; struct ib_event ev; qp->s_mig_state = IB_MIG_MIGRATED; qp->remote_ah_attr = qp->alt_ah_attr; qp->port_num = qp->alt_ah_attr.port_num; qp->s_pkey_index = qp->s_alt_pkey_index; - qp->s_flags |= HFI1_S_AHG_CLEAR; - qp->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); - qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc); + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); ev.device = qp->ibqp.device; ev.element.qp = &qp->ibqp; ev.event = IB_EVENT_PATH_MIG; qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); } + +int mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu, OPA_MTU_8192); +} + +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + u32 mtu; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + struct hfi1_ibport *ibp; + u8 sc, vl; + + ibp = &dd->pport[qp->port_num - 1].ibport_data; + sc = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + vl = sc_to_vlt(dd, sc); + + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu); + if (vl < PER_VL_SEND_CONTEXTS) + mtu = min_t(u32, mtu, dd->vld[vl].mtu); + return mtu; +} + +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) +{ + int mtu, pidx = qp->port_num - 1; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu); + if (mtu == -1) + return -1; /* values less than 0 are error */ + + if (mtu > dd->pport[pidx].ibmtu) + return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); + else + return attr->path_mtu; +} + +void notify_error_qp(struct rvt_qp *qp) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + + write_seqlock(&dev->iowait_lock); + if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + list_del_init(&priv->s_iowait.list); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + write_sequnlock(&dev->iowait_lock); + + if (!(qp->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + flush_tx_list(qp); + } +} + +/** + * hfi1_error_port_qps - put a port's RC/UC qps into error state + * @ibp: the ibport. + * @sl: the service level. + * + * This function places all RC/UC qps with a given service level into error + * state. It is generally called to force upper lay apps to abandon stale qps + * after an sl->sc mapping change. + */ +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl) +{ + struct rvt_qp *qp = NULL; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ibdev *dev = &ppd->dd->verbs_dev; + int n; + int lastwqe; + struct ib_event ev; + + rcu_read_lock(); + + /* Deal only with RC/UC qps that use the given SL. */ + for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) { + for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) { + if (qp->port_num == ppd->port && + (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) && + qp->remote_ah_attr.sl == sl && + (ib_rvt_state_ops[qp->state] & + RVT_POST_SEND_OK)) { + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + lastwqe = rvt_error_qp(qp, + IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = + IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } + } + } + + rcu_read_unlock(); +} diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h index 62a94c5d7dca..e7bc8d6cf681 100644 --- a/drivers/staging/rdma/hfi1/qp.h +++ b/drivers/staging/rdma/hfi1/qp.h @@ -1,14 +1,13 @@ #ifndef _QP_H #define _QP_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -51,119 +48,33 @@ */ #include <linux/hash.h> +#include <rdma/rdmavt_qp.h> #include "verbs.h" #include "sdma.h" -#define QPN_MAX (1 << 24) -#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +extern unsigned int hfi1_qp_table_size; /* - * QPN-map pages start out as NULL, they get allocated upon - * first use and are never deallocated. This way, - * large bitmaps are not allocated unless large numbers of QPs are used. - */ -struct qpn_map { - void *page; -}; - -struct hfi1_qpn_table { - spinlock_t lock; /* protect changes in this struct */ - unsigned flags; /* flags for QP0/1 allocated for each port */ - u32 last; /* last QP number allocated */ - u32 nmaps; /* size of the map table */ - u16 limit; - u8 incr; - /* bit map of free QP numbers other than 0/1 */ - struct qpn_map map[QPNMAP_ENTRIES]; -}; - -struct hfi1_qp_ibdev { - u32 qp_table_size; - u32 qp_table_bits; - struct hfi1_qp __rcu **qp_table; - spinlock_t qpt_lock; - struct hfi1_qpn_table qpn_table; -}; - -static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn) -{ - return hash_32(qpn, dev->qp_table_bits); -} - -/** - * hfi1_lookup_qpn - return the QP with the given QPN - * @ibp: the ibport - * @qpn: the QP number to look up - * - * The caller must hold the rcu_read_lock(), and keep the lock until - * the returned qp is no longer in use. + * free_ahg - clear ahg from QP */ -static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp, - u32 qpn) __must_hold(RCU) +static inline void clear_ahg(struct rvt_qp *qp) { - struct hfi1_qp *qp = NULL; - - if (unlikely(qpn <= 1)) { - qp = rcu_dereference(ibp->qp[qpn]); - } else { - struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; - u32 n = qpn_hash(dev->qp_dev, qpn); - - for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp; - qp = rcu_dereference(qp->next)) - if (qp->ibqp.qp_num == qpn) - break; - } - return qp; -} + struct hfi1_qp_priv *priv = qp->priv; -/** - * clear_ahg - reset ahg status in qp - * @qp - qp pointer - */ -static inline void clear_ahg(struct hfi1_qp *qp) -{ - qp->s_hdr->ahgcount = 0; - qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR); - if (qp->s_sde && qp->s_ahgidx >= 0) - sdma_ahg_free(qp->s_sde, qp->s_ahgidx); + priv->s_hdr->ahgcount = 0; + qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR); + if (priv->s_sde && qp->s_ahgidx >= 0) + sdma_ahg_free(priv->s_sde, qp->s_ahgidx); qp->s_ahgidx = -1; } /** - * hfi1_error_qp - put a QP into the error state - * @qp: the QP to put into the error state - * @err: the receive completion error to signal if a RWQE is active - * - * Flushes both send and receive work queues. - * Returns true if last WQE event should be generated. - * The QP r_lock and s_lock should be held and interrupts disabled. - * If we are already in error state, just return. - */ -int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err); - -/** - * hfi1_modify_qp - modify the attributes of a queue pair - * @ibqp: the queue pair who's attributes we're modifying - * @attr: the new attributes - * @attr_mask: the mask of attributes to modify - * @udata: user data for libibverbs.so - * - * Returns 0 on success, otherwise returns an errno. - */ -int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata); - -int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_qp_init_attr *init_attr); - -/** * hfi1_compute_aeth - compute the AETH (syndrome + MSN) * @qp: the queue pair to compute the AETH for * * Returns the AETH. */ -__be32 hfi1_compute_aeth(struct hfi1_qp *qp); +__be32 hfi1_compute_aeth(struct rvt_qp *qp); /** * hfi1_create_qp - create a queue pair for a device @@ -179,45 +90,23 @@ struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); /** - * hfi1_destroy_qp - destroy a queue pair - * @ibqp: the queue pair to destroy - * - * Returns 0 on success. - * - * Note that this can be called while the QP is actively sending or - * receiving! - */ -int hfi1_destroy_qp(struct ib_qp *ibqp); - -/** * hfi1_get_credit - flush the send work queue of a QP * @qp: the qp who's send work queue to flush * @aeth: the Acknowledge Extended Transport Header * * The QP s_lock should be held. */ -void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth); - -/** - * hfi1_qp_init - allocate QP tables - * @dev: a pointer to the hfi1_ibdev - */ -int hfi1_qp_init(struct hfi1_ibdev *dev); - -/** - * hfi1_qp_exit - free the QP related structures - * @dev: a pointer to the hfi1_ibdev - */ -void hfi1_qp_exit(struct hfi1_ibdev *dev); +void hfi1_get_credit(struct rvt_qp *qp, u32 aeth); /** * hfi1_qp_wakeup - wake up on the indicated event * @qp: the QP * @flag: flag the qp on which the qp is stalled */ -void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag); +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag); -struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5); +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5); +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); struct qp_iter; @@ -244,43 +133,28 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter); * qp_comm_est - handle trap with QP established * @qp: the QP */ -void qp_comm_est(struct hfi1_qp *qp); +void qp_comm_est(struct rvt_qp *qp); -/** - * _hfi1_schedule_send - schedule progress - * @qp: the QP - * - * This schedules qp progress w/o regard to the s_flags. - * - * It is only used in the post send, which doesn't hold - * the s_lock. - */ -static inline void _hfi1_schedule_send(struct hfi1_qp *qp) -{ - struct hfi1_ibport *ibp = - to_iport(qp->ibqp.device, qp->port_num); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); +void _hfi1_schedule_send(struct rvt_qp *qp); +void hfi1_schedule_send(struct rvt_qp *qp); - iowait_schedule(&qp->s_iowait, ppd->hfi1_wq, - qp->s_sde ? - qp->s_sde->cpu : - cpumask_first(cpumask_of_node(dd->assigned_node_id))); -} - -/** - * hfi1_schedule_send - schedule progress - * @qp: the QP - * - * This schedules qp progress and caller should hold - * the s_lock. - */ -static inline void hfi1_schedule_send(struct hfi1_qp *qp) -{ - if (hfi1_send_ok(qp)) - _hfi1_schedule_send(qp); -} - -void hfi1_migrate_qp(struct hfi1_qp *qp); +void hfi1_migrate_qp(struct rvt_qp *qp); +/* + * Functions provided by hfi1 driver for rdmavt to use + */ +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp); +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +unsigned free_all_qps(struct rvt_dev_info *rdi); +void notify_qp_reset(struct rvt_qp *qp); +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); +void flush_qp_waiters(struct rvt_qp *qp); +void notify_error_qp(struct rvt_qp *qp); +void stop_send_queue(struct rvt_qp *qp); +void quiesce_qp(struct rvt_qp *qp); +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +int mtu_to_path_mtu(u32 mtu); +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); #endif /* _QP_H */ diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c index 6326a915d7fd..9ed1963010fe 100644 --- a/drivers/staging/rdma/hfi1/qsfp.c +++ b/drivers/staging/rdma/hfi1/qsfp.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -62,7 +59,7 @@ #define I2C_MAX_RETRY 4 /* - * Unlocked i2c write. Must hold dd->qsfp_i2c_mutex. + * Raw i2c write. No set-up or lock checking. */ static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) @@ -71,14 +68,6 @@ static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int ret, cnt; u8 *buff = bp; - /* Make sure TWSI bus is in sane state. */ - ret = hfi1_twsi_reset(dd, target); - if (ret) { - hfi1_dev_porterr(dd, ppd->port, - "I2C interface Reset for write failed\n"); - return -EIO; - } - cnt = 0; while (cnt < len) { int wlen = len - cnt; @@ -99,48 +88,45 @@ static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, return cnt; } +/* + * Caller must hold the i2c chain resource. + */ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { - struct hfi1_devdata *dd = ppd->dd; int ret; - ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex); - if (!ret) { - ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); - mutex_unlock(&dd->qsfp_i2c_mutex); + if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "I2C chain %d write interface reset failed\n", + target); + return ret; } - return ret; + return __i2c_write(ppd, target, i2c_addr, offset, bp, len); } /* - * Unlocked i2c read. Must hold dd->qsfp_i2c_mutex. + * Raw i2c read. No set-up or lock checking. */ static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { struct hfi1_devdata *dd = ppd->dd; int ret, cnt, pass = 0; - int stuck = 0; - u8 *buff = bp; - - /* Make sure TWSI bus is in sane state. */ - ret = hfi1_twsi_reset(dd, target); - if (ret) { - hfi1_dev_porterr(dd, ppd->port, - "I2C interface Reset for read failed\n"); - ret = -EIO; - stuck = 1; - goto exit; - } + int orig_offset = offset; cnt = 0; while (cnt < len) { int rlen = len - cnt; ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset, - buff + cnt, rlen); + bp + cnt, rlen); /* Some QSFP's fail first try. Retry as experiment */ if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY) continue; @@ -156,14 +142,11 @@ static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, ret = cnt; exit: - if (stuck) - dd_dev_err(dd, "I2C interface bus stuck non-idle\n"); - - if (pass >= I2C_MAX_RETRY && ret) + if (ret < 0) { hfi1_dev_porterr(dd, ppd->port, - "I2C failed even retrying\n"); - else if (pass) - hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass); + "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n", + target, i2c_addr, orig_offset, len); + } /* Must wait min 20us between qsfp i2c transactions */ udelay(20); @@ -171,21 +154,35 @@ exit: return ret; } +/* + * Caller must hold the i2c chain resource. + */ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, void *bp, int len) { - struct hfi1_devdata *dd = ppd->dd; int ret; - ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex); - if (!ret) { - ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); - mutex_unlock(&dd->qsfp_i2c_mutex); + if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "I2C chain %d read interface reset failed\n", + target); + return ret; } - return ret; + return __i2c_read(ppd, target, i2c_addr, offset, bp, len); } +/* + * Write page n, offset m of QSFP memory as defined by SFF 8636 + * by writing @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + */ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) { @@ -195,50 +192,81 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int ret; u8 page; - ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex); - if (ret) + if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d write interface reset failed\n", + target); return ret; + } while (count < len) { /* - * Set the qsfp page based on a zero-based addresss + * Set the qsfp page based on a zero-based address * and a page size of QSFP_PAGESIZE bytes. */ page = (u8)(addr / QSFP_PAGESIZE); - ret = __i2c_write(ppd, target, QSFP_DEV, - QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); if (ret != 1) { - hfi1_dev_porterr( - ppd->dd, - ppd->port, - "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret); + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); ret = -EIO; break; } - /* truncate write to end of page if crossing page boundary */ offset = addr % QSFP_PAGESIZE; nwrite = len - count; - if ((offset + nwrite) > QSFP_PAGESIZE) - nwrite = QSFP_PAGESIZE - offset; + /* truncate write to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY) + nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); - ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count, - nwrite); - if (ret <= 0) /* stop on error or nothing read */ + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nwrite); + if (ret <= 0) /* stop on error or nothing written */ break; count += ret; addr += ret; } - mutex_unlock(&ppd->dd->qsfp_i2c_mutex); - if (ret < 0) return ret; return count; } +/* + * Perform a stand-alone single QSFP write. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_write(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * by reading @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + */ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len) { @@ -248,9 +276,17 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int ret; u8 page; - ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex); - if (ret) + if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d read interface reset failed\n", + target); return ret; + } while (count < len) { /* @@ -258,25 +294,26 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, * and a page size of QSFP_PAGESIZE bytes. */ page = (u8)(addr / QSFP_PAGESIZE); - ret = __i2c_write(ppd, target, QSFP_DEV, - QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); if (ret != 1) { - hfi1_dev_porterr( - ppd->dd, - ppd->port, - "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret); + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); ret = -EIO; break; } - /* truncate read to end of page if crossing page boundary */ offset = addr % QSFP_PAGESIZE; nread = len - count; - if ((offset + nread) > QSFP_PAGESIZE) - nread = QSFP_PAGESIZE - offset; - - ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count, - nread); + /* truncate read to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) + nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nread); if (ret <= 0) /* stop on error or nothing read */ break; @@ -284,17 +321,40 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, addr += ret; } - mutex_unlock(&ppd->dd->qsfp_i2c_mutex); - if (ret < 0) return ret; return count; } /* + * Perform a stand-alone single QSFP read. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_read(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* * This function caches the QSFP memory range in 128 byte chunks. * As an example, the next byte after address 255 is byte 128 from * upper page 01H (if existing) rather than byte 0 from lower page 00H. + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * in the cache by reading byte ((128 * n) + m) + * The calls to qsfp_{read,write} in this function correctly handle the + * address map difference between this mapping and the mapping implemented + * by those functions */ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) { @@ -304,79 +364,84 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) u8 *cache = &cp->cache[0]; /* ensure sane contents on invalid reads, for cable swaps */ - memset(cache, 0, (QSFP_MAX_NUM_PAGES*128)); - dd_dev_info(ppd->dd, "%s: called\n", __func__); + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + if (!qsfp_mod_present(ppd)) { ret = -ENODEV; - goto bail; + goto bail_no_release; } - ret = qsfp_read(ppd, target, 0, cache, 256); - if (ret != 256) { + ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT); + if (ret) + goto bail_no_release; + + ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); + if (ret != QSFP_PAGESIZE) { dd_dev_info(ppd->dd, - "%s: Read of pages 00H failed, expected 256, got %d\n", - __func__, ret); + "%s: Page 0 read failed, expected %d, got %d\n", + __func__, QSFP_PAGESIZE, ret); goto bail; } - if (cache[0] != 0x0C && cache[0] != 0x0D) - goto bail; - /* Is paging enabled? */ if (!(cache[2] & 4)) { - /* Paging enabled, page 03 required */ if ((cache[195] & 0xC0) == 0xC0) { /* all */ ret = qsfp_read(ppd, target, 384, cache + 256, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } ret = qsfp_read(ppd, target, 640, cache + 384, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } ret = qsfp_read(ppd, target, 896, cache + 512, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } } else if ((cache[195] & 0x80) == 0x80) { /* only page 2 and 3 */ ret = qsfp_read(ppd, target, 640, cache + 384, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } ret = qsfp_read(ppd, target, 896, cache + 512, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } } else if ((cache[195] & 0x40) == 0x40) { /* only page 1 and 3 */ ret = qsfp_read(ppd, target, 384, cache + 256, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } ret = qsfp_read(ppd, target, 896, cache + 512, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } } else { /* only page 3 */ ret = qsfp_read(ppd, target, 896, cache + 512, 128); if (ret <= 0 || ret != 128) { - dd_dev_info(ppd->dd, "%s: failed\n", __func__); + dd_dev_info(ppd->dd, "%s failed\n", __func__); goto bail; } } } + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); ppd->qsfp_info.cache_valid = 1; ppd->qsfp_info.cache_refresh_required = 0; @@ -385,7 +450,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) return 0; bail: - memset(cache, 0, (QSFP_MAX_NUM_PAGES*128)); + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); +bail_no_release: + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); return ret; } @@ -434,7 +501,7 @@ int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, if (port_num > dd->num_pports || port_num < 1) { dd_dev_info(dd, "%s: Invalid port number %d\n", - __func__, port_num); + __func__, port_num); ret = -EINVAL; goto set_zeroes; } @@ -485,7 +552,6 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) lenstr[1] = '\0'; if (ppd->qsfp_info.cache_valid) { - if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]); @@ -529,7 +595,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK); for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) { - sofar += scnprintf(buf + sofar, len-sofar, + sofar += scnprintf(buf + sofar, len - sofar, " %02X", bin_buff[iidx]); } sofar += scnprintf(buf + sofar, len - sofar, "\n"); diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h index d30c2a6baa0b..831fe4cf1345 100644 --- a/drivers/staging/rdma/hfi1/qsfp.h +++ b/drivers/staging/rdma/hfi1/qsfp.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,23 +56,28 @@ * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1. * _N means asserted low */ -#define QSFP_HFI0_I2CCLK (1 << 0) -#define QSFP_HFI0_I2CDAT (1 << 1) -#define QSFP_HFI0_RESET_N (1 << 2) -#define QSFP_HFI0_INT_N (1 << 3) -#define QSFP_HFI0_MODPRST_N (1 << 4) +#define QSFP_HFI0_I2CCLK BIT(0) +#define QSFP_HFI0_I2CDAT BIT(1) +#define QSFP_HFI0_RESET_N BIT(2) +#define QSFP_HFI0_INT_N BIT(3) +#define QSFP_HFI0_MODPRST_N BIT(4) /* QSFP is paged at 256 bytes */ #define QSFP_PAGESIZE 256 +/* Reads/writes cannot cross 128 byte boundaries */ +#define QSFP_RW_BOUNDARY 128 + +/* number of bytes in i2c offset for QSFP devices */ +#define __QSFP_OFFSET_SIZE 1 /* num address bytes */ +#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8) /* shifted value */ /* Defined fields that Intel requires of qualified cables */ /* Byte 0 is Identifier, not checked */ /* Byte 1 is reserved "status MSB" */ -/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */ -/* - * Rest of first 128 not used, although 127 is reserved for page select - * if module is not "Flat memory". - */ +#define QSFP_TX_CTRL_BYTE_OFFS 86 +#define QSFP_PWR_CTRL_BYTE_OFFS 93 +#define QSFP_CDR_CTRL_BYTE_OFFS 98 + #define QSFP_PAGE_SELECT_BYTE_OFFS 127 /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ #define QSFP_MOD_ID_OFFS 128 @@ -87,7 +89,8 @@ /* Byte 130 is Connector type. Not Intel req'd */ /* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ /* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */ -/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */ +#define QSFP_NOM_BIT_RATE_100_OFFS 140 /* Byte 141 is Extended Rate Select. Not Intel req'd */ /* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */ /* Byte 146 is length for Copper. Units of 1 meter */ @@ -135,11 +138,18 @@ extern const char *const hfi1_qsfp_devtech[16]; */ #define QSFP_ATTEN_OFFS 186 #define QSFP_ATTEN_LEN 2 -/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */ +/* + * Bytes 188,189 are Wavelength tolerance, if optical + * If copper, they are attenuation in dB: + * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s + */ +#define QSFP_CU_ATTEN_7G_OFFS 188 +#define QSFP_CU_ATTEN_12G_OFFS 189 /* Byte 190 is Max Case Temp. Not Intel req'd */ /* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */ #define QSFP_CC_OFFS 191 -/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */ +#define QSFP_EQ_INFO_OFFS 193 +#define QSFP_CDR_INFO_OFFS 194 /* Bytes 196..211 are Serial Number, String */ #define QSFP_SN_OFFS 196 #define QSFP_SN_LEN 16 @@ -150,6 +160,8 @@ extern const char *const hfi1_qsfp_devtech[16]; #define QSFP_LOT_OFFS 218 #define QSFP_LOT_LEN 2 /* Bytes 220, 221 indicate monitoring options, Not Intel req'd */ +/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */ +#define QSFP_NOM_BIT_RATE_250_OFFS 222 /* Byte 223 is LSB of sum of bytes 192..222 */ #define QSFP_CC_EXT_OFFS 223 @@ -191,6 +203,7 @@ extern const char *const hfi1_qsfp_devtech[16]; */ #define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4) #define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) #define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) @@ -198,10 +211,12 @@ struct qsfp_data { /* Helps to find our way */ struct hfi1_pportdata *ppd; struct work_struct qsfp_work; - u8 cache[QSFP_MAX_NUM_PAGES*128]; + u8 cache[QSFP_MAX_NUM_PAGES * 128]; + /* protect qsfp data */ spinlock_t qsfp_lock; u8 check_interrupt_flags; - u8 qsfp_interrupt_functional; + u8 reset_needed; + u8 limiting_active; u8 cache_valid; u8 cache_refresh_required; }; @@ -220,3 +235,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, int len); +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c index 6f4a155f7931..0d7e1017f3cb 100644 --- a/drivers/staging/rdma/hfi1/rc.c +++ b/drivers/staging/rdma/hfi1/rc.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -49,18 +46,151 @@ */ #include <linux/io.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> #include "hfi.h" #include "qp.h" -#include "sdma.h" +#include "verbs_txreq.h" #include "trace.h" /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_RC_##x -static void rc_timeout(unsigned long arg); +/** + * hfi1_add_retry_timer - add/start a retry timer + * @qp - the QP + * + * add a retry timer on the QP + */ +static inline void hfi1_add_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies + + rdi->busy_jiffies; + add_timer(&qp->s_timer); +} + +/** + * hfi1_add_rnr_timer - add/start an rnr timer + * @qp - the QP + * @to - timeout in usecs + * + * add an rnr timer on the QP + */ +void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to) +{ + struct hfi1_qp_priv *priv = qp->priv; + + qp->s_flags |= RVT_S_WAIT_RNR; + qp->s_timer.expires = jiffies + usecs_to_jiffies(to); + add_timer(&priv->s_rnr_timer); +} + +/** + * hfi1_mod_retry_timer - mod a retry timer + * @qp - the QP + * + * Modify a potentially already running retry + * timer + */ +static inline void hfi1_mod_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies + + rdi->busy_jiffies); +} + +/** + * hfi1_stop_retry_timer - stop a retry timer + * @qp - the QP + * + * stop a retry timer and return if the timer + * had been pending. + */ +static inline int hfi1_stop_retry_timer(struct rvt_qp *qp) +{ + int rval = 0; + + /* Remove QP from retry */ + if (qp->s_flags & RVT_S_TIMER) { + qp->s_flags &= ~RVT_S_TIMER; + rval = del_timer(&qp->s_timer); + } + return rval; +} + +/** + * hfi1_stop_rc_timers - stop all timers + * @qp - the QP + * + * stop any pending timers + */ +void hfi1_stop_rc_timers(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + /* Remove QP from all timers */ + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); + del_timer(&qp->s_timer); + del_timer(&priv->s_rnr_timer); + } +} + +/** + * hfi1_stop_rnr_timer - stop an rnr timer + * @qp - the QP + * + * stop an rnr timer and return if the timer + * had been pending. + */ +static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp) +{ + int rval = 0; + struct hfi1_qp_priv *priv = qp->priv; + + /* Remove QP from rnr timer */ + if (qp->s_flags & RVT_S_WAIT_RNR) { + qp->s_flags &= ~RVT_S_WAIT_RNR; + rval = del_timer(&priv->s_rnr_timer); + } + return rval; +} + +/** + * hfi1_del_timers_sync - wait for any timeout routines to exit + * @qp - the QP + */ +void hfi1_del_timers_sync(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + del_timer_sync(&qp->s_timer); + del_timer_sync(&priv->s_rnr_timer); +} -static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe, +/* only opcode mask for adaptive pio */ +const u32 rc_only_opcode = + BIT(OP(SEND_ONLY) & 0x1f) | + BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_READ_REQUEST & 0x1f)) | + BIT(OP(ACKNOWLEDGE & 0x1f)) | + BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) | + BIT(OP(COMPARE_SWAP & 0x1f)) | + BIT(OP(FETCH_ADD & 0x1f)); + +static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 psn, u32 pmtu) { u32 len; @@ -74,38 +204,32 @@ static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe, return wqe->length - len; } -static void start_timer(struct hfi1_qp *qp) -{ - qp->s_flags |= HFI1_S_TIMER; - qp->s_timer.function = rc_timeout; - /* 4.096 usec. * (1 << qp->timeout) */ - qp->s_timer.expires = jiffies + qp->timeout_jiffies; - add_timer(&qp->s_timer); -} - /** * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) * @dev: the device for this QP * @qp: a pointer to the QP * @ohdr: a pointer to the IB header being constructed - * @pmtu: the path MTU + * @ps: the xmit packet state * * Return 1 if constructed; otherwise, return 0. * Note that we are in the responder's side of the QP context. * Note the QP s_lock must be held. */ -static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, - struct hfi1_other_headers *ohdr, u32 pmtu) +static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, + struct hfi1_other_headers *ohdr, + struct hfi1_pkt_state *ps) { - struct hfi1_ack_entry *e; + struct rvt_ack_entry *e; u32 hwords; u32 len; u32 bth0; u32 bth2; int middle = 0; + u32 pmtu = qp->pmtu; + struct hfi1_qp_priv *priv = qp->priv; /* Don't send an ACK if we aren't supposed to. */ - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ @@ -116,7 +240,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, case OP(RDMA_READ_RESPONSE_ONLY): e = &qp->s_ack_queue[qp->s_tail_ack_queue]; if (e->rdma_sge.mr) { - hfi1_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } /* FALLTHROUGH */ @@ -133,7 +257,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, case OP(ACKNOWLEDGE): /* Check for no next entry in the queue. */ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { - if (qp->s_flags & HFI1_S_ACK_PENDING) + if (qp->s_flags & RVT_S_ACK_PENDING) goto normal; goto bail; } @@ -152,9 +276,9 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, goto bail; } /* Copy SGE state in case we need to resend */ - qp->s_rdma_mr = e->rdma_sge.mr; - if (qp->s_rdma_mr) - hfi1_get_mr(qp->s_rdma_mr); + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); qp->s_ack_rdma_sge.sge = e->rdma_sge; qp->s_ack_rdma_sge.num_sge = 1; qp->s_cur_sge = &qp->s_ack_rdma_sge; @@ -191,9 +315,9 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, /* FALLTHROUGH */ case OP(RDMA_READ_RESPONSE_MIDDLE): qp->s_cur_sge = &qp->s_ack_rdma_sge; - qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; - if (qp->s_rdma_mr) - hfi1_get_mr(qp->s_rdma_mr); + ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); len = qp->s_ack_rdma_sge.sge.sge_length; if (len > pmtu) { len = pmtu; @@ -218,7 +342,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~HFI1_S_ACK_PENDING; + qp->s_flags &= ~RVT_S_ACK_PENDING; qp->s_cur_sge = NULL; if (qp->s_nak_state) ohdr->u.aeth = @@ -234,20 +358,23 @@ normal: } qp->s_rdma_ack_cnt++; qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; qp->s_cur_size = len; - hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle); + hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; return 1; bail: qp->s_ack_state = OP(ACKNOWLEDGE); /* * Ensure s_rdma_ack_cnt changes are committed prior to resetting - * HFI1_S_RESP_PENDING + * RVT_S_RESP_PENDING */ smp_wmb(); - qp->s_flags &= ~(HFI1_S_RESP_PENDING - | HFI1_S_ACK_PENDING - | HFI1_S_AHG_VALID); + qp->s_flags &= ~(RVT_S_RESP_PENDING + | RVT_S_ACK_PENDING + | RVT_S_AHG_VALID); return 0; } @@ -255,14 +382,17 @@ bail: * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) * @qp: a pointer to the QP * + * Assumes s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int hfi1_make_rc_req(struct hfi1_qp *qp) +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); struct hfi1_other_headers *ohdr; - struct hfi1_sge_state *ss; - struct hfi1_swqe *wqe; + struct rvt_sge_state *ss; + struct rvt_swqe *wqe; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ u32 hwords = 5; u32 len; @@ -270,51 +400,48 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) u32 bth2; u32 pmtu = qp->pmtu; char newreq; - unsigned long flags; - int ret = 0; int middle = 0; int delta; - ohdr = &qp->s_hdr->ibh.u.oth; - if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr->ibh.u.l.oth; + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; - /* - * The lock is needed to synchronize between the sending tasklet, - * the receive interrupt handler, and timeout re-sends. - */ - spin_lock_irqsave(&qp->s_lock, flags); + ohdr = &ps->s_txreq->phdr.hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; /* Sending responses has higher priority over sending requests. */ - if ((qp->s_flags & HFI1_S_RESP_PENDING) && - make_rc_ack(dev, qp, ohdr, pmtu)) - goto done; + if ((qp->s_flags & RVT_S_RESP_PENDING) && + make_rc_ack(dev, qp, ohdr, ps)) + return 1; - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) { - if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_iowait.sdma_busy)) { - qp->s_flags |= HFI1_S_WAIT_DMA; + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } clear_ahg(qp); - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ - goto done; + goto done_free_tx; } - if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { - qp->s_flags |= HFI1_S_WAIT_PSN; + qp->s_flags |= RVT_S_WAIT_PSN; goto bail; } qp->s_sending_psn = qp->s_psn; @@ -322,10 +449,10 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) } /* Send a request. */ - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); switch (qp->s_state) { default: - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* * Resend an old request or start a new one. @@ -347,11 +474,11 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) */ if ((wqe->wr.send_flags & IB_SEND_FENCE) && qp->s_num_rd_atomic) { - qp->s_flags |= HFI1_S_WAIT_FENCE; + qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } - wqe->psn = qp->s_next_psn; newreq = 1; + qp->s_psn = wqe->psn; } /* * Note that we have to be careful not to modify the @@ -365,21 +492,19 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: /* If no credit, return. */ - if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) && + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { - qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT; + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } - wqe->lpsn = wqe->psn; if (len > pmtu) { - wqe->lpsn += (len - 1) / pmtu; qp->s_state = OP(SEND_FIRST); len = pmtu; break; } - if (wqe->wr.opcode == IB_WR_SEND) + if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_ONLY); - else { + } else { qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; @@ -393,14 +518,14 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) break; case IB_WR_RDMA_WRITE: - if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; /* FALLTHROUGH */ case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ - if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) && + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { - qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT; + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; goto bail; } ohdr->u.rc.reth.vaddr = @@ -409,16 +534,14 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) cpu_to_be32(wqe->rdma_wr.rkey); ohdr->u.rc.reth.length = cpu_to_be32(len); hwords += sizeof(struct ib_reth) / sizeof(u32); - wqe->lpsn = wqe->psn; if (len > pmtu) { - wqe->lpsn += (len - 1) / pmtu; qp->s_state = OP(RDMA_WRITE_FIRST); len = pmtu; break; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { qp->s_state = OP(RDMA_WRITE_ONLY); - else { + } else { qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); /* Immediate data comes after RETH */ @@ -440,19 +563,12 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) if (newreq) { if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { - qp->s_flags |= HFI1_S_WAIT_RDMAR; + qp->s_flags |= RVT_S_WAIT_RDMAR; goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - /* - * Adjust s_next_psn to count the - * expected number of responses. - */ - if (len > pmtu) - qp->s_next_psn += (len - 1) / pmtu; - wqe->lpsn = qp->s_next_psn++; } ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr); @@ -477,13 +593,12 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) if (newreq) { if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { - qp->s_flags |= HFI1_S_WAIT_RDMAR; + qp->s_flags |= RVT_S_WAIT_RDMAR; goto bail; } qp->s_num_rd_atomic++; - if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) qp->s_lsn++; - wqe->lpsn = wqe->psn; } if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { qp->s_state = OP(COMPARE_SWAP); @@ -526,11 +641,8 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) } if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_psn = wqe->lpsn + 1; - else { + else qp->s_psn++; - if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; - } break; case OP(RDMA_READ_RESPONSE_FIRST): @@ -550,8 +662,6 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) /* FALLTHROUGH */ case OP(SEND_MIDDLE): bth2 = mask_psn(qp->s_psn++); - if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { @@ -559,9 +669,9 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) middle = HFI1_CAP_IS_KSET(SDMA_AHG); break; } - if (wqe->wr.opcode == IB_WR_SEND) + if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_LAST); - else { + } else { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; @@ -592,8 +702,6 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): bth2 = mask_psn(qp->s_psn++); - if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { @@ -601,9 +709,9 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) middle = HFI1_CAP_IS_KSET(SDMA_AHG); break; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { qp->s_state = OP(RDMA_WRITE_LAST); - else { + } else { qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; @@ -648,13 +756,14 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) delta = delta_psn(bth2, wqe->psn); if (delta && delta % HFI1_PSN_CREDIT == 0) bth2 |= IB_BTH_REQ_ACK; - if (qp->s_flags & HFI1_S_SEND_ONE) { - qp->s_flags &= ~HFI1_S_SEND_ONE; - qp->s_flags |= HFI1_S_WAIT_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; bth2 |= IB_BTH_REQ_ACK; } qp->s_len -= len; qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; qp->s_cur_sge = ss; qp->s_cur_size = len; hfi1_make_ruc_header( @@ -662,16 +771,25 @@ int hfi1_make_rc_req(struct hfi1_qp *qp) ohdr, bth0 | (qp->s_state << 24), bth2, - middle); -done: - ret = 1; - goto unlock; + middle, + ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; bail: - qp->s_flags &= ~HFI1_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); - return ret; + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; } /** @@ -682,7 +800,7 @@ unlock: * Note that RDMA reads and atomics are handled in the * send side QP state and tasklet. */ -void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp, +void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, int is_fecn) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); @@ -700,7 +818,7 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp, unsigned long flags; /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ - if (qp->s_flags & HFI1_S_RESP_PENDING) + if (qp->s_flags & RVT_S_RESP_PENDING) goto queue_ack; /* Ensure s_rdma_ack_cnt changes are committed */ @@ -763,7 +881,7 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp, goto queue_ack; } - trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr); + trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr); /* write the pbc and data */ ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords); @@ -771,13 +889,13 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp, return; queue_ack: - this_cpu_inc(*ibp->rc_qacks); + this_cpu_inc(*ibp->rvp.rc_qacks); spin_lock_irqsave(&qp->s_lock, flags); - qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING; + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; qp->s_nak_state = qp->r_nak_state; qp->s_ack_psn = qp->r_ack_psn; if (is_fecn) - qp->s_flags |= HFI1_S_ECN; + qp->s_flags |= RVT_S_ECN; /* Schedule the send tasklet. */ hfi1_schedule_send(qp); @@ -793,10 +911,10 @@ queue_ack: * for the given QP. * Called at interrupt level with the QP s_lock held. */ -static void reset_psn(struct hfi1_qp *qp, u32 psn) +static void reset_psn(struct rvt_qp *qp, u32 psn) { u32 n = qp->s_acked; - struct hfi1_swqe *wqe = get_swqe_ptr(qp, n); + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); u32 opcode; qp->s_cur = n; @@ -819,7 +937,7 @@ static void reset_psn(struct hfi1_qp *qp, u32 psn) n = 0; if (n == qp->s_tail) break; - wqe = get_swqe_ptr(qp, n); + wqe = rvt_get_swqe_ptr(qp, n); diff = cmp_psn(psn, wqe->psn); if (diff < 0) break; @@ -865,23 +983,23 @@ static void reset_psn(struct hfi1_qp *qp, u32 psn) done: qp->s_psn = psn; /* - * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer + * Set RVT_S_WAIT_PSN as rc_complete() may start the timer * asynchronously before the send tasklet can get scheduled. * Doing it in hfi1_make_rc_req() is too late. */ if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) - qp->s_flags |= HFI1_S_WAIT_PSN; - qp->s_flags &= ~HFI1_S_AHG_VALID; + qp->s_flags |= RVT_S_WAIT_PSN; + qp->s_flags &= ~RVT_S_AHG_VALID; } /* * Back up requester to resend the last un-ACKed request. * The QP r_lock and s_lock should be held and interrupts disabled. */ -static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait) +static void restart_rc(struct rvt_qp *qp, u32 psn, int wait) { - struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked); + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct hfi1_ibport *ibp; if (qp->s_retry == 0) { @@ -890,42 +1008,44 @@ static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait) qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; - } else /* need to handle delayed completion */ + } else { /* need to handle delayed completion */ return; - } else + } + } else { qp->s_retry--; + } ibp = to_iport(qp->ibqp.device, qp->port_num); if (wqe->wr.opcode == IB_WR_RDMA_READ) - ibp->n_rc_resends++; + ibp->rvp.n_rc_resends++; else - ibp->n_rc_resends += delta_psn(qp->s_psn, psn); + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | - HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN | - HFI1_S_WAIT_ACK); + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK); if (wait) - qp->s_flags |= HFI1_S_SEND_ONE; + qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); } /* * This is called from s_timer for missing responses. */ -static void rc_timeout(unsigned long arg) +void hfi1_rc_timeout(unsigned long arg) { - struct hfi1_qp *qp = (struct hfi1_qp *)arg; + struct rvt_qp *qp = (struct rvt_qp *)arg; struct hfi1_ibport *ibp; unsigned long flags; spin_lock_irqsave(&qp->r_lock, flags); spin_lock(&qp->s_lock); - if (qp->s_flags & HFI1_S_TIMER) { + if (qp->s_flags & RVT_S_TIMER) { ibp = to_iport(qp->ibqp.device, qp->port_num); - ibp->n_rc_timeouts++; - qp->s_flags &= ~HFI1_S_TIMER; + ibp->rvp.n_rc_timeouts++; + qp->s_flags &= ~RVT_S_TIMER; del_timer(&qp->s_timer); trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1); restart_rc(qp, qp->s_last_psn + 1, 1); @@ -940,15 +1060,12 @@ static void rc_timeout(unsigned long arg) */ void hfi1_rc_rnr_retry(unsigned long arg) { - struct hfi1_qp *qp = (struct hfi1_qp *)arg; + struct rvt_qp *qp = (struct rvt_qp *)arg; unsigned long flags; spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_flags & HFI1_S_WAIT_RNR) { - qp->s_flags &= ~HFI1_S_WAIT_RNR; - del_timer(&qp->s_timer); - hfi1_schedule_send(qp); - } + hfi1_stop_rnr_timer(qp); + hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); } @@ -956,14 +1073,14 @@ void hfi1_rc_rnr_retry(unsigned long arg) * Set qp->s_sending_psn to the next PSN after the given one. * This would be psn+1 except when RDMA reads are present. */ -static void reset_sending_psn(struct hfi1_qp *qp, u32 psn) +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { - struct hfi1_swqe *wqe; + struct rvt_swqe *wqe; u32 n = qp->s_last; /* Find the work request corresponding to the given PSN. */ for (;;) { - wqe = get_swqe_ptr(qp, n); + wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ) qp->s_sending_psn = wqe->lpsn + 1; @@ -981,16 +1098,16 @@ static void reset_sending_psn(struct hfi1_qp *qp, u32 psn) /* * This should be called with the QP s_lock held and interrupts disabled. */ -void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr) +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr) { struct hfi1_other_headers *ohdr; - struct hfi1_swqe *wqe; + struct rvt_swqe *wqe; struct ib_wc wc; unsigned i; u32 opcode; u32 psn; - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; /* Find out where the BTH is */ @@ -1016,22 +1133,30 @@ void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr) */ if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && !(qp->s_flags & - (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) && - (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) - start_timer(qp); + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + hfi1_add_retry_timer(qp); while (qp->s_last != qp->s_acked) { - wqe = get_swqe_ptr(qp, qp->s_last); + u32 s_last; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); for (i = 0; i < wqe->wr.num_sge; i++) { - struct hfi1_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); } /* Post a send completion queue entry if requested. */ - if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; @@ -1039,26 +1164,24 @@ void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr) wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; wc.byte_len = wqe->length; wc.qp = &qp->ibqp; - hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; } /* * If we were waiting for sends to complete before re-sending, * and they are now complete, restart sending. */ trace_hfi1_rc_sendcomplete(qp, psn); - if (qp->s_flags & HFI1_S_WAIT_PSN && + if (qp->s_flags & RVT_S_WAIT_PSN && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - qp->s_flags &= ~HFI1_S_WAIT_PSN; + qp->s_flags &= ~RVT_S_WAIT_PSN; qp->s_sending_psn = qp->s_psn; qp->s_sending_hpsn = qp->s_psn - 1; hfi1_schedule_send(qp); } } -static inline void update_last_psn(struct hfi1_qp *qp, u32 psn) +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) { qp->s_last_psn = psn; } @@ -1068,9 +1191,9 @@ static inline void update_last_psn(struct hfi1_qp *qp, u32 psn) * This is similar to hfi1_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, - struct hfi1_swqe *wqe, - struct hfi1_ibport *ibp) +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) { struct ib_wc wc; unsigned i; @@ -1082,13 +1205,21 @@ static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, */ if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + u32 s_last; + for (i = 0; i < wqe->wr.num_sge; i++) { - struct hfi1_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); } + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); /* Post a send completion queue entry if requested. */ - if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; @@ -1096,14 +1227,12 @@ static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; wc.byte_len = wqe->length; wc.qp = &qp->ibqp; - hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; } else { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - this_cpu_inc(*ibp->rc_delayed_comp); + this_cpu_inc(*ibp->rvp.rc_delayed_comp); /* * If send progress not running attempt to progress * SDMA queue. @@ -1131,7 +1260,7 @@ static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, if (++qp->s_cur >= qp->s_size) qp->s_cur = 0; qp->s_acked = qp->s_cur; - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); if (qp->s_acked != qp->s_tail) { qp->s_state = OP(SEND_LAST); qp->s_psn = wqe->psn; @@ -1141,7 +1270,7 @@ static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, qp->s_acked = 0; if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) qp->s_draining = 0; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } return wqe; } @@ -1157,21 +1286,16 @@ static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, +static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; - struct hfi1_swqe *wqe; + struct rvt_swqe *wqe; int ret = 0; u32 ack_psn; int diff; - - /* Remove QP from retry timer */ - if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { - qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); - del_timer(&qp->s_timer); - } + unsigned long to; /* * Note that NAKs implicitly ACK outstanding SEND and RDMA write @@ -1182,7 +1306,7 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, ack_psn = psn; if (aeth >> 29) ack_psn--; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); ibp = to_iport(qp->ibqp.device, qp->port_num); /* @@ -1200,7 +1324,7 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, opcode == OP(RDMA_READ_RESPONSE_ONLY) && diff == 0) { ret = 1; - goto bail; + goto bail_stop; } /* * If this request is a RDMA read or atomic, and the ACK is @@ -1217,11 +1341,11 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { /* Retry this request. */ - if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) { - qp->r_flags |= HFI1_R_RDMAR_SEQ; + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; restart_rc(qp, qp->s_last_psn + 1, 0); if (list_empty(&qp->rspwait)) { - qp->r_flags |= HFI1_R_RSP_SEND; + qp->r_flags |= RVT_R_RSP_SEND; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); @@ -1231,7 +1355,7 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, * No need to process the ACK/NAK since we are * restarting an earlier request. */ - goto bail; + goto bail_stop; } if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { @@ -1244,14 +1368,14 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { qp->s_num_rd_atomic--; /* Restart sending task if fence is complete */ - if ((qp->s_flags & HFI1_S_WAIT_FENCE) && + if ((qp->s_flags & RVT_S_WAIT_FENCE) && !qp->s_num_rd_atomic) { - qp->s_flags &= ~(HFI1_S_WAIT_FENCE | - HFI1_S_WAIT_ACK); + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); hfi1_schedule_send(qp); - } else if (qp->s_flags & HFI1_S_WAIT_RDMAR) { - qp->s_flags &= ~(HFI1_S_WAIT_RDMAR | - HFI1_S_WAIT_ACK); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); hfi1_schedule_send(qp); } } @@ -1262,40 +1386,43 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, switch (aeth >> 29) { case 0: /* ACK */ - this_cpu_inc(*ibp->rc_acks); + this_cpu_inc(*ibp->rvp.rc_acks); if (qp->s_acked != qp->s_tail) { /* * We are expecting more ACKs so - * reset the re-transmit timer. + * mod the retry timer. */ - start_timer(qp); + hfi1_mod_retry_timer(qp); /* * We can stop re-sending the earlier packets and * continue with the next packet the receiver wants. */ if (cmp_psn(qp->s_psn, psn) <= 0) reset_psn(qp, psn + 1); - } else if (cmp_psn(qp->s_psn, psn) <= 0) { - qp->s_state = OP(SEND_LAST); - qp->s_psn = psn + 1; + } else { + /* No more acks - kill all timers */ + hfi1_stop_rc_timers(qp); + if (cmp_psn(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } } - if (qp->s_flags & HFI1_S_WAIT_ACK) { - qp->s_flags &= ~HFI1_S_WAIT_ACK; + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; hfi1_schedule_send(qp); } hfi1_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; update_last_psn(qp, psn); - ret = 1; - goto bail; + return 1; case 1: /* RNR NAK */ - ibp->n_rnr_naks++; + ibp->rvp.n_rnr_naks++; if (qp->s_acked == qp->s_tail) - goto bail; - if (qp->s_flags & HFI1_S_WAIT_RNR) - goto bail; + goto bail_stop; + if (qp->s_flags & RVT_S_WAIT_RNR) + goto bail_stop; if (qp->s_rnr_retry == 0) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; @@ -1306,28 +1433,27 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, /* The last valid PSN is the previous PSN. */ update_last_psn(qp, psn - 1); - ibp->n_rc_resends += delta_psn(qp->s_psn, psn); + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); reset_psn(qp, psn); - qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK); - qp->s_flags |= HFI1_S_WAIT_RNR; - qp->s_timer.function = hfi1_rc_rnr_retry; - qp->s_timer.expires = jiffies + usecs_to_jiffies( + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + hfi1_stop_rc_timers(qp); + to = ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) & - HFI1_AETH_CREDIT_MASK]); - add_timer(&qp->s_timer); - goto bail; + HFI1_AETH_CREDIT_MASK]; + hfi1_add_rnr_timer(qp, to); + return 0; case 3: /* NAK */ if (qp->s_acked == qp->s_tail) - goto bail; + goto bail_stop; /* The last valid PSN is the previous PSN. */ update_last_psn(qp, psn - 1); switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ - ibp->n_seq_naks++; + ibp->rvp.n_seq_naks++; /* * Back up to the responder's expected PSN. * Note that we might get a NAK in the middle of an @@ -1340,21 +1466,21 @@ static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, case 1: /* Invalid Request */ status = IB_WC_REM_INV_REQ_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; goto class_b; case 2: /* Remote Access Error */ status = IB_WC_REM_ACCESS_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; goto class_b; case 3: /* Remote Operation Error */ status = IB_WC_REM_OP_ERR; - ibp->n_other_naks++; + ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { hfi1_send_complete(qp, wqe, status); - hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1364,15 +1490,16 @@ class_b: } qp->s_retry = qp->s_retry_cnt; qp->s_rnr_retry = qp->s_rnr_retry_cnt; - goto bail; + goto bail_stop; default: /* 2: reserved */ reserved: /* Ignore reserved NAK codes. */ - goto bail; + goto bail_stop; } - -bail: + return ret; +bail_stop: + hfi1_stop_rc_timers(qp); return ret; } @@ -1380,18 +1507,15 @@ bail: * We have seen an out of sequence RDMA read middle or last packet. * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. */ -static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn, +static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, struct hfi1_ctxtdata *rcd) { - struct hfi1_swqe *wqe; + struct rvt_swqe *wqe; /* Remove QP from retry timer */ - if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { - qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); - del_timer(&qp->s_timer); - } + hfi1_stop_rc_timers(qp); - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || @@ -1401,11 +1525,11 @@ static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn, wqe = do_rc_completion(qp, wqe, ibp); } - ibp->n_rdma_seq++; - qp->r_flags |= HFI1_R_RDMAR_SEQ; + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; restart_rc(qp, qp->s_last_psn + 1, 0); if (list_empty(&qp->rspwait)) { - qp->r_flags |= HFI1_R_RSP_SEND; + qp->r_flags |= RVT_R_RSP_SEND; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } @@ -1429,11 +1553,11 @@ static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn, */ static void rc_rcv_resp(struct hfi1_ibport *ibp, struct hfi1_other_headers *ohdr, - void *data, u32 tlen, struct hfi1_qp *qp, + void *data, u32 tlen, struct rvt_qp *qp, u32 opcode, u32 psn, u32 hdrsize, u32 pmtu, struct hfi1_ctxtdata *rcd) { - struct hfi1_swqe *wqe; + struct rvt_swqe *wqe; enum ib_wc_status status; unsigned long flags; int diff; @@ -1446,7 +1570,8 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, trace_hfi1_rc_ack(qp, psn); /* Ignore invalid responses. */ - if (cmp_psn(psn, qp->s_next_psn) >= 0) + smp_read_barrier_depends(); /* see post_one_send */ + if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0) goto ack_done; /* Ignore duplicate responses. */ @@ -1465,15 +1590,15 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, * Skip everything other than the PSN we expect, if we are waiting * for a reply to a restarted RDMA read or atomic op. */ - if (qp->r_flags & HFI1_R_RDMAR_SEQ) { + if (qp->r_flags & RVT_R_RDMAR_SEQ) { if (cmp_psn(psn, qp->s_last_psn + 1) != 0) goto ack_done; - qp->r_flags &= ~HFI1_R_RDMAR_SEQ; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; } if (unlikely(qp->s_acked == qp->s_tail)) goto ack_done; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); status = IB_WC_SUCCESS; switch (opcode) { @@ -1484,14 +1609,15 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp, if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { __be32 *p = ohdr->u.at.atomic_ack_eth; - val = ((u64) be32_to_cpu(p[0]) << 32) | + val = ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]); - } else + } else { val = 0; + } if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || opcode != OP(RDMA_READ_RESPONSE_FIRST)) goto ack_done; - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) goto ack_op_err; /* @@ -1519,10 +1645,10 @@ read_middle: * We got a response so update the timeout. * 4.096 usec. * (1 << qp->timeout) */ - qp->s_flags |= HFI1_S_TIMER; + qp->s_flags |= RVT_S_TIMER; mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); - if (qp->s_flags & HFI1_S_WAIT_ACK) { - qp->s_flags &= ~HFI1_S_WAIT_ACK; + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; hfi1_schedule_send(qp); } @@ -1536,7 +1662,7 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1556,7 +1682,7 @@ read_middle: * have to be careful to copy the data to the right * location. */ - wqe = get_swqe_ptr(qp, qp->s_acked); + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, wqe, psn, pmtu); goto read_last; @@ -1580,9 +1706,9 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0); WARN_ON(qp->s_rdma_read_sge.num_sge); - (void) do_rc_ack(qp, aeth, psn, + (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); goto ack_done; } @@ -1600,7 +1726,7 @@ ack_len_err: ack_err: if (qp->s_last == qp->s_acked) { hfi1_send_complete(qp, wqe, status); - hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1609,22 +1735,24 @@ bail: } static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, - struct hfi1_qp *qp) + struct rvt_qp *qp) { if (list_empty(&qp->rspwait)) { - qp->r_flags |= HFI1_R_RSP_DEFERED_ACK; + qp->r_flags |= RVT_R_RSP_NAK; atomic_inc(&qp->refcount); list_add_tail(&qp->rspwait, &rcd->qp_wait_list); } } -static inline void rc_cancel_ack(struct hfi1_qp *qp) +static inline void rc_cancel_ack(struct rvt_qp *qp) { - qp->r_adefered = 0; + struct hfi1_qp_priv *priv = qp->priv; + + priv->r_adefered = 0; if (list_empty(&qp->rspwait)) return; list_del_init(&qp->rspwait); - qp->r_flags &= ~HFI1_R_RSP_DEFERED_ACK; + qp->r_flags &= ~RVT_R_RSP_NAK; if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } @@ -1645,11 +1773,11 @@ static inline void rc_cancel_ack(struct hfi1_qp *qp) * schedule a response to be sent. */ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, - struct hfi1_qp *qp, u32 opcode, u32 psn, int diff, - struct hfi1_ctxtdata *rcd) + struct rvt_qp *qp, u32 opcode, u32 psn, + int diff, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct hfi1_ack_entry *e; + struct rvt_ack_entry *e; unsigned long flags; u8 i, prev; int old_req; @@ -1662,7 +1790,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, * Don't queue the NAK if we already sent one. */ if (!qp->r_nak_state) { - ibp->n_rc_seqnak++; + ibp->rvp.n_rc_seqnak++; qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ qp->r_ack_psn = qp->r_psn; @@ -1694,7 +1822,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, */ e = NULL; old_req = 1; - ibp->n_rc_dupreq++; + ibp->rvp.n_rc_dupreq++; spin_lock_irqsave(&qp->s_lock, flags); @@ -1747,7 +1875,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, if (unlikely(offset + len != e->rdma_sge.sge_length)) goto unlock_done; if (e->rdma_sge.mr) { - hfi1_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } if (len != 0) { @@ -1755,8 +1883,8 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, u64 vaddr = be64_to_cpu(reth->vaddr); int ok; - ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, - IB_ACCESS_REMOTE_READ); + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) goto unlock_done; } else { @@ -1778,7 +1906,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, * or the send tasklet is already backed up to send an * earlier entry, we can ignore this request. */ - if (!e || e->opcode != (u8) opcode || old_req) + if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; qp->s_tail_ack_queue = prev; break; @@ -1810,7 +1938,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, break; } qp->s_ack_state = OP(ACKNOWLEDGE); - qp->s_flags |= HFI1_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; qp->r_nak_state = 0; hfi1_schedule_send(qp); @@ -1823,13 +1951,13 @@ send_ack: return 0; } -void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err) +void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err) { unsigned long flags; int lastwqe; spin_lock_irqsave(&qp->s_lock, flags); - lastwqe = hfi1_error_qp(qp, err); + lastwqe = rvt_error_qp(qp, err); spin_unlock_irqrestore(&qp->s_lock, flags); if (lastwqe) { @@ -1842,7 +1970,7 @@ void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err) } } -static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n) +static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) { unsigned next; @@ -1864,14 +1992,14 @@ static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, spin_lock_irqsave(&ppd->cc_log_lock, flags); - ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8); + ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); ppd->threshold_event_counter++; cc_event = &ppd->cc_events[ppd->cc_log_idx++]; if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) ppd->cc_log_idx = 0; - cc_event->lqpn = lqpn & HFI1_QPN_MASK; - cc_event->rqpn = rqpn & HFI1_QPN_MASK; + cc_event->lqpn = lqpn & RVT_QPN_MASK; + cc_event->rqpn = rqpn & RVT_QPN_MASK; cc_event->sl = sl; cc_event->svc_type = svc_type; cc_event->rlid = rlid; @@ -1897,7 +2025,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, cc_state = get_cc_state(ppd); - if (cc_state == NULL) + if (!cc_state) return; /* @@ -1957,7 +2085,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) u32 rcv_flags = packet->rcv_flags; void *data = packet->ebuf; u32 tlen = packet->tlen; - struct hfi1_qp *qp = packet->qp; + struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_other_headers *ohdr = packet->ohdr; @@ -1972,6 +2100,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) unsigned long flags; u32 bth1; int ret, is_fecn = 0; + int copy_last = 0; bth0 = be32_to_cpu(ohdr->bth[0]); if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) @@ -2054,13 +2183,13 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) break; } - if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST)) + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) qp_comm_est(qp); /* OK, process the packet. */ switch (opcode) { case OP(SEND_FIRST): - ret = hfi1_get_rwqe(qp, 0); + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto nack_op_err; if (!ret) @@ -2076,12 +2205,12 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, 1); + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): /* consume RWQE */ - ret = hfi1_get_rwqe(qp, 1); + ret = hfi1_rvt_get_rwqe(qp, 1); if (ret < 0) goto nack_op_err; if (!ret) @@ -2090,7 +2219,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): - ret = hfi1_get_rwqe(qp, 0); + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto nack_op_err; if (!ret) @@ -2104,8 +2233,10 @@ send_last_imm: wc.ex.imm_data = ohdr->u.imm_data; wc.wc_flags = IB_WC_WITH_IMM; goto send_last; - case OP(SEND_LAST): case OP(RDMA_WRITE_LAST): + copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; + /* fall through */ + case OP(SEND_LAST): no_immediate_data: wc.wc_flags = 0; wc.ex.imm_data = 0; @@ -2121,10 +2252,10 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, 1); - hfi1_put_ss(&qp->r_sge); + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last); + rvt_put_ss(&qp->r_sge); qp->r_msn++; - if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) break; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -2154,12 +2285,14 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - (bth0 & IB_BTH_SOLICITED) != 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (bth0 & IB_BTH_SOLICITED) != 0); break; - case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_ONLY): + copy_last = 1; + /* fall through */ + case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) goto nack_inv; @@ -2174,8 +2307,8 @@ send_last: int ok; /* Check rkey & NAK */ - ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, - rkey, IB_ACCESS_REMOTE_WRITE); + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) goto nack_acc; qp->r_sge.num_sge = 1; @@ -2190,7 +2323,7 @@ send_last: goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) goto no_immediate_data; - ret = hfi1_get_rwqe(qp, 1); + ret = hfi1_rvt_get_rwqe(qp, 1); if (ret < 0) goto nack_op_err; if (!ret) @@ -2200,7 +2333,7 @@ send_last: goto send_last; case OP(RDMA_READ_REQUEST): { - struct hfi1_ack_entry *e; + struct rvt_ack_entry *e; u32 len; u8 next; @@ -2218,7 +2351,7 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - hfi1_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } reth = &ohdr->u.rc.reth; @@ -2229,8 +2362,8 @@ send_last: int ok; /* Check rkey & NAK */ - ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, - rkey, IB_ACCESS_REMOTE_READ); + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) goto nack_acc_unlck; /* @@ -2261,7 +2394,7 @@ send_last: qp->r_head_ack_queue = next; /* Schedule the send tasklet. */ - qp->s_flags |= HFI1_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); @@ -2273,7 +2406,7 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { struct ib_atomic_eth *ateth; - struct hfi1_ack_entry *e; + struct rvt_ack_entry *e; u64 vaddr; atomic64_t *maddr; u64 sdata; @@ -2293,29 +2426,29 @@ send_last: } e = &qp->s_ack_queue[qp->r_head_ack_queue]; if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { - hfi1_put_mr(e->rdma_sge.mr); + rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } ateth = &ohdr->u.atomic_eth; - vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) | be32_to_cpu(ateth->vaddr[1]); if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); /* Check rkey & NAK */ - if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - vaddr, rkey, - IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) goto nack_acc_unlck; /* Perform atomic OP and save result. */ - maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; sdata = be64_to_cpu(ateth->swap_data); e->atomic_data = (opcode == OP(FETCH_ADD)) ? - (u64) atomic64_add_return(sdata, maddr) - sdata : - (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, be64_to_cpu(ateth->compare_data), sdata); - hfi1_put_mr(qp->r_sge.sge.mr); + rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; e->opcode = opcode; e->sent = 0; @@ -2328,7 +2461,7 @@ send_last: qp->r_head_ack_queue = next; /* Schedule the send tasklet. */ - qp->s_flags |= HFI1_S_RESP_PENDING; + qp->s_flags |= RVT_S_RESP_PENDING; hfi1_schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); @@ -2347,11 +2480,13 @@ send_last: qp->r_nak_state = 0; /* Send an ACK if requested or required. */ if (psn & IB_BTH_REQ_ACK) { + struct hfi1_qp_priv *priv = qp->priv; + if (packet->numpkt == 0) { rc_cancel_ack(qp); goto send_ack; } - if (qp->r_adefered >= HFI1_PSN_CREDIT) { + if (priv->r_adefered >= HFI1_PSN_CREDIT) { rc_cancel_ack(qp); goto send_ack; } @@ -2359,13 +2494,13 @@ send_last: rc_cancel_ack(qp); goto send_ack; } - qp->r_adefered++; + priv->r_adefered++; rc_defered_ack(rcd, qp); } return; rnr_nak: - qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; qp->r_ack_psn = qp->r_psn; /* Queue RNR NAK for later */ rc_defered_ack(rcd, qp); @@ -2403,7 +2538,7 @@ void hfi1_rc_hdrerr( struct hfi1_ctxtdata *rcd, struct hfi1_ib_header *hdr, u32 rcv_flags, - struct hfi1_qp *qp) + struct rvt_qp *qp) { int has_grh = rcv_flags & HFI1_HAS_GRH; struct hfi1_other_headers *ohdr; @@ -2428,7 +2563,7 @@ void hfi1_rc_hdrerr( if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { diff = delta_psn(psn, qp->r_psn); if (!qp->r_nak_state && diff >= 0) { - ibp->n_rc_seqnak++; + ibp->rvp.n_rc_seqnak++; qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ qp->r_ack_psn = qp->r_psn; diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c index 4a91975b68d7..08813cdbd475 100644 --- a/drivers/staging/rdma/hfi1/ruc.c +++ b/drivers/staging/rdma/hfi1/ruc.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -53,7 +50,8 @@ #include "hfi.h" #include "mad.h" #include "qp.h" -#include "sdma.h" +#include "verbs_txreq.h" +#include "trace.h" /* * Convert the AETH RNR timeout code into the number of microseconds. @@ -97,16 +95,16 @@ const u32 ib_hfi1_rnr_table[32] = { * Validate a RWQE and fill in the SGE state. * Return 1 if OK. */ -static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe) +static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) { int i, j, ret; struct ib_wc wc; - struct hfi1_lkey_table *rkt; - struct hfi1_pd *pd; - struct hfi1_sge_state *ss; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; - rkt = &to_idev(qp->ibqp.device)->lk_table; - pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); ss = &qp->r_sge; ss->sg_list = qp->r_sg_list; qp->r_len = 0; @@ -114,8 +112,8 @@ static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe) if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; @@ -127,9 +125,9 @@ static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe) bad_lkey: while (j) { - struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); } ss->num_sge = 0; memset(&wc, 0, sizeof(wc)); @@ -138,14 +136,14 @@ bad_lkey: wc.opcode = IB_WC_RECV; wc.qp = &qp->ibqp; /* Signal solicited completion event. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); ret = 0; bail: return ret; } /** - * hfi1_get_rwqe - copy the next RWQE into the QP's RWQE + * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP * @wr_id_only: update qp->r_wr_id only, not qp->r_sge * @@ -154,19 +152,19 @@ bail: * * Can be called from interrupt level. */ -int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only) +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only) { unsigned long flags; - struct hfi1_rq *rq; - struct hfi1_rwq *wq; - struct hfi1_srq *srq; - struct hfi1_rwqe *wqe; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; int ret; if (qp->ibqp.srq) { - srq = to_isrq(qp->ibqp.srq); + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = &srq->rq; } else { @@ -176,7 +174,7 @@ int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only) } spin_lock_irqsave(&rq->lock, flags); - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { ret = 0; goto unlock; } @@ -192,7 +190,7 @@ int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only) } /* Make sure entry is read after head index is read. */ smp_rmb(); - wqe = get_rwqe_ptr(rq, tail); + wqe = rvt_get_rwqe_ptr(rq, tail); /* * Even though we update the tail index in memory, the verbs * consumer is not supposed to post more entries until a @@ -208,7 +206,7 @@ int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only) qp->r_wr_id = wqe->wr_id; ret = 1; - set_bit(HFI1_R_WRID_VALID, &qp->r_aflags); + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); if (handler) { u32 n; @@ -265,7 +263,7 @@ static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) * The s_lock will be acquired around the hfi1_migrate_qp() call. */ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, - int has_grh, struct hfi1_qp *qp, u32 bth0) + int has_grh, struct rvt_qp *qp, u32 bth0) { __be64 guid; unsigned long flags; @@ -279,11 +277,13 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) goto err; guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + guid)) goto err; - if (!gid_ok(&hdr->u.l.grh.sgid, - qp->alt_ah_attr.grh.dgid.global.subnet_prefix, - qp->alt_ah_attr.grh.dgid.global.interface_id)) + if (!gid_ok( + &hdr->u.l.grh.sgid, + qp->alt_ah_attr.grh.dgid.global.subnet_prefix, + qp->alt_ah_attr.grh.dgid.global.interface_id)) goto err; } if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, @@ -312,11 +312,13 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, goto err; guid = get_sguid(ibp, qp->remote_ah_attr.grh.sgid_index); - if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + guid)) goto err; - if (!gid_ok(&hdr->u.l.grh.sgid, - qp->remote_ah_attr.grh.dgid.global.subnet_prefix, - qp->remote_ah_attr.grh.dgid.global.interface_id)) + if (!gid_ok( + &hdr->u.l.grh.sgid, + qp->remote_ah_attr.grh.dgid.global.subnet_prefix, + qp->remote_ah_attr.grh.dgid.global.interface_id)) goto err; } if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, @@ -355,12 +357,12 @@ err: * receive interrupts since this is a connected protocol and all packets * will pass through here. */ -static void ruc_loopback(struct hfi1_qp *sqp) +static void ruc_loopback(struct rvt_qp *sqp) { struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct hfi1_qp *qp; - struct hfi1_swqe *wqe; - struct hfi1_sge *sge; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; unsigned long flags; struct ib_wc wc; u64 sdata; @@ -368,6 +370,8 @@ static void ruc_loopback(struct hfi1_qp *sqp) enum ib_wc_status send_status; int release; int ret; + int copy_last = 0; + u32 to; rcu_read_lock(); @@ -375,25 +379,27 @@ static void ruc_loopback(struct hfi1_qp *sqp) * Note that we check the responder QP state after * checking the requester's state. */ - qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn); + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + sqp->remote_qpn); spin_lock_irqsave(&sqp->s_lock, flags); /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) || - !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) goto unlock; - sqp->s_flags |= HFI1_S_BUSY; + sqp->s_flags |= RVT_S_BUSY; again: - if (sqp->s_last == sqp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) goto clr_busy; - wqe = get_swqe_ptr(sqp, sqp->s_last); + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); /* Return if it is not OK to start a new work request. */ - if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) { - if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND)) + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) goto clr_busy; /* We are in the error state, flush the work request. */ send_status = IB_WC_WR_FLUSH_ERR; @@ -411,9 +417,9 @@ again: } spin_unlock_irqrestore(&sqp->s_lock, flags); - if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) || + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; /* * For RC, the requester would timeout and retry so * shortcut the timeouts and just signal too many retries. @@ -439,7 +445,7 @@ again: wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - ret = hfi1_get_rwqe(qp, 0); + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto op_err; if (!ret) @@ -451,21 +457,24 @@ again: goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = hfi1_get_rwqe(qp, 1); + ret = hfi1_rvt_get_rwqe(qp, 1); if (ret < 0) goto op_err; if (!ret) goto rnr_nak; - /* FALLTHROUGH */ + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; case IB_WR_RDMA_WRITE: + copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) goto inv_err; +do_write: if (wqe->length == 0) break; - if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) goto acc_err; qp->r_sge.sg_list = NULL; qp->r_sge.num_sge = 1; @@ -475,10 +484,10 @@ again: case IB_WR_RDMA_READ: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto inv_err; - if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) goto acc_err; release = 0; sqp->s_sge.sg_list = NULL; @@ -493,20 +502,20 @@ again: case IB_WR_ATOMIC_FETCH_AND_ADD: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto inv_err; - if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) goto acc_err; /* Perform atomic OP and save result. */ - maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; sdata = wqe->atomic_wr.compare_add; - *(u64 *) sqp->s_sge.sge.vaddr = + *(u64 *)sqp->s_sge.sge.vaddr = (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? - (u64) atomic64_add_return(sdata, maddr) - sdata : - (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, sdata, wqe->atomic_wr.swap); - hfi1_put_mr(qp->r_sge.sge.mr); + rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; goto send_comp; @@ -524,17 +533,17 @@ again: if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; if (sge->sge_length == 0) { if (!release) - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--sqp->s_sge.num_sge) *sge = *sqp->s_sge.sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= HFI1_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -547,9 +556,9 @@ again: sqp->s_len -= len; } if (release) - hfi1_put_ss(&qp->r_sge); + rvt_put_ss(&qp->r_sge); - if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) goto send_comp; if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) @@ -565,12 +574,12 @@ again: wc.sl = qp->remote_ah_attr.sl; wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: spin_lock_irqsave(&sqp->s_lock, flags); - ibp->n_loop_pkts++; + ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; hfi1_send_complete(sqp, wqe, send_status); @@ -580,7 +589,7 @@ rnr_nak: /* Handle RNR NAK */ if (qp->ibqp.qp_type == IB_QPT_UC) goto send_comp; - ibp->n_rnr_naks++; + ibp->rvp.n_rnr_naks++; /* * Note: we don't need the s_lock held since the BUSY flag * makes this single threaded. @@ -592,13 +601,10 @@ rnr_nak: if (sqp->s_rnr_retry_cnt < 7) sqp->s_rnr_retry--; spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) goto clr_busy; - sqp->s_flags |= HFI1_S_WAIT_RNR; - sqp->s_timer.function = hfi1_rc_rnr_retry; - sqp->s_timer.expires = jiffies + - usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]); - add_timer(&sqp->s_timer); + to = ib_hfi1_rnr_table[qp->r_min_rnr_timer]; + hfi1_add_rnr_timer(sqp, to); goto clr_busy; op_err: @@ -622,9 +628,9 @@ serr: spin_lock_irqsave(&sqp->s_lock, flags); hfi1_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - sqp->s_flags &= ~HFI1_S_BUSY; + sqp->s_flags &= ~RVT_S_BUSY; spin_unlock_irqrestore(&sqp->s_lock, flags); if (lastwqe) { struct ib_event ev; @@ -637,7 +643,7 @@ serr: goto done; } clr_busy: - sqp->s_flags &= ~HFI1_S_BUSY; + sqp->s_flags &= ~RVT_S_BUSY; unlock: spin_unlock_irqrestore(&sqp->s_lock, flags); done: @@ -666,7 +672,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, hdr->next_hdr = IB_GRH_NEXT_HDR; hdr->hop_limit = grh->hop_limit; /* The SGID is 32-bit aligned. */ - hdr->sgid.global.subnet_prefix = ibp->gid_prefix; + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; hdr->sgid.global.interface_id = grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ? ibp->guids[grh->sgid_index - 1] : @@ -690,29 +696,31 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, * Subsequent middles use the copied entry, editing the * PSN with 1 or 2 edits. */ -static inline void build_ahg(struct hfi1_qp *qp, u32 npsn) +static inline void build_ahg(struct rvt_qp *qp, u32 npsn) { - if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR)) + struct hfi1_qp_priv *priv = qp->priv; + + if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR)) clear_ahg(qp); - if (!(qp->s_flags & HFI1_S_AHG_VALID)) { + if (!(qp->s_flags & RVT_S_AHG_VALID)) { /* first middle that needs copy */ if (qp->s_ahgidx < 0) - qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde); + qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); if (qp->s_ahgidx >= 0) { qp->s_ahgpsn = npsn; - qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; /* save to protect a change in another thread */ - qp->s_hdr->sde = qp->s_sde; - qp->s_hdr->ahgidx = qp->s_ahgidx; - qp->s_flags |= HFI1_S_AHG_VALID; + priv->s_hdr->sde = priv->s_sde; + priv->s_hdr->ahgidx = qp->s_ahgidx; + qp->s_flags |= RVT_S_AHG_VALID; } } else { /* subsequent middle after valid */ if (qp->s_ahgidx >= 0) { - qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; - qp->s_hdr->ahgidx = qp->s_ahgidx; - qp->s_hdr->ahgcount++; - qp->s_hdr->ahgdesc[0] = + priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_hdr->ahgidx = qp->s_ahgidx; + priv->s_hdr->ahgcount++; + priv->s_hdr->ahgdesc[0] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16((u16)npsn), BTH2_OFFSET, @@ -720,8 +728,8 @@ static inline void build_ahg(struct hfi1_qp *qp, u32 npsn) 16); if ((npsn & 0xffff0000) != (qp->s_ahgpsn & 0xffff0000)) { - qp->s_hdr->ahgcount++; - qp->s_hdr->ahgdesc[1] = + priv->s_hdr->ahgcount++; + priv->s_hdr->ahgdesc[1] = sdma_build_ahg_descriptor( (__force u16)cpu_to_be16( (u16)(npsn >> 16)), @@ -733,10 +741,12 @@ static inline void build_ahg(struct hfi1_qp *qp, u32 npsn) } } -void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, - u32 bth0, u32 bth2, int middle) +void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) { - struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; u16 lrh0; u32 nwords; u32 extra_bytes; @@ -747,13 +757,14 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, nwords = (qp->s_cur_size + extra_bytes) >> 2; lrh0 = HFI1_LRH_BTH; if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh, - &qp->remote_ah_attr.grh, - qp->s_hdrwords, nwords); + qp->s_hdrwords += hfi1_make_grh(ibp, + &ps->s_txreq->phdr.hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); lrh0 = HFI1_LRH_GRH; middle = 0; } - lrh0 |= (qp->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; + lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; /* * reset s_hdr/AHG fields * @@ -765,10 +776,10 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, * build_ahg() will modify as appropriate * to use the AHG feature. */ - qp->s_hdr->tx_flags = 0; - qp->s_hdr->ahgcount = 0; - qp->s_hdr->ahgidx = 0; - qp->s_hdr->sde = NULL; + priv->s_hdr->tx_flags = 0; + priv->s_hdr->ahgcount = 0; + priv->s_hdr->ahgidx = 0; + priv->s_hdr->sde = NULL; if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; else @@ -776,19 +787,19 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, if (middle) build_ahg(qp, bth2); else - qp->s_flags &= ~HFI1_S_AHG_VALID; - qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr->ibh.lrh[2] = + qp->s_flags &= ~RVT_S_AHG_VALID; + ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); + ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + ps->s_txreq->phdr.hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | qp->remote_ah_attr.src_path_bits); bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); bth0 |= extra_bytes << 20; ohdr->bth[0] = cpu_to_be32(bth0); bth1 = qp->remote_qpn; - if (qp->s_flags & HFI1_S_ECN) { - qp->s_flags &= ~HFI1_S_ECN; + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; /* we recently received a FECN, so return a BECN */ bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT); } @@ -799,6 +810,14 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, /* when sending, force a reschedule every one of these periods */ #define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ +void _hfi1_do_send(struct work_struct *work) +{ + struct iowait *wait = container_of(work, struct iowait, iowork); + struct rvt_qp *qp = iowait_to_qp(wait); + + hfi1_do_send(qp); +} + /** * hfi1_do_send - perform a send on a QP * @work: contains a pointer to the QP @@ -807,34 +826,45 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, * exhausted. Only allow one CPU to send a packet per QP (tasklet). * Otherwise, two threads could send packets out of order. */ -void hfi1_do_send(struct work_struct *work) +void hfi1_do_send(struct rvt_qp *qp) { - struct iowait *wait = container_of(work, struct iowait, iowork); - struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait); struct hfi1_pkt_state ps; - int (*make_req)(struct hfi1_qp *qp); + struct hfi1_qp_priv *priv = qp->priv; + int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); unsigned long flags; unsigned long timeout; + unsigned long timeout_int; + int cpu; ps.dev = to_idev(qp->ibqp.device); ps.ibp = to_iport(qp->ibqp.device, qp->port_num); ps.ppd = ppd_from_ibp(ps.ibp); - if ((qp->ibqp.qp_type == IB_QPT_RC || - qp->ibqp.qp_type == IB_QPT_UC) && - !loopback && - (qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc) - 1)) == - ps.ppd->lid) { - ruc_loopback(qp); - return; - } - - if (qp->ibqp.qp_type == IB_QPT_RC) + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc + ) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } make_req = hfi1_make_rc_req; - else if (qp->ibqp.qp_type == IB_QPT_UC) + timeout_int = (qp->timeout_jiffies); + break; + case IB_QPT_UC: + if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc + ) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } make_req = hfi1_make_uc_req; - else + timeout_int = SEND_RESCHED_TIMEOUT; + break; + default: make_req = hfi1_make_ud_req; + timeout_int = SEND_RESCHED_TIMEOUT; + } spin_lock_irqsave(&qp->s_lock, flags); @@ -844,57 +874,83 @@ void hfi1_do_send(struct work_struct *work) return; } - qp->s_flags |= HFI1_S_BUSY; + qp->s_flags |= RVT_S_BUSY; - spin_unlock_irqrestore(&qp->s_lock, flags); - - timeout = jiffies + SEND_RESCHED_TIMEOUT; + timeout = jiffies + (timeout_int) / 8; + cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(qp); do { /* Check for a constructed packet to be sent. */ if (qp->s_hdrwords != 0) { + spin_unlock_irqrestore(&qp->s_lock, flags); /* * If the packet cannot be sent now, return and * the send tasklet will be woken up later. */ if (hfi1_verbs_send(qp, &ps)) - break; + return; /* Record that s_hdr is empty. */ qp->s_hdrwords = 0; + /* allow other tasks to run */ + if (unlikely(time_after(jiffies, timeout))) { + if (workqueue_congested(cpu, + ps.ppd->hfi1_wq)) { + spin_lock_irqsave(&qp->s_lock, flags); + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, + flags); + this_cpu_inc( + *ps.ppd->dd->send_schedule); + return; + } + if (!irqs_disabled()) { + cond_resched(); + this_cpu_inc( + *ps.ppd->dd->send_schedule); + } + timeout = jiffies + (timeout_int) / 8; + } + spin_lock_irqsave(&qp->s_lock, flags); } + } while (make_req(qp, &ps)); - /* allow other tasks to run */ - if (unlikely(time_after(jiffies, timeout))) { - cond_resched(); - ps.ppd->dd->verbs_dev.n_send_schedule++; - timeout = jiffies + SEND_RESCHED_TIMEOUT; - } - } while (make_req(qp)); + spin_unlock_irqrestore(&qp->s_lock, flags); } /* * This should be called with s_lock held. */ -void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe, +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) { u32 old_last, last; unsigned i; - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + /* See post_send() */ + barrier(); for (i = 0; i < wqe->wr.num_sge; i++) { - struct hfi1_sge *sge = &wqe->sg_list[i]; + struct rvt_sge *sge = &wqe->sg_list[i]; - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); } if (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&to_iah(wqe->ud_wr.ah)->refcount); + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); /* See ch. 11.2.4.1 and 10.7.3.1 */ - if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED) || status != IB_WC_SUCCESS) { struct ib_wc wc; @@ -906,15 +962,10 @@ void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe, wc.qp = &qp->ibqp; if (status == IB_WC_SUCCESS) wc.byte_len = wqe->length; - hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, - status != IB_WC_SUCCESS); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); } - last = qp->s_last; - old_last = last; - if (++last >= qp->s_size) - last = 0; - qp->s_last = last; if (qp->s_acked == old_last) qp->s_acked = last; if (qp->s_cur == old_last) diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c index 9a15f1f32b45..abb8ebc1fcac 100644 --- a/drivers/staging/rdma/hfi1/sdma.c +++ b/drivers/staging/rdma/hfi1/sdma.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -112,10 +109,10 @@ MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt"); | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK)) /* sdma_sendctrl operations */ -#define SDMA_SENDCTRL_OP_ENABLE (1U << 0) -#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1) -#define SDMA_SENDCTRL_OP_HALT (1U << 2) -#define SDMA_SENDCTRL_OP_CLEANUP (1U << 3) +#define SDMA_SENDCTRL_OP_ENABLE BIT(0) +#define SDMA_SENDCTRL_OP_INTENABLE BIT(1) +#define SDMA_SENDCTRL_OP_HALT BIT(2) +#define SDMA_SENDCTRL_OP_CLEANUP BIT(3) /* handle long defines */ #define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ @@ -325,9 +322,9 @@ static void sdma_wait_for_packet_egress(struct sdma_engine *sde, if (lcnt++ > 500) { /* timed out - bounce the link */ dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n", - __func__, sde->this_idx, (u32)reg); + __func__, sde->this_idx, (u32)reg); queue_work(dd->pport->hfi1_wq, - &dd->pport->link_bounce_work); + &dd->pport->link_bounce_work); break; } udelay(1); @@ -361,6 +358,28 @@ static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt) write_sde_csr(sde, SD(DESC_CNT), reg); } +static inline void complete_tx(struct sdma_engine *sde, + struct sdma_txreq *tx, + int res) +{ + /* protect against complete modifying */ + struct iowait *wait = tx->wait; + callback_t complete = tx->complete; + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, tx->sn); + if (WARN_ON_ONCE(sde->head_sn != tx->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, tx->sn); + sde->head_sn++; +#endif + sdma_txclean(sde->dd, tx); + if (complete) + (*complete)(tx, res); + if (iowait_sdma_dec(wait) && wait) + iowait_drain_wakeup(wait); +} + /* * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status * @@ -395,27 +414,8 @@ static void sdma_flush(struct sdma_engine *sde) } spin_unlock_irqrestore(&sde->flushlist_lock, flags); /* flush from flush list */ - list_for_each_entry_safe(txp, txp_next, &flushlist, list) { - int drained = 0; - /* protect against complete modifying */ - struct iowait *wait = txp->wait; - - list_del_init(&txp->list); -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER - trace_hfi1_sdma_out_sn(sde, txp->sn); - if (WARN_ON_ONCE(sde->head_sn != txp->sn)) - dd_dev_err(sde->dd, "expected %llu got %llu\n", - sde->head_sn, txp->sn); - sde->head_sn++; -#endif - sdma_txclean(sde->dd, txp); - if (wait) - drained = atomic_dec_and_test(&wait->sdma_busy); - if (txp->complete) - (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained); - if (wait && drained) - iowait_drain_wakeup(wait); - } + list_for_each_entry_safe(txp, txp_next, &flushlist, list) + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); } /* @@ -455,8 +455,8 @@ static void sdma_err_halt_wait(struct work_struct *work) break; if (time_after(jiffies, timeout)) { dd_dev_err(sde->dd, - "SDMA engine %d - timeout waiting for engine to halt\n", - sde->this_idx); + "SDMA engine %d - timeout waiting for engine to halt\n", + sde->this_idx); /* * Continue anyway. This could happen if there was * an uncorrectable error in the wrong spot. @@ -472,7 +472,6 @@ static void sdma_err_halt_wait(struct work_struct *work) static void sdma_err_progress_check_schedule(struct sdma_engine *sde) { if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) { - unsigned index; struct hfi1_devdata *dd = sde->dd; @@ -531,7 +530,7 @@ static void sdma_err_progress_check(unsigned long data) static void sdma_hw_clean_up_task(unsigned long opaque) { - struct sdma_engine *sde = (struct sdma_engine *) opaque; + struct sdma_engine *sde = (struct sdma_engine *)opaque; u64 statuscsr; while (1) { @@ -577,31 +576,10 @@ static void sdma_flush_descq(struct sdma_engine *sde) head = ++sde->descq_head & sde->sdma_mask; /* if now past this txp's descs, do the callback */ if (txp && txp->next_descq_idx == head) { - int drained = 0; - /* protect against complete modifying */ - struct iowait *wait = txp->wait; - /* remove from list */ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; - if (wait) - drained = atomic_dec_and_test(&wait->sdma_busy); -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER - trace_hfi1_sdma_out_sn(sde, txp->sn); - if (WARN_ON_ONCE(sde->head_sn != txp->sn)) - dd_dev_err(sde->dd, "expected %llu got %llu\n", - sde->head_sn, txp->sn); - sde->head_sn++; -#endif - sdma_txclean(sde->dd, txp); + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); trace_hfi1_sdma_progress(sde, head, tail, txp); - if (txp->complete) - (*txp->complete)( - txp, - SDMA_TXREQ_S_ABORTED, - drained); - if (wait && drained) - iowait_drain_wakeup(wait); - /* see if there is another txp */ txp = get_txhead(sde); } progress++; @@ -612,7 +590,7 @@ static void sdma_flush_descq(struct sdma_engine *sde) static void sdma_sw_clean_up_task(unsigned long opaque) { - struct sdma_engine *sde = (struct sdma_engine *) opaque; + struct sdma_engine *sde = (struct sdma_engine *)opaque; unsigned long flags; spin_lock_irqsave(&sde->tail_lock, flags); @@ -627,7 +605,6 @@ static void sdma_sw_clean_up_task(unsigned long opaque) * descq are ours to play with. */ - /* * In the error clean up sequence, software clean must be called * before the hardware clean so we can use the hardware head in @@ -676,7 +653,7 @@ static void sdma_start_hw_clean_up(struct sdma_engine *sde) } static void sdma_set_state(struct sdma_engine *sde, - enum sdma_states next_state) + enum sdma_states next_state) { struct sdma_state *ss = &sde->state; const struct sdma_set_state_action *action = sdma_action_table; @@ -692,8 +669,8 @@ static void sdma_set_state(struct sdma_engine *sde, ss->previous_op = ss->current_op; ss->current_state = next_state; - if (ss->previous_state != sdma_state_s99_running - && next_state == sdma_state_s99_running) + if (ss->previous_state != sdma_state_s99_running && + next_state == sdma_state_s99_running) sdma_flush(sde); if (action[next_state].op_enable) @@ -890,6 +867,9 @@ int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) newmap->actual_vls = num_vls; newmap->vls = roundup_pow_of_two(num_vls); newmap->mask = (1 << ilog2(newmap->vls)) - 1; + /* initialize back-map */ + for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++) + newmap->engine_to_vl[i] = -1; for (i = 0; i < newmap->vls; i++) { /* save for wrap around */ int first_engine = engine; @@ -913,6 +893,9 @@ int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) /* wrap back to first engine */ engine = first_engine; } + /* assign back-map */ + for (j = 0; j < vl_engines[i]; j++) + newmap->engine_to_vl[first_engine + j] = i; } else { /* just re-use entry without allocating */ newmap->map[i] = newmap->map[i % num_vls]; @@ -922,7 +905,7 @@ int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) /* newmap in hand, save old map */ spin_lock_irq(&dd->sde_map_lock); oldmap = rcu_dereference_protected(dd->sdma_map, - lockdep_is_held(&dd->sde_map_lock)); + lockdep_is_held(&dd->sde_map_lock)); /* publish newmap */ rcu_assign_pointer(dd->sdma_map, newmap); @@ -983,7 +966,7 @@ static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) sde->tx_ring = NULL; } spin_lock_irq(&dd->sde_map_lock); - kfree(rcu_access_pointer(dd->sdma_map)); + sdma_map_free(rcu_access_pointer(dd->sdma_map)); RCU_INIT_POINTER(dd->sdma_map, NULL); spin_unlock_irq(&dd->sde_map_lock); synchronize_rcu(); @@ -1020,19 +1003,19 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) return 0; } if (mod_num_sdma && - /* can't exceed chip support */ - mod_num_sdma <= dd->chip_sdma_engines && - /* count must be >= vls */ - mod_num_sdma >= num_vls) + /* can't exceed chip support */ + mod_num_sdma <= dd->chip_sdma_engines && + /* count must be >= vls */ + mod_num_sdma >= num_vls) num_engines = mod_num_sdma; dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines); dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", - dd->chip_sdma_mem_size); + dd->chip_sdma_mem_size); per_sdma_credits = - dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE); + dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE); /* set up freeze waitqueue */ init_waitqueue_head(&dd->sdma_unfreeze_wq); @@ -1040,7 +1023,7 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) descq_cnt = sdma_get_descq_cnt(); dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n", - num_engines, descq_cnt); + num_engines, descq_cnt); /* alloc memory for array of send engines */ dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL); @@ -1061,18 +1044,18 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) sde->desc_avail = sdma_descq_freecnt(sde); sde->sdma_shift = ilog2(descq_cnt); sde->sdma_mask = (1 << sde->sdma_shift) - 1; - sde->descq_full_count = 0; - - /* Create a mask for all 3 chip interrupt sources */ - sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx) - | (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx) - | (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx); - /* Create a mask specifically for sdma_idle */ - sde->idle_mask = - (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx); - /* Create a mask specifically for sdma_progress */ - sde->progress_mask = - (u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx); + + /* Create a mask specifically for each interrupt source */ + sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES + + this_idx); + /* Create a combined mask to cover all 3 interrupt sources */ + sde->imask = sde->int_mask | sde->progress_mask | + sde->idle_mask; + spin_lock_init(&sde->tail_lock); seqlock_init(&sde->head_lock); spin_lock_init(&sde->senddmactrl_lock); @@ -1100,10 +1083,10 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) SDMA_DESC1_INT_REQ_FLAG; tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task, - (unsigned long)sde); + (unsigned long)sde); tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task, - (unsigned long)sde); + (unsigned long)sde); INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait); INIT_WORK(&sde->flush_worker, sdma_field_flush); @@ -1251,11 +1234,10 @@ void sdma_exit(struct hfi1_devdata *dd) for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma; ++this_idx) { - sde = &dd->per_sdma[this_idx]; if (!list_empty(&sde->dmawait)) dd_dev_err(dd, "sde %u: dmawait list not empty!\n", - sde->this_idx); + sde->this_idx); sdma_process_event(sde, sdma_event_e00_go_hw_down); del_timer_sync(&sde->err_progress_check_timer); @@ -1358,8 +1340,8 @@ retry: use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) && (dd->flags & HFI1_HAS_SDMA_TIMEOUT); hwhead = use_dmahead ? - (u16) le64_to_cpu(*sde->head_dma) : - (u16) read_sde_csr(sde, SD(HEAD)); + (u16)le64_to_cpu(*sde->head_dma) : + (u16)read_sde_csr(sde, SD(HEAD)); if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) { u16 cnt; @@ -1385,9 +1367,9 @@ retry: if (unlikely(!sane)) { dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n", - sde->this_idx, - use_dmahead ? "dma" : "kreg", - hwhead, swhead, swtail, cnt); + sde->this_idx, + use_dmahead ? "dma" : "kreg", + hwhead, swhead, swtail, cnt); if (use_dmahead) { /* try one more time, using csr */ use_dmahead = 0; @@ -1464,7 +1446,7 @@ static void sdma_make_progress(struct sdma_engine *sde, u64 status) { struct sdma_txreq *txp = NULL; int progress = 0; - u16 hwhead, swhead, swtail; + u16 hwhead, swhead; int idle_check_done = 0; hwhead = sdma_gethead(sde); @@ -1485,29 +1467,9 @@ retry: /* if now past this txp's descs, do the callback */ if (txp && txp->next_descq_idx == swhead) { - int drained = 0; - /* protect against complete modifying */ - struct iowait *wait = txp->wait; - /* remove from list */ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; - if (wait) - drained = atomic_dec_and_test(&wait->sdma_busy); -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER - trace_hfi1_sdma_out_sn(sde, txp->sn); - if (WARN_ON_ONCE(sde->head_sn != txp->sn)) - dd_dev_err(sde->dd, "expected %llu got %llu\n", - sde->head_sn, txp->sn); - sde->head_sn++; -#endif - sdma_txclean(sde->dd, txp); - if (txp->complete) - (*txp->complete)( - txp, - SDMA_TXREQ_S_OK, - drained); - if (wait && drained) - iowait_drain_wakeup(wait); + complete_tx(sde, txp, SDMA_TXREQ_S_OK); /* see if there is another txp */ txp = get_txhead(sde); } @@ -1525,6 +1487,8 @@ retry: * of sdma_make_progress(..) which is ensured by idle_check_done flag */ if ((status & sde->idle_mask) && !idle_check_done) { + u16 swtail; + swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; if (swtail != hwhead) { hwhead = (u16)read_sde_csr(sde, SD(HEAD)); @@ -1552,6 +1516,12 @@ void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) trace_hfi1_sdma_engine_interrupt(sde, status); write_seqlock(&sde->head_lock); sdma_set_desc_cnt(sde, sdma_desct_intr); + if (status & sde->idle_mask) + sde->idle_int_cnt++; + else if (status & sde->progress_mask) + sde->progress_int_cnt++; + else if (status & sde->int_mask) + sde->sdma_int_cnt++; sdma_make_progress(sde, status); write_sequnlock(&sde->head_lock); } @@ -1577,10 +1547,10 @@ void sdma_engine_error(struct sdma_engine *sde, u64 status) __sdma_process_event(sde, sdma_event_e60_hw_halted); if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) { dd_dev_err(sde->dd, - "SDMA (%u) engine error: 0x%llx state %s\n", - sde->this_idx, - (unsigned long long)status, - sdma_state_names[sde->state.current_state]); + "SDMA (%u) engine error: 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); dump_sdma_state(sde); } write_sequnlock(&sde->head_lock); @@ -1624,8 +1594,8 @@ static void sdma_sendctrl(struct sdma_engine *sde, unsigned op) if (op & SDMA_SENDCTRL_OP_CLEANUP) write_sde_csr(sde, SD(CTRL), - sde->p_senddmactrl | - SD(CTRL_SDMA_CLEANUP_SMASK)); + sde->p_senddmactrl | + SD(CTRL_SDMA_CLEANUP_SMASK)); else write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl); @@ -1649,12 +1619,10 @@ static void sdma_setlengen(struct sdma_engine *sde) * generation counter. */ write_sde_csr(sde, SD(LEN_GEN), - (sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT) - ); + (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)); write_sde_csr(sde, SD(LEN_GEN), - ((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)) - | (4ULL << SD(LEN_GEN_GENERATION_SHIFT)) - ); + ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) | + (4ULL << SD(LEN_GEN_GENERATION_SHIFT))); } static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) @@ -1714,7 +1682,6 @@ static void set_sdma_integrity(struct sdma_engine *sde) write_sde_csr(sde, SD(CHECK_ENABLE), reg); } - static void init_sdma_regs( struct sdma_engine *sde, u32 credits, @@ -1735,17 +1702,16 @@ static void init_sdma_regs( write_sde_csr(sde, SD(DESC_CNT), 0); write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys); write_sde_csr(sde, SD(MEMORY), - ((u64)credits << - SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | - ((u64)(credits * sde->this_idx) << - SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); + ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | + ((u64)(credits * sde->this_idx) << + SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull); set_sdma_integrity(sde); opmask = OPCODE_CHECK_MASK_DISABLED; opval = OPCODE_CHECK_VAL_DISABLED; write_sde_csr(sde, SD(CHECK_OPCODE), - (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | - (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); + (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | + (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); } #ifdef CONFIG_SDMA_VERBOSITY @@ -1824,12 +1790,9 @@ static void dump_sdma_state(struct sdma_engine *sde) descq = sde->descq; dd_dev_err(sde->dd, - "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", - sde->this_idx, - head, - tail, - cnt, - !list_empty(&sde->flushlist)); + "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", + sde->this_idx, head, tail, cnt, + !list_empty(&sde->flushlist)); /* print info for each entry in the descriptor queue */ while (head != tail) { @@ -1850,20 +1813,23 @@ static void dump_sdma_state(struct sdma_engine *sde) len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) & SDMA_DESC0_BYTE_COUNT_MASK; dd_dev_err(sde->dd, - "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", - head, flags, addr, gen, len); + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); dd_dev_err(sde->dd, - "\tdesc0:0x%016llx desc1 0x%016llx\n", - desc[0], desc[1]); + "\tdesc0:0x%016llx desc1 0x%016llx\n", + desc[0], desc[1]); if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) dd_dev_err(sde->dd, - "\taidx: %u amode: %u alen: %u\n", - (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK) - >> SDMA_DESC1_HEADER_INDEX_SHIFT), - (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK) - >> SDMA_DESC1_HEADER_MODE_SHIFT), - (u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK) - >> SDMA_DESC1_HEADER_DWS_SHIFT)); + "\taidx: %u amode: %u alen: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_DWS_SMASK) >> + SDMA_DESC1_HEADER_DWS_SHIFT)); head++; head &= sde->sdma_mask; } @@ -1890,29 +1856,26 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) head = sde->descq_head & sde->sdma_mask; tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; seq_printf(s, SDE_FMT, sde->this_idx, - sde->cpu, - sdma_state_name(sde->state.current_state), - (unsigned long long)read_sde_csr(sde, SD(CTRL)), - (unsigned long long)read_sde_csr(sde, SD(STATUS)), - (unsigned long long)read_sde_csr(sde, - SD(ENG_ERR_STATUS)), - (unsigned long long)read_sde_csr(sde, SD(TAIL)), - tail, - (unsigned long long)read_sde_csr(sde, SD(HEAD)), - head, - (unsigned long long)le64_to_cpu(*sde->head_dma), - (unsigned long long)read_sde_csr(sde, SD(MEMORY)), - (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), - (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), - (unsigned long long)sde->last_status, - (unsigned long long)sde->ahg_bits, - sde->tx_tail, - sde->tx_head, - sde->descq_tail, - sde->descq_head, + sde->cpu, + sdma_state_name(sde->state.current_state), + (unsigned long long)read_sde_csr(sde, SD(CTRL)), + (unsigned long long)read_sde_csr(sde, SD(STATUS)), + (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)), + (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail, + (unsigned long long)read_sde_csr(sde, SD(HEAD)), head, + (unsigned long long)le64_to_cpu(*sde->head_dma), + (unsigned long long)read_sde_csr(sde, SD(MEMORY)), + (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), + (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), + (unsigned long long)sde->last_status, + (unsigned long long)sde->ahg_bits, + sde->tx_tail, + sde->tx_head, + sde->descq_tail, + sde->descq_head, !list_empty(&sde->flushlist), - sde->descq_full_count, - (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); + sde->descq_full_count, + (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); /* print info for each entry in the descriptor queue */ while (head != tail) { @@ -1933,14 +1896,16 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) & SDMA_DESC0_BYTE_COUNT_MASK; seq_printf(s, - "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", - head, flags, addr, gen, len); + "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n", - (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK) - >> SDMA_DESC1_HEADER_INDEX_SHIFT), - (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK) - >> SDMA_DESC1_HEADER_MODE_SHIFT)); + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT)); head = (head + 1) & sde->sdma_mask; } } @@ -2041,8 +2006,9 @@ static int sdma_check_progress( ret = wait->sleep(sde, wait, tx, seq); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); - } else + } else { ret = -EBUSY; + } return ret; } @@ -2080,14 +2046,14 @@ retry: goto nodesc; tail = submit_tx(sde, tx); if (wait) - atomic_inc(&wait->sdma_busy); + iowait_sdma_inc(wait); sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); return ret; unlock_noconn: if (wait) - atomic_inc(&wait->sdma_busy); + iowait_sdma_inc(wait); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2132,13 +2098,12 @@ nodesc: * side locking. * * Return: - * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring - * (wait == NULL) + * > 0 - Success (value is number of sdma_txreq's submitted), + * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ -int sdma_send_txlist(struct sdma_engine *sde, - struct iowait *wait, - struct list_head *tx_list) +int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, + struct list_head *tx_list) { struct sdma_txreq *tx, *tx_next; int ret = 0; @@ -2169,18 +2134,18 @@ retry: } update_tail: if (wait) - atomic_add(count, &wait->sdma_busy); + iowait_sdma_add(wait, count); if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); spin_unlock_irqrestore(&sde->tail_lock, flags); - return ret; + return ret == 0 ? count : ret; unlock_noconn: spin_lock(&sde->flushlist_lock); list_for_each_entry_safe(tx, tx_next, tx_list, list) { tx->wait = wait; list_del_init(&tx->list); if (wait) - atomic_inc(&wait->sdma_busy); + iowait_sdma_inc(wait); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2206,8 +2171,7 @@ nodesc: goto update_tail; } -static void sdma_process_event(struct sdma_engine *sde, - enum sdma_events event) +static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event) { unsigned long flags; @@ -2224,7 +2188,7 @@ static void sdma_process_event(struct sdma_engine *sde, } static void __sdma_process_event(struct sdma_engine *sde, - enum sdma_events event) + enum sdma_events event) { struct sdma_state *ss = &sde->state; int need_progress = 0; @@ -2247,14 +2211,15 @@ static void __sdma_process_event(struct sdma_engine *sde, * of link up, then we need to start up. * This can happen when hw down is requested while * bringing the link up with traffic active on - * 7220, e.g. */ + * 7220, e.g. + */ ss->go_s99_running = 1; /* fall through and start dma engine */ case sdma_event_e10_go_hw_start: /* This reference means the state machine is started */ sdma_get(&sde->state); sdma_set_state(sde, - sdma_state_s10_hw_start_up_halt_wait); + sdma_state_s10_hw_start_up_halt_wait); break; case sdma_event_e15_hw_halt_done: break; @@ -2292,7 +2257,7 @@ static void __sdma_process_event(struct sdma_engine *sde, break; case sdma_event_e15_hw_halt_done: sdma_set_state(sde, - sdma_state_s15_hw_start_up_clean_wait); + sdma_state_s15_hw_start_up_clean_wait); sdma_start_hw_clean_up(sde); break; case sdma_event_e25_hw_clean_up_done: @@ -2767,7 +2732,7 @@ enomem: * This function calls _extend_sdma_tx_descs to extend or allocate * coalesce buffer. If there is a allocated coalesce buffer, it will * copy the input packet data into the coalesce buffer. It also adds - * coalesce buffer descriptor once whe whole packet is received. + * coalesce buffer descriptor once when whole packet is received. * * Return: * <0 - error @@ -3030,7 +2995,8 @@ void sdma_freeze(struct hfi1_devdata *dd) * continuing. */ ret = wait_event_interruptible(dd->sdma_unfreeze_wq, - atomic_read(&dd->sdma_unfreeze_count) <= 0); + atomic_read(&dd->sdma_unfreeze_count) <= + 0); /* interrupted or count is negative, then unloading - just exit */ if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0) return; @@ -3047,7 +3013,7 @@ void sdma_freeze(struct hfi1_devdata *dd) * software clean will read engine CSRs, so must be completed before * the next step, which will clear the engine CSRs. */ - (void) wait_event_interruptible(dd->sdma_unfreeze_wq, + (void)wait_event_interruptible(dd->sdma_unfreeze_wq, atomic_read(&dd->sdma_unfreeze_count) <= 0); /* no need to check results - done no matter what */ } @@ -3067,7 +3033,7 @@ void sdma_unfreeze(struct hfi1_devdata *dd) /* tell all engines start freeze clean up */ for (i = 0; i < dd->num_sdma; i++) sdma_process_event(&dd->per_sdma[i], - sdma_event_e82_hw_unfreeze); + sdma_event_e82_hw_unfreeze); } /** @@ -3081,5 +3047,6 @@ void _sdma_engine_progress_schedule( trace_hfi1_sdma_engine_progress(sde, sde->progress_mask); /* assume we have selected a good cpu */ write_csr(sde->dd, - CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask); + CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)), + sde->progress_mask); } diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h index da89e6458162..8f50c99fe711 100644 --- a/drivers/staging/rdma/hfi1/sdma.h +++ b/drivers/staging/rdma/hfi1/sdma.h @@ -1,14 +1,13 @@ #ifndef _HFI1_SDMA_H #define _HFI1_SDMA_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -58,15 +55,13 @@ #include "hfi.h" #include "verbs.h" +#include "sdma_txreq.h" -/* increased for AHG */ -#define NUM_DESC 6 /* Hardware limit */ #define MAX_DESC 64 /* Hardware limit for SDMA packet size */ #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) - #define SDMA_TXREQ_S_OK 0 #define SDMA_TXREQ_S_SENDERROR 1 #define SDMA_TXREQ_S_ABORTED 2 @@ -109,8 +104,8 @@ /* * Bits defined in the send DMA descriptor. */ -#define SDMA_DESC0_FIRST_DESC_FLAG (1ULL << 63) -#define SDMA_DESC0_LAST_DESC_FLAG (1ULL << 62) +#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63) +#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62) #define SDMA_DESC0_BYTE_COUNT_SHIFT 48 #define SDMA_DESC0_BYTE_COUNT_WIDTH 14 #define SDMA_DESC0_BYTE_COUNT_MASK \ @@ -154,8 +149,8 @@ ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1) #define SDMA_DESC1_GENERATION_SMASK \ (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT) -#define SDMA_DESC1_INT_REQ_FLAG (1ULL << 1) -#define SDMA_DESC1_HEAD_TO_HOST_FLAG (1ULL << 0) +#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1) +#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0) enum sdma_states { sdma_state_s00_hw_down, @@ -311,83 +306,6 @@ struct hw_sdma_desc { __le64 qw[2]; }; -/* - * struct sdma_desc - canonical fragment descriptor - * - * This is the descriptor carried in the tx request - * corresponding to each fragment. - * - */ -struct sdma_desc { - /* private: don't use directly */ - u64 qw[2]; -}; - -struct sdma_txreq; -typedef void (*callback_t)(struct sdma_txreq *, int, int); - -/** - * struct sdma_txreq - the sdma_txreq structure (one per packet) - * @list: for use by user and by queuing for wait - * - * This is the representation of a packet which consists of some - * number of fragments. Storage is provided to within the structure. - * for all fragments. - * - * The storage for the descriptors are automatically extended as needed - * when the currently allocation is exceeded. - * - * The user (Verbs or PSM) may overload this structure with fields - * specific to their use by putting this struct first in their struct. - * The method of allocation of the overloaded structure is user dependent - * - * The list is the only public field in the structure. - * - */ - -struct sdma_txreq { - struct list_head list; - /* private: */ - struct sdma_desc *descp; - /* private: */ - void *coalesce_buf; - /* private: */ - u16 coalesce_idx; - /* private: */ - struct iowait *wait; - /* private: */ - callback_t complete; -#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER - u64 sn; -#endif - /* private: - used in coalesce/pad processing */ - u16 packet_len; - /* private: - down-counted to trigger last */ - u16 tlen; - /* private: flags */ - u16 flags; - /* private: */ - u16 num_desc; - /* private: */ - u16 desc_limit; - /* private: */ - u16 next_descq_idx; - /* private: */ - struct sdma_desc descs[NUM_DESC]; -}; - -struct verbs_txreq { - struct hfi1_pio_header phdr; - struct sdma_txreq txreq; - struct hfi1_qp *qp; - struct hfi1_swqe *wqe; - struct hfi1_mregion *mr; - struct hfi1_sge_state *ss; - struct sdma_engine *sde; - u16 hdr_dwords; - u16 hdr_inx; -}; - /** * struct sdma_engine - Data pertaining to each SDMA engine. * @dd: a back-pointer to the device data @@ -409,6 +327,7 @@ struct sdma_engine { u64 imask; /* clear interrupt mask */ u64 idle_mask; u64 progress_mask; + u64 int_mask; /* private: */ volatile __le64 *head_dma; /* DMA'ed by chip */ /* private: */ @@ -465,6 +384,12 @@ struct sdma_engine { u16 tx_head; /* private: */ u64 last_status; + /* private */ + u64 err_cnt; + /* private */ + u64 sdma_int_cnt; + u64 idle_int_cnt; + u64 progress_int_cnt; /* private: */ struct list_head dmawait; @@ -484,12 +409,12 @@ struct sdma_engine { u32 progress_check_head; /* private: */ struct work_struct flush_worker; + /* protect flush list */ spinlock_t flushlist_lock; /* private: */ struct list_head flushlist; }; - int sdma_init(struct hfi1_devdata *dd, u8 port); void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); @@ -535,7 +460,6 @@ static inline int __sdma_running(struct sdma_engine *engine) return engine->state.current_state == sdma_state_s99_running; } - /** * sdma_running() - state suitability test * @engine: sdma engine @@ -565,7 +489,6 @@ void _sdma_txreq_ahgadd( u32 *ahg, u8 ahg_hlen); - /** * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG * @tx: tx request to initialize @@ -626,7 +549,7 @@ static inline int sdma_txinit_ahg( u8 num_ahg, u32 *ahg, u8 ahg_hlen, - void (*cb)(struct sdma_txreq *, int, int)) + void (*cb)(struct sdma_txreq *, int)) { if (tlen == 0) return -ENODATA; @@ -640,7 +563,8 @@ static inline int sdma_txinit_ahg( tx->complete = cb; tx->coalesce_buf = NULL; tx->wait = NULL; - tx->tlen = tx->packet_len = tlen; + tx->packet_len = tlen; + tx->tlen = tx->packet_len; tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG; tx->descs[0].qw[1] = 0; if (flags & SDMA_TXREQ_F_AHG_COPY) @@ -689,7 +613,7 @@ static inline int sdma_txinit( struct sdma_txreq *tx, u16 flags, u16 tlen, - void (*cb)(struct sdma_txreq *, int, int)) + void (*cb)(struct sdma_txreq *, int)) { return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); } @@ -753,7 +677,7 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd, dd->default_desc1; if (tx->flags & SDMA_TXREQ_F_URGENT) tx->descp[tx->num_desc].qw[1] |= - (SDMA_DESC1_HEAD_TO_HOST_FLAG| + (SDMA_DESC1_HEAD_TO_HOST_FLAG | SDMA_DESC1_INT_REQ_FLAG); } @@ -1080,6 +1004,7 @@ struct sdma_map_elem { /** * struct sdma_map_el - mapping for a vl + * @engine_to_vl - map of an engine to a vl * @list - rcu head for free callback * @mask - vl mask to "mod" the vl to produce an index to map array * @actual_vls - number of vls @@ -1091,6 +1016,7 @@ struct sdma_map_elem { * in turn point to an array of sde's for that vl. */ struct sdma_vl_map { + s8 engine_to_vl[TXE_NUM_SDMA_ENGINES]; struct rcu_head list; u32 mask; u8 actual_vls; diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/staging/rdma/hfi1/sdma_txreq.h new file mode 100644 index 000000000000..bf7d777d756e --- /dev/null +++ b/drivers/staging/rdma/hfi1/sdma_txreq.h @@ -0,0 +1,135 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_SDMA_TXREQ_H +#define HFI1_SDMA_TXREQ_H + +/* increased for AHG */ +#define NUM_DESC 6 + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; +}; + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int); + +struct iowait; +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + u16 coalesce_idx; + /* private: flags */ + u16 flags; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +static inline int sdma_txreq_built(struct sdma_txreq *tx) +{ + return tx->num_desc; +} + +#endif /* HFI1_SDMA_TXREQ_H */ diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c index 1dd6727dd5ef..c7f1271190af 100644 --- a/drivers/staging/rdma/hfi1/sysfs.c +++ b/drivers/staging/rdma/hfi1/sysfs.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -53,7 +50,6 @@ #include "mad.h" #include "trace.h" - /* * Start of per-port congestion control structures and support code */ @@ -62,8 +58,8 @@ * Congestion control table size followed by table entries */ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { int ret; struct hfi1_pportdata *ppd = @@ -84,7 +80,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, rcu_read_lock(); cc_state = get_cc_state(ppd); - if (cc_state == NULL) { + if (!cc_state) { rcu_read_unlock(); return -EINVAL; } @@ -99,10 +95,6 @@ static void port_release(struct kobject *kobj) /* nothing to do since memory is freed by hfi1_free_devdata() */ } -static struct kobj_type port_cc_ktype = { - .release = port_release, -}; - static struct bin_attribute cc_table_bin_attr = { .attr = {.name = "cc_table_bin", .mode = 0444}, .read = read_cc_table_bin, @@ -115,8 +107,8 @@ static struct bin_attribute cc_table_bin_attr = { * trigger threshold and the minimum injection rate delay. */ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { int ret; struct hfi1_pportdata *ppd = @@ -135,7 +127,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, rcu_read_lock(); cc_state = get_cc_state(ppd); - if (cc_state == NULL) { + if (!cc_state) { rcu_read_unlock(); return -EINVAL; } @@ -151,6 +143,68 @@ static struct bin_attribute cc_setting_bin_attr = { .size = PAGE_SIZE, }; +struct hfi1_port_attr { + struct attribute attr; + ssize_t (*show)(struct hfi1_pportdata *, char *); + ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t); +}; + +static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf) +{ + return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off"); +} + +static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf, + size_t count) +{ + if (!memcmp(buf, "on", 2)) + ppd->cc_prescan = true; + else if (!memcmp(buf, "off", 3)) + ppd->cc_prescan = false; + + return count; +} + +static struct hfi1_port_attr cc_prescan_attr = + __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store); + +static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->show(ppd, buf); +} + +static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->store(ppd, buf, count); +} + +static const struct sysfs_ops port_cc_sysfs_ops = { + .show = cc_attr_show, + .store = cc_attr_store +}; + +static struct attribute *port_cc_default_attributes[] = { + &cc_prescan_attr.attr +}; + +static struct kobj_type port_cc_ktype = { + .release = port_release, + .sysfs_ops = &port_cc_sysfs_ops, + .default_attrs = port_cc_default_attributes +}; + /* Start sc2vl */ #define HFI1_SC2VL_ATTR(N) \ static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \ @@ -196,7 +250,6 @@ HFI1_SC2VL_ATTR(29); HFI1_SC2VL_ATTR(30); HFI1_SC2VL_ATTR(31); - static struct attribute *sc2vl_default_attributes[] = { &hfi1_sc2vl_attr_0.attr, &hfi1_sc2vl_attr_1.attr, @@ -302,7 +355,6 @@ HFI1_SL2SC_ATTR(29); HFI1_SL2SC_ATTR(30); HFI1_SL2SC_ATTR(31); - static struct attribute *sl2sc_default_attributes[] = { &hfi1_sl2sc_attr_0.attr, &hfi1_sl2sc_attr_1.attr, @@ -435,7 +487,6 @@ static struct kobj_type hfi1_vl2mtu_ktype = { .default_attrs = vl2mtu_default_attributes }; - /* end of per-port file structures and support code */ /* @@ -446,7 +497,7 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -455,7 +506,7 @@ static ssize_t show_hfi(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -470,19 +521,18 @@ static ssize_t show_boardversion(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } - static ssize_t show_nctxts(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); /* @@ -497,10 +547,10 @@ static ssize_t show_nctxts(struct device *device, } static ssize_t show_nfreectxts(struct device *device, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ @@ -511,11 +561,10 @@ static ssize_t show_serial(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); - } static ssize_t store_chip_reset(struct device *device, @@ -523,7 +572,7 @@ static ssize_t store_chip_reset(struct device *device, size_t count) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -552,7 +601,7 @@ static ssize_t show_tempsense(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, ibdev.dev); + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); struct hfi1_devdata *dd = dd_from_dev(dev); struct hfi1_temp temp; int ret; @@ -608,8 +657,8 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, if (!port_num || port_num > dd->num_pports) { dd_dev_err(dd, - "Skipping infiniband class with invalid port %u\n", - port_num); + "Skipping infiniband class with invalid port %u\n", + port_num); return -ENODEV; } ppd = &dd->pport[port_num - 1]; @@ -644,39 +693,36 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, } kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD); - ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype, kobj, "CCMgtA"); if (ret) { dd_dev_err(dd, - "Skipping Congestion Control sysfs info, (err %d) port %u\n", - ret, port_num); + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); goto bail_vl2mtu; } kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); - ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, - &cc_setting_bin_attr); + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); if (ret) { dd_dev_err(dd, - "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", - ret, port_num); + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); goto bail_cc; } - ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, - &cc_table_bin_attr); + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr); if (ret) { dd_dev_err(dd, - "Skipping Congestion Control table sysfs info, (err %d) port %u\n", - ret, port_num); + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); goto bail_cc_entry_bin; } dd_dev_info(dd, - "IB%u: Congestion Control Agent enabled for port %d\n", - dd->unit, port_num); + "IB%u: Congestion Control Agent enabled for port %d\n", + dd->unit, port_num); return 0; @@ -700,7 +746,7 @@ bail: */ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) { - struct ib_device *dev = &dd->verbs_dev.ibdev; + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; int i, ret; for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c index 10122e84cb2f..8b62fefcf903 100644 --- a/drivers/staging/rdma/hfi1/trace.c +++ b/drivers/staging/rdma/hfi1/trace.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -109,17 +106,17 @@ const char *parse_everbs_hdrs( case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE): case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE): trace_seq_printf(p, IMM_PRN, - be32_to_cpu(eh->imm_data)); + be32_to_cpu(eh->imm_data)); break; /* reth + imm */ case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): trace_seq_printf(p, RETH_PRN " " IMM_PRN, - (unsigned long long)ib_u64_get( - (__be32 *)&eh->rc.reth.vaddr), - be32_to_cpu(eh->rc.reth.rkey), - be32_to_cpu(eh->rc.reth.length), - be32_to_cpu(eh->rc.imm_data)); + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length), + be32_to_cpu(eh->rc.imm_data)); break; /* reth */ case OP(RC, RDMA_READ_REQUEST): @@ -128,10 +125,10 @@ const char *parse_everbs_hdrs( case OP(RC, RDMA_WRITE_ONLY): case OP(UC, RDMA_WRITE_ONLY): trace_seq_printf(p, RETH_PRN, - (unsigned long long)ib_u64_get( - (__be32 *)&eh->rc.reth.vaddr), - be32_to_cpu(eh->rc.reth.rkey), - be32_to_cpu(eh->rc.reth.length)); + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length)); break; case OP(RC, RDMA_READ_RESPONSE_FIRST): case OP(RC, RDMA_READ_RESPONSE_LAST): @@ -154,19 +151,20 @@ const char *parse_everbs_hdrs( case OP(RC, COMPARE_SWAP): case OP(RC, FETCH_ADD): trace_seq_printf(p, ATOMICETH_PRN, - (unsigned long long)ib_u64_get(eh->atomic_eth.vaddr), - eh->atomic_eth.rkey, - (unsigned long long)ib_u64_get( - (__be32 *)&eh->atomic_eth.swap_data), - (unsigned long long) ib_u64_get( + (unsigned long long)ib_u64_get( + eh->atomic_eth.vaddr), + eh->atomic_eth.rkey, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->atomic_eth.swap_data), + (unsigned long long)ib_u64_get( (__be32 *)&eh->atomic_eth.compare_data)); break; /* deth */ case OP(UD, SEND_ONLY): case OP(UD, SEND_ONLY_WITH_IMMEDIATE): trace_seq_printf(p, DETH_PRN, - be32_to_cpu(eh->ud.deth[0]), - be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK); + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK); break; } trace_seq_putc(p, 0); @@ -187,12 +185,12 @@ const char *parse_sdma_flags( trace_seq_printf(p, "%s", flags); if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) trace_seq_printf(p, " amode:%u aidx:%u alen:%u", - (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) - & SDMA_DESC1_HEADER_MODE_MASK), - (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) - & SDMA_DESC1_HEADER_INDEX_MASK), - (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) - & SDMA_DESC1_HEADER_DWS_MASK)); + (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) & + SDMA_DESC1_HEADER_MODE_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) & + SDMA_DESC1_HEADER_INDEX_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) & + SDMA_DESC1_HEADER_DWS_MASK)); return ret; } @@ -234,3 +232,4 @@ __hfi1_trace_fn(DC8051); __hfi1_trace_fn(FIRMWARE); __hfi1_trace_fn(RCVCTRL); __hfi1_trace_fn(TID); +__hfi1_trace_fn(MMU); diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h index 86c12ebfd4f0..963dc948c38a 100644 --- a/drivers/staging/rdma/hfi1/trace.h +++ b/drivers/staging/rdma/hfi1/trace.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -76,304 +73,295 @@ __print_symbolic(etype, \ #define TRACE_SYSTEM hfi1_rx TRACE_EVENT(hfi1_rcvhdr, - TP_PROTO(struct hfi1_devdata *dd, - u64 eflags, - u32 ctxt, - u32 etype, - u32 hlen, - u32 tlen, - u32 updegr, - u32 etail), - TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u64, eflags) - __field(u32, ctxt) - __field(u32, etype) - __field(u32, hlen) - __field(u32, tlen) - __field(u32, updegr) - __field(u32, etail) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->eflags = eflags; - __entry->ctxt = ctxt; - __entry->etype = etype; - __entry->hlen = hlen; - __entry->tlen = tlen; - __entry->updegr = updegr; - __entry->etail = etail; - ), - TP_printk( -"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", - __get_str(dev), - __entry->ctxt, - __entry->eflags, - __entry->etype, show_packettype(__entry->etype), - __entry->hlen, - __entry->tlen, - __entry->updegr, - __entry->etail - ) + TP_PROTO(struct hfi1_devdata *dd, + u64 eflags, + u32 ctxt, + u32 etype, + u32 hlen, + u32 tlen, + u32 updegr, + u32 etail + ), + TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->eflags = eflags; + __entry->ctxt = ctxt; + __entry->etype = etype; + __entry->hlen = hlen; + __entry->tlen = tlen; + __entry->updegr = updegr; + __entry->etail = etail; + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) ); TRACE_EVENT(hfi1_receive_interrupt, - TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), - TP_ARGS(dd, ctxt), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u32, ctxt) - __field(u8, slow_path) - __field(u8, dma_rtail) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt) { - __entry->slow_path = 1; - __entry->dma_rtail = 0xFF; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_dma_rtail){ - __entry->dma_rtail = 1; - __entry->slow_path = 0; - } else if (dd->rcd[ctxt]->do_interrupt == - &handle_receive_interrupt_nodma_rtail) { - __entry->dma_rtail = 0; - __entry->slow_path = 0; - } - ), - TP_printk( - "[%s] ctxt %d SlowPath: %d DmaRtail: %d", - __get_str(dev), - __entry->ctxt, - __entry->slow_path, - __entry->dma_rtail - ) + TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), + TP_ARGS(dd, ctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt) { + __entry->slow_path = 1; + __entry->dma_rtail = 0xFF; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_dma_rtail){ + __entry->dma_rtail = 1; + __entry->slow_path = 0; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_nodma_rtail) { + __entry->dma_rtail = 0; + __entry->slow_path = 0; + } + ), + TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) ); -const char *print_u64_array(struct trace_seq *, u64 *, int); +TRACE_EVENT(hfi1_exp_tid_reg, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, + u32 npages, unsigned long va, unsigned long pa, + dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); -TRACE_EVENT(hfi1_exp_tid_map, - TP_PROTO(unsigned ctxt, u16 subctxt, int dir, - unsigned long *maps, u16 count), - TP_ARGS(ctxt, subctxt, dir, maps, count), +TRACE_EVENT(hfi1_exp_tid_unreg, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), TP_STRUCT__entry( __field(unsigned, ctxt) __field(u16, subctxt) - __field(int, dir) - __field(u16, count) - __dynamic_array(unsigned long, maps, sizeof(*maps) * count) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) ), TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __entry->dir = dir; - __entry->count = count; - memcpy(__get_dynamic_array(maps), maps, - sizeof(*maps) * count); + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; ), - TP_printk("[%3u:%02u] %s tidmaps %s", + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", __entry->ctxt, __entry->subctxt, - (__entry->dir ? ">" : "<"), - print_u64_array(p, __get_dynamic_array(maps), - __entry->count) + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma ) ); -TRACE_EVENT(hfi1_exp_rcv_set, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid, - unsigned long vaddr, u64 phys_addr, void *page), - TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page), +TRACE_EVENT(hfi1_exp_tid_inval, + TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), TP_STRUCT__entry( __field(unsigned, ctxt) __field(u16, subctxt) - __field(u32, tid) - __field(unsigned long, vaddr) - __field(u64, phys_addr) - __field(void *, page) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) ), TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __entry->tid = tid; - __entry->vaddr = vaddr; - __entry->phys_addr = phys_addr; - __entry->page = page; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; ), - TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p", + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", __entry->ctxt, __entry->subctxt, - __entry->tid, - __entry->vaddr, - __entry->phys_addr, - __entry->page + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma ) ); -TRACE_EVENT(hfi1_exp_rcv_free, - TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid, - unsigned long phys, void *page), - TP_ARGS(ctxt, subctxt, tid, phys, page), +TRACE_EVENT(hfi1_mmu_invalidate, + TP_PROTO(unsigned ctxt, u16 subctxt, const char *type, + unsigned long start, unsigned long end), + TP_ARGS(ctxt, subctxt, type, start, end), TP_STRUCT__entry( __field(unsigned, ctxt) __field(u16, subctxt) - __field(u32, tid) - __field(unsigned long, phys) - __field(void *, page) + __string(type, type) + __field(unsigned long, start) + __field(unsigned long, end) ), TP_fast_assign( __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __entry->tid = tid; - __entry->phys = phys; - __entry->page = page; + __assign_str(type, type); + __entry->start = start; + __entry->end = end; ), - TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p", + TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", __entry->ctxt, __entry->subctxt, - __entry->tid, - __entry->phys, - __entry->page + __get_str(type), + __entry->start, + __entry->end ) ); + #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_tx TRACE_EVENT(hfi1_piofree, - TP_PROTO(struct send_context *sc, int extra), - TP_ARGS(sc, extra), - TP_STRUCT__entry( - DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(int, extra) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->extra = extra; - ), - TP_printk( - "[%s] ctxt %u(%u) extra %d", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->extra - ) + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) ); TRACE_EVENT(hfi1_wantpiointr, - TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), - TP_ARGS(sc, needint, credit_ctrl), - TP_STRUCT__entry( - DD_DEV_ENTRY(sc->dd) - __field(u32, sw_index) - __field(u32, hw_context) - __field(u32, needint) - __field(u64, credit_ctrl) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sc->dd); - __entry->sw_index = sc->sw_index; - __entry->hw_context = sc->hw_context; - __entry->needint = needint; - __entry->credit_ctrl = credit_ctrl; - ), - TP_printk( - "[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", - __get_str(dev), - __entry->sw_index, - __entry->hw_context, - __entry->needint, - (unsigned long long)__entry->credit_ctrl - ) + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) ); DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, - TP_PROTO(struct hfi1_qp *qp, u32 flags), - TP_ARGS(qp, flags), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, flags) - __field(u32, s_flags) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->flags = flags; - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - ), - TP_printk( - "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", - __get_str(dev), - __entry->qpn, - __entry->flags, - __entry->s_flags - ) + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) ); DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, - TP_PROTO(struct hfi1_qp *qp, u32 flags), + TP_PROTO(struct rvt_qp *qp, u32 flags), TP_ARGS(qp, flags)); DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, - TP_PROTO(struct hfi1_qp *qp, u32 flags), + TP_PROTO(struct rvt_qp *qp, u32 flags), TP_ARGS(qp, flags)); #undef TRACE_SYSTEM -#define TRACE_SYSTEM hfi1_qphash -DECLARE_EVENT_CLASS(hfi1_qphash_template, - TP_PROTO(struct hfi1_qp *qp, u32 bucket), - TP_ARGS(qp, bucket), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, bucket) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->qpn = qp->ibqp.qp_num; - __entry->bucket = bucket; - ), - TP_printk( - "[%s] qpn 0x%x bucket %u", - __get_str(dev), - __entry->qpn, - __entry->bucket - ) -); - -DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert, - TP_PROTO(struct hfi1_qp *qp, u32 bucket), - TP_ARGS(qp, bucket)); - -DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove, - TP_PROTO(struct hfi1_qp *qp, u32 bucket), - TP_ARGS(qp, bucket)); - -#undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_ibhdrs u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); -const char *parse_everbs_hdrs( - struct trace_seq *p, - u8 opcode, - void *ehdrs); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); #define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) -const char *parse_sdma_flags( - struct trace_seq *p, - u64 desc0, u64 desc1); +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); #define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) - #define lrh_name(lrh) { HFI1_##lrh, #lrh } #define show_lnh(lrh) \ __print_symbolic(lrh, \ @@ -420,7 +408,6 @@ __print_symbolic(opcode, \ ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ ib_opcode_name(CNP)) - #define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" #define BTH_PRN \ "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ @@ -428,124 +415,130 @@ __print_symbolic(opcode, \ #define EHDR_PRN "%s" DECLARE_EVENT_CLASS(hfi1_ibhdr_template, - TP_PROTO(struct hfi1_devdata *dd, - struct hfi1_ib_header *hdr), - TP_ARGS(dd, hdr), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - /* LRH */ - __field(u8, vl) - __field(u8, lver) - __field(u8, sl) - __field(u8, lnh) - __field(u16, dlid) - __field(u16, len) - __field(u16, slid) - /* BTH */ - __field(u8, opcode) - __field(u8, se) - __field(u8, m) - __field(u8, pad) - __field(u8, tver) - __field(u16, pkey) - __field(u8, f) - __field(u8, b) - __field(u32, qpn) - __field(u8, a) - __field(u32, psn) - /* extended headers */ - __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) - ), - TP_fast_assign( - struct hfi1_other_headers *ohdr; - - DD_DEV_ASSIGN(dd); - /* LRH */ - __entry->vl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); - __entry->lver = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; - __entry->sl = - (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; - __entry->lnh = - (u8)(be16_to_cpu(hdr->lrh[0]) & 3); - __entry->dlid = - be16_to_cpu(hdr->lrh[1]); - /* allow for larger len */ - __entry->len = - be16_to_cpu(hdr->lrh[2]); - __entry->slid = - be16_to_cpu(hdr->lrh[3]); - /* BTH */ - if (__entry->lnh == HFI1_LRH_BTH) - ohdr = &hdr->u.oth; - else - ohdr = &hdr->u.l.oth; - __entry->opcode = - (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; - __entry->se = - (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; - __entry->m = - (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; - __entry->pad = - (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; - __entry->tver = - (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; - __entry->pkey = - be32_to_cpu(ohdr->bth[0]) & 0xffff; - __entry->f = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) - & HFI1_FECN_MASK; - __entry->b = - (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) - & HFI1_BECN_MASK; - __entry->qpn = - be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; - __entry->a = - (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; - /* allow for larger PSN */ - __entry->psn = - be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; - /* extended headers */ - memcpy( - __get_dynamic_array(ehdrs), - &ohdr->u, - ibhdr_exhdr_len(hdr)); - ), - TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, - __get_str(dev), - /* LRH */ - __entry->vl, - __entry->lver, - __entry->sl, - __entry->lnh, show_lnh(__entry->lnh), - __entry->dlid, - __entry->len, - __entry->slid, - /* BTH */ - __entry->opcode, show_ib_opcode(__entry->opcode), - __entry->se, - __entry->m, - __entry->pad, - __entry->tver, - __entry->pkey, - __entry->f, - __entry->b, - __entry->qpn, - __entry->a, - __entry->psn, - /* extended headers */ - __parse_ib_ehdrs( - __entry->opcode, - (void *)__get_dynamic_array(ehdrs)) - ) + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + /* LRH */ + __field(u8, vl) + __field(u8, lver) + __field(u8, sl) + __field(u8, lnh) + __field(u16, dlid) + __field(u16, len) + __field(u16, slid) + /* BTH */ + __field(u8, opcode) + __field(u8, se) + __field(u8, m) + __field(u8, pad) + __field(u8, tver) + __field(u16, pkey) + __field(u8, f) + __field(u8, b) + __field(u32, qpn) + __field(u8, a) + __field(u32, psn) + /* extended headers */ + __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + /* LRH */ + __entry->vl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); + __entry->lver = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; + __entry->sl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->lnh = + (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + __entry->dlid = + be16_to_cpu(hdr->lrh[1]); + /* allow for larger len */ + __entry->len = + be16_to_cpu(hdr->lrh[2]); + __entry->slid = + be16_to_cpu(hdr->lrh[3]); + /* BTH */ + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + __entry->opcode = + (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->se = + (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; + __entry->m = + (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; + __entry->pad = + (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + __entry->tver = + (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; + __entry->pkey = + be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->f = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & + HFI1_FECN_MASK; + __entry->b = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & + HFI1_BECN_MASK; + __entry->qpn = + be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + __entry->a = + (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; + /* allow for larger PSN */ + __entry->psn = + be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), &ohdr->u, + ibhdr_exhdr_len(hdr)); + ), + TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, + __get_str(dev), + /* LRH */ + __entry->vl, + __entry->lver, + __entry->sl, + __entry->lnh, show_lnh(__entry->lnh), + __entry->dlid, + __entry->len, + __entry->slid, + /* BTH */ + __entry->opcode, show_ib_opcode(__entry->opcode), + __entry->se, + __entry->m, + __entry->pad, + __entry->tver, + __entry->pkey, + __entry->f, + __entry->b, + __entry->qpn, + __entry->a, + __entry->psn, + /* extended headers */ + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) ); DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), TP_ARGS(dd, hdr)); -DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr, +DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), TP_ARGS(dd, hdr)); @@ -556,15 +549,14 @@ DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr, #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_snoop - TRACE_EVENT(snoop_capture, - TP_PROTO(struct hfi1_devdata *dd, - int hdr_len, - struct hfi1_ib_header *hdr, - int data_len, - void *data), - TP_ARGS(dd, hdr_len, hdr, data_len, data), - TP_STRUCT__entry( + TP_PROTO(struct hfi1_devdata *dd, + int hdr_len, + struct hfi1_ib_header *hdr, + int data_len, + void *data), + TP_ARGS(dd, hdr_len, hdr, data_len, data), + TP_STRUCT__entry( DD_DEV_ENTRY(dd) __field(u16, slid) __field(u16, dlid) @@ -577,8 +569,8 @@ TRACE_EVENT(snoop_capture, __field(u8, lnh) __dynamic_array(u8, raw_hdr, hdr_len) __dynamic_array(u8, raw_pkt, data_len) - ), - TP_fast_assign( + ), + TP_fast_assign( struct hfi1_other_headers *ohdr; __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); @@ -589,7 +581,7 @@ TRACE_EVENT(snoop_capture, DD_DEV_ASSIGN(dd); __entry->slid = be16_to_cpu(hdr->lrh[3]); __entry->dlid = be16_to_cpu(hdr->lrh[1]); - __entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; @@ -597,8 +589,9 @@ TRACE_EVENT(snoop_capture, __entry->data_len = data_len; memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); memcpy(__get_dynamic_array(raw_pkt), data, data_len); - ), - TP_printk("[%s] " SNOOP_PRN, + ), + TP_printk( + "[%s] " SNOOP_PRN, __get_str(dev), __entry->slid, __entry->dlid, @@ -609,7 +602,7 @@ TRACE_EVENT(snoop_capture, __entry->pkey, __entry->hdr_len, __entry->data_len - ) + ) ); #undef TRACE_SYSTEM @@ -621,41 +614,39 @@ TRACE_EVENT(snoop_capture, TRACE_EVENT(hfi1_uctxtdata, TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), TP_ARGS(dd, uctxt), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(u32, credits) - __field(u64, hw_free) - __field(u64, piobase) - __field(u16, rcvhdrq_cnt) - __field(u64, rcvhdrq_phys) - __field(u32, eager_cnt) - __field(u64, rcvegr_phys) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = uctxt->ctxt; - __entry->credits = uctxt->sc->credits; - __entry->hw_free = (u64)uctxt->sc->hw_free; - __entry->piobase = (u64)uctxt->sc->base_addr; - __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; - __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; - __entry->eager_cnt = uctxt->egrbufs.alloced; - __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys; - ), - TP_printk( - "[%s] ctxt %u " UCTXT_FMT, - __get_str(dev), - __entry->ctxt, - __entry->credits, - __entry->hw_free, - __entry->piobase, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_phys, - __entry->eager_cnt, - __entry->rcvegr_phys - ) - ); + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(u64, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_phys) + __field(u32, eager_cnt) + __field(u64, rcvegr_phys) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = (u64)uctxt->sc->hw_free; + __entry->piobase = (u64)uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_phys = + uctxt->egrbufs.rcvtids[0].phys; + ), + TP_printk("[%s] ctxt %u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_phys, + __entry->eager_cnt, + __entry->rcvegr_phys + ) +); #define CINFO_FMT \ "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" @@ -663,38 +654,35 @@ TRACE_EVENT(hfi1_ctxt_info, TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt, struct hfi1_ctxt_info cinfo), TP_ARGS(dd, ctxt, subctxt, cinfo), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(unsigned, ctxt) - __field(unsigned, subctxt) - __field(u16, egrtids) - __field(u16, rcvhdrq_cnt) - __field(u16, rcvhdrq_size) - __field(u16, sdma_ring_size) - __field(u32, rcvegr_size) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->ctxt = ctxt; - __entry->subctxt = subctxt; - __entry->egrtids = cinfo.egrtids; - __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; - __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; - __entry->sdma_ring_size = cinfo.sdma_ring_size; - __entry->rcvegr_size = cinfo.rcvegr_size; - ), - TP_printk( - "[%s] ctxt %u:%u " CINFO_FMT, - __get_str(dev), - __entry->ctxt, - __entry->subctxt, - __entry->egrtids, - __entry->rcvegr_size, - __entry->rcvhdrq_cnt, - __entry->rcvhdrq_size, - __entry->sdma_ring_size - ) - ); + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(unsigned, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo.egrtids; + __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo.sdma_ring_size; + __entry->rcvegr_size = cinfo.rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_sma @@ -708,52 +696,48 @@ TRACE_EVENT(hfi1_ctxt_info, ) DECLARE_EVENT_CLASS(hfi1_bct_template, - TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), - TP_ARGS(dd, bc), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __dynamic_array(u8, bct, sizeof(*bc)) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - memcpy( - __get_dynamic_array(bct), - bc, - sizeof(*bc)); - ), - TP_printk(BCT_FORMAT, - BCT(overall_shared_limit), - - BCT(vl[0].dedicated), - BCT(vl[0].shared), - - BCT(vl[1].dedicated), - BCT(vl[1].shared), - - BCT(vl[2].dedicated), - BCT(vl[2].shared), - - BCT(vl[3].dedicated), - BCT(vl[3].shared), - - BCT(vl[4].dedicated), - BCT(vl[4].shared), - - BCT(vl[5].dedicated), - BCT(vl[5].shared), - - BCT(vl[6].dedicated), - BCT(vl[6].shared), - - BCT(vl[7].dedicated), - BCT(vl[7].shared), - - BCT(vl[15].dedicated), - BCT(vl[15].shared) - ) + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) ); - DEFINE_EVENT(hfi1_bct_template, bct_set, TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), TP_ARGS(dd, bc)); @@ -766,252 +750,209 @@ DEFINE_EVENT(hfi1_bct_template, bct_get, #define TRACE_SYSTEM hfi1_sdma TRACE_EVENT(hfi1_sdma_descriptor, - TP_PROTO( - struct sdma_engine *sde, - u64 desc0, - u64 desc1, - u16 e, - void *descp), + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), TP_ARGS(sde, desc0, desc1, e, descp), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(void *, descp) - __field(u64, desc0) - __field(u64, desc1) - __field(u16, e) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->desc0 = desc0; - __entry->desc1 = desc1; - __entry->idx = sde->this_idx; - __entry->descp = descp; - __entry->e = e; - ), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), TP_printk( - "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", - __get_str(dev), - __entry->idx, - __parse_sdma_flags(__entry->desc0, __entry->desc1), - (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) - & SDMA_DESC0_PHY_ADDR_MASK, - (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) - & SDMA_DESC1_GENERATION_MASK), - (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) - & SDMA_DESC0_BYTE_COUNT_MASK), - __entry->desc0, - __entry->desc1, - __entry->descp, - __entry->e - ) + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) ); TRACE_EVENT(hfi1_sdma_engine_select, - TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), - TP_ARGS(dd, sel, vl, idx), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __field(u32, sel) - __field(u8, vl) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd); - __entry->sel = sel; - __entry->vl = vl; - __entry->idx = idx; - ), - TP_printk( - "[%s] selecting SDE %u sel 0x%x vl %u", - __get_str(dev), - __entry->idx, - __entry->sel, - __entry->vl - ) + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) ); DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, - TP_PROTO( - struct sdma_engine *sde, - u64 status - ), - TP_ARGS(sde, status), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(u64, status) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->status = status; - __entry->idx = sde->this_idx; - ), - TP_printk( - "[%s] SDE(%u) status %llx", - __get_str(dev), - __entry->idx, - (unsigned long long)__entry->status - ) + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) ); DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, - TP_PROTO( - struct sdma_engine *sde, - u64 status - ), - TP_ARGS(sde, status) + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) ); DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, - TP_PROTO( - struct sdma_engine *sde, - u64 status - ), - TP_ARGS(sde, status) + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) ); DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, - TP_PROTO( - struct sdma_engine *sde, - int aidx - ), - TP_ARGS(sde, aidx), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(int, aidx) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->idx = sde->this_idx; - __entry->aidx = aidx; - ), - TP_printk( - "[%s] SDE(%u) aidx %d", - __get_str(dev), - __entry->idx, - __entry->aidx - ) + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) ); DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, - TP_PROTO( - struct sdma_engine *sde, - int aidx - ), + TP_PROTO(struct sdma_engine *sde, int aidx), TP_ARGS(sde, aidx)); DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, - TP_PROTO( - struct sdma_engine *sde, - int aidx - ), + TP_PROTO(struct sdma_engine *sde, int aidx), TP_ARGS(sde, aidx)); #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO( - struct sdma_engine *sde, - u16 hwhead, - u16 swhead, - struct sdma_txreq *txp - ), - TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - __entry->sn = txp ? txp->sn : ~0; - ), - TP_printk( - "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->sn, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) ); #else TRACE_EVENT(hfi1_sdma_progress, - TP_PROTO( - struct sdma_engine *sde, - u16 hwhead, - u16 swhead, - struct sdma_txreq *txp + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp ), TP_ARGS(sde, hwhead, swhead, txp), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(u16, hwhead) - __field(u16, swhead) - __field(u16, txnext) - __field(u16, tx_tail) - __field(u16, tx_head) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->hwhead = hwhead; - __entry->swhead = swhead; - __entry->tx_tail = sde->tx_tail; - __entry->tx_head = sde->tx_head; - __entry->txnext = txp ? txp->next_descq_idx : ~0; - __entry->idx = sde->this_idx; - ), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), TP_printk( - "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", - __get_str(dev), - __entry->idx, - __entry->hwhead, - __entry->swhead, - __entry->txnext, - __entry->tx_head, - __entry->tx_tail - ) + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) ); #endif DECLARE_EVENT_CLASS(hfi1_sdma_sn, - TP_PROTO( - struct sdma_engine *sde, - u64 sn - ), - TP_ARGS(sde, sn), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __field(u64, sn) - __field(u8, idx) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __entry->sn = sn; - __entry->idx = sde->this_idx; - ), - TP_printk( - "[%s] SDE(%u) sn %llu", - __get_str(dev), - __entry->idx, - __entry->sn - ) + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) ); DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, @@ -1023,10 +964,7 @@ DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, ); DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, - TP_PROTO( - struct sdma_engine *sde, - u64 sn - ), + TP_PROTO(struct sdma_engine *sde, u64 sn), TP_ARGS(sde, sn) ); @@ -1227,88 +1165,85 @@ TRACE_EVENT(hfi1_sdma_user_header_ahg, ); TRACE_EVENT(hfi1_sdma_state, - TP_PROTO( - struct sdma_engine *sde, - const char *cstate, - const char *nstate - ), - TP_ARGS(sde, cstate, nstate), - TP_STRUCT__entry( - DD_DEV_ENTRY(sde->dd) - __string(curstate, cstate) - __string(newstate, nstate) - ), - TP_fast_assign( - DD_DEV_ASSIGN(sde->dd); - __assign_str(curstate, cstate); - __assign_str(newstate, nstate); - ), + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), TP_printk("[%s] current state %s new state %s", - __get_str(dev), - __get_str(curstate), - __get_str(newstate) - ) + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) ); #undef TRACE_SYSTEM #define TRACE_SYSTEM hfi1_rc DECLARE_EVENT_CLASS(hfi1_rc_template, - TP_PROTO(struct hfi1_qp *qp, u32 psn), - TP_ARGS(qp, psn), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) - __field(u32, qpn) - __field(u32, s_flags) - __field(u32, psn) - __field(u32, s_psn) - __field(u32, s_next_psn) - __field(u32, s_sending_psn) - __field(u32, s_sending_hpsn) - __field(u32, r_psn) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) - __entry->qpn = qp->ibqp.qp_num; - __entry->s_flags = qp->s_flags; - __entry->psn = psn; - __entry->s_psn = qp->s_psn; - __entry->s_next_psn = qp->s_next_psn; - __entry->s_sending_psn = qp->s_sending_psn; - __entry->s_sending_hpsn = qp->s_sending_hpsn; - __entry->r_psn = qp->r_psn; - ), - TP_printk( - "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", - __get_str(dev), - __entry->qpn, - __entry->s_flags, - __entry->psn, - __entry->s_psn, - __entry->s_next_psn, - __entry->s_sending_psn, - __entry->s_sending_hpsn, - __entry->r_psn - ) + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) ); DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete, - TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn) ); DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack, - TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn) ); DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout, - TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn) ); DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, - TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_PROTO(struct rvt_qp *qp, u32 psn), TP_ARGS(qp, psn) ); @@ -1316,21 +1251,20 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, #define TRACE_SYSTEM hfi1_misc TRACE_EVENT(hfi1_interrupt, - TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, - int src), - TP_ARGS(dd, is_entry, src), - TP_STRUCT__entry( - DD_DEV_ENTRY(dd) - __array(char, buf, 64) - __field(int, src) - ), - TP_fast_assign( - DD_DEV_ASSIGN(dd) - is_entry->is_name(__entry->buf, 64, src - is_entry->start); - __entry->src = src; - ), - TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, - __entry->src) + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd) + is_entry->is_name(__entry->buf, 64, + src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) ); /* @@ -1345,21 +1279,21 @@ TRACE_EVENT(hfi1_interrupt, #define MAX_MSG_LEN 512 DECLARE_EVENT_CLASS(hfi1_trace_template, - TP_PROTO(const char *function, struct va_format *vaf), - TP_ARGS(function, vaf), - TP_STRUCT__entry( - __string(function, function) - __dynamic_array(char, msg, MAX_MSG_LEN) - ), - TP_fast_assign( - __assign_str(function, function); - WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= MAX_MSG_LEN); - ), - TP_printk("(%s) %s", - __get_str(function), - __get_str(msg)) + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign(__assign_str(function, function); + WARN_ON_ONCE(vsnprintf + (__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= + MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) ); /* @@ -1406,6 +1340,7 @@ __hfi1_trace_def(DC8051); __hfi1_trace_def(FIRMWARE); __hfi1_trace_def(RCVCTRL); __hfi1_trace_def(TID); +__hfi1_trace_def(MMU); #define hfi1_cdbg(which, fmt, ...) \ __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c index ea54fd2700ad..e82e52a63d35 100644 --- a/drivers/staging/rdma/hfi1/twsi.c +++ b/drivers/staging/rdma/hfi1/twsi.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -119,9 +116,9 @@ static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) * Allow for slow slaves by simple * delay for falling edge, sampling on rise. */ - if (!bit) + if (!bit) { udelay(2); - else { + } else { int rise_usec; for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { @@ -131,11 +128,24 @@ static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) } if (rise_usec <= 0) dd_dev_err(dd, "SCL interface stuck low > %d uSec\n", - SCL_WAIT_USEC); + SCL_WAIT_USEC); } i2c_wait_for_writes(dd, target); } +static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait) +{ + u32 read_val, mask; + + mask = QSFP_HFI0_I2CCLK; + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd, target); + return (read_val & mask) >> GPIO_SCL_NUM; +} + static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) { u32 mask; @@ -274,13 +284,12 @@ static void stop_cmd(struct hfi1_devdata *dd, u32 target) /** * hfi1_twsi_reset - reset I2C communication * @dd: the hfi1_ib device + * returns 0 if ok, -EIO on error */ - int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target) { int clock_cycles_left = 9; - int was_high = 0; - u32 pins, mask; + u32 mask; /* Both SCL and SDA should be high. If not, there * is something wrong. @@ -294,43 +303,23 @@ int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target) */ hfi1_gpio_mod(dd, target, 0, 0, mask); - /* - * Clock nine times to get all listeners into a sane state. - * If SDA does not go high at any point, we are wedged. - * One vendor recommends then issuing START followed by STOP. - * we cannot use our "normal" functions to do that, because - * if SCL drops between them, another vendor's part will - * wedge, dropping SDA and keeping it low forever, at the end of - * the next transaction (even if it was not the device addressed). - * So our START and STOP take place with SCL held high. + /* Check if SCL is low, if it is low then we have a slave device + * misbehaving and there is not much we can do. + */ + if (!scl_in(dd, target, 0)) + return -EIO; + + /* Check if SDA is low, if it is low then we have to clock SDA + * up to 9 times for the device to release the bus */ while (clock_cycles_left--) { + if (sda_in(dd, target, 0)) + return 0; scl_out(dd, target, 0); scl_out(dd, target, 1); - /* Note if SDA is high, but keep clocking to sync slave */ - was_high |= sda_in(dd, target, 0); - } - - if (was_high) { - /* - * We saw a high, which we hope means the slave is sync'd. - * Issue START, STOP, pause for T_BUF. - */ - - pins = hfi1_gpio_mod(dd, target, 0, 0, 0); - if ((pins & mask) != mask) - dd_dev_err(dd, "GPIO pins not at rest: %d\n", - pins & mask); - /* Drop SDA to issue START */ - udelay(1); /* Guarantee .6 uSec setup */ - sda_out(dd, target, 0); - udelay(1); /* Guarantee .6 uSec hold */ - /* At this point, SCL is high, SDA low. Raise SDA for STOP */ - sda_out(dd, target, 1); - udelay(TWSI_BUF_WAIT_USEC); } - return !was_high; + return -EIO; } #define HFI1_TWSI_START 0x100 @@ -365,17 +354,25 @@ static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags) * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, * which responded to all TWSI device codes, interpreting them as * address within device. On all other devices found on board handled by - * this driver, the device is followed by a one-byte "address" which selects + * this driver, the device is followed by a N-byte "address" which selects * the "register" or "offset" within the device from which data should * be read. */ int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, void *buffer, int len) { - int ret; u8 *bp = buffer; + int ret = 1; + int i; + int offset_size; + + /* obtain the offset size, strip it from the device address */ + offset_size = (dev >> 8) & 0xff; + dev &= 0xff; - ret = 1; + /* allow at most a 2 byte offset */ + if (offset_size > 2) + goto bail; if (dev == HFI1_TWSI_NO_DEV) { /* legacy not-really-I2C */ @@ -383,34 +380,29 @@ int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, ret = twsi_wr(dd, target, addr, HFI1_TWSI_START); } else { /* Actual I2C */ - ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START); - if (ret) { - stop_cmd(dd, target); - ret = 1; - goto bail; - } - /* - * SFF spec claims we do _not_ stop after the addr - * but simply issue a start with the "read" dev-addr. - * Since we are implicitly waiting for ACK here, - * we need t_buf (nominally 20uSec) before that start, - * and cannot rely on the delay built in to the STOP - */ - ret = twsi_wr(dd, target, addr, 0); - udelay(TWSI_BUF_WAIT_USEC); + if (offset_size) { + ret = twsi_wr(dd, target, + dev | WRITE_CMD, HFI1_TWSI_START); + if (ret) { + stop_cmd(dd, target); + goto bail; + } - if (ret) { - dd_dev_err(dd, - "Failed to write interface read addr %02X\n", - addr); - ret = 1; - goto bail; + for (i = 0; i < offset_size; i++) { + ret = twsi_wr(dd, target, + (addr >> (i * 8)) & 0xff, 0); + udelay(TWSI_BUF_WAIT_USEC); + if (ret) { + dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", + i, addr); + goto bail; + } + } } ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START); } if (ret) { stop_cmd(dd, target); - ret = 1; goto bail; } @@ -442,76 +434,55 @@ bail: * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, * which responded to all TWSI device codes, interpreting them as * address within device. On all other devices found on board handled by - * this driver, the device is followed by a one-byte "address" which selects + * this driver, the device is followed by a N-byte "address" which selects * the "register" or "offset" within the device to which data should * be written. */ int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, const void *buffer, int len) { - int sub_len; const u8 *bp = buffer; - int max_wait_time, i; int ret = 1; + int i; + int offset_size; - while (len > 0) { - if (dev == HFI1_TWSI_NO_DEV) { - if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, - HFI1_TWSI_START)) { - goto failed_write; - } - } else { - /* Real I2C */ - if (twsi_wr(dd, target, - dev | WRITE_CMD, HFI1_TWSI_START)) - goto failed_write; - ret = twsi_wr(dd, target, addr, 0); - if (ret) { - dd_dev_err(dd, - "Failed to write interface write addr %02X\n", - addr); - goto failed_write; - } - } - - sub_len = min(len, 4); - addr += sub_len; - len -= sub_len; + /* obtain the offset size, strip it from the device address */ + offset_size = (dev >> 8) & 0xff; + dev &= 0xff; - for (i = 0; i < sub_len; i++) - if (twsi_wr(dd, target, *bp++, 0)) - goto failed_write; + /* allow at most a 2 byte offset */ + if (offset_size > 2) + goto bail; - stop_cmd(dd, target); + if (dev == HFI1_TWSI_NO_DEV) { + if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, + HFI1_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START)) + goto failed_write; + } - /* - * Wait for write complete by waiting for a successful - * read (the chip replies with a zero after the write - * cmd completes, and before it writes to the eeprom. - * The startcmd for the read will fail the ack until - * the writes have completed. We do this inline to avoid - * the debug prints that are in the real read routine - * if the startcmd fails. - * We also use the proper device address, so it doesn't matter - * whether we have real eeprom_dev. Legacy likes any address. - */ - max_wait_time = 100; - while (twsi_wr(dd, target, - dev | READ_CMD, HFI1_TWSI_START)) { - stop_cmd(dd, target); - if (!--max_wait_time) - goto failed_write; + for (i = 0; i < offset_size; i++) { + ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0); + udelay(TWSI_BUF_WAIT_USEC); + if (ret) { + dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", + i, addr); + goto bail; } - /* now read (and ignore) the resulting byte */ - rd_byte(dd, target, 1); } + for (i = 0; i < len; i++) + if (twsi_wr(dd, target, *bp++, 0)) + goto failed_write; + ret = 0; - goto bail; failed_write: stop_cmd(dd, target); - ret = 1; bail: return ret; diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h index 5907e029613d..5b8a5b5e7eae 100644 --- a/drivers/staging/rdma/hfi1/twsi.h +++ b/drivers/staging/rdma/hfi1/twsi.h @@ -1,14 +1,13 @@ #ifndef _TWSI_H #define _TWSI_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -54,8 +51,9 @@ struct hfi1_devdata; -/* Bit position of SDA pin in ASIC_QSFP* registers */ +/* Bit position of SDA/SCL pins in ASIC_QSFP* registers */ #define GPIO_SDA_NUM 1 +#define GPIO_SCL_NUM 0 /* these functions must be called with qsfp_lock held */ int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target); @@ -64,5 +62,4 @@ int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, const void *buffer, int len); - #endif /* _TWSI_H */ diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c index 4f2a7889a852..df773d433297 100644 --- a/drivers/staging/rdma/hfi1/uc.c +++ b/drivers/staging/rdma/hfi1/uc.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -49,71 +46,82 @@ */ #include "hfi.h" -#include "sdma.h" +#include "verbs_txreq.h" #include "qp.h" /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_UC_##x +/* only opcode mask for adaptive pio */ +const u32 uc_only_opcode = + BIT(OP(SEND_ONLY) & 0x1f) | + BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)); + /** * hfi1_make_uc_req - construct a request packet (SEND, RDMA write) * @qp: a pointer to the QP * + * Assume s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int hfi1_make_uc_req(struct hfi1_qp *qp) +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_other_headers *ohdr; - struct hfi1_swqe *wqe; - unsigned long flags; + struct rvt_swqe *wqe; u32 hwords = 5; u32 bth0 = 0; u32 len; u32 pmtu = qp->pmtu; - int ret = 0; int middle = 0; - spin_lock_irqsave(&qp->s_lock, flags); + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) { - if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_iowait.sdma_busy)) { - qp->s_flags |= HFI1_S_WAIT_DMA; + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } clear_ahg(qp); - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); - goto done; + goto done_free_tx; } - ohdr = &qp->s_hdr->ibh.u.oth; + ohdr = &ps->s_txreq->phdr.hdr.u.oth; if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr->ibh.u.l.oth; + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; /* Get the next send request. */ - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); qp->s_wqe = NULL; switch (qp->s_state) { default: - if (!(ib_hfi1_state_ops[qp->state] & - HFI1_PROCESS_NEXT_SEND_OK)) + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) goto bail; /* Check if send work queue is empty. */ - if (qp->s_cur == qp->s_head) { + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) { clear_ahg(qp); goto bail; } /* * Start a new request. */ - wqe->psn = qp->s_next_psn; - qp->s_psn = qp->s_next_psn; + qp->s_psn = wqe->psn; qp->s_sge.sge = wqe->sg_list[0]; qp->s_sge.sg_list = wqe->sg_list + 1; qp->s_sge.num_sge = wqe->wr.num_sge; @@ -128,9 +136,9 @@ int hfi1_make_uc_req(struct hfi1_qp *qp) len = pmtu; break; } - if (wqe->wr.opcode == IB_WR_SEND) + if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_ONLY); - else { + } else { qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ @@ -157,9 +165,9 @@ int hfi1_make_uc_req(struct hfi1_qp *qp) len = pmtu; break; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { qp->s_state = OP(RDMA_WRITE_ONLY); - else { + } else { qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the RETH */ @@ -188,9 +196,9 @@ int hfi1_make_uc_req(struct hfi1_qp *qp) middle = HFI1_CAP_IS_KSET(SDMA_AHG); break; } - if (wqe->wr.opcode == IB_WR_SEND) + if (wqe->wr.opcode == IB_WR_SEND) { qp->s_state = OP(SEND_LAST); - else { + } else { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ ohdr->u.imm_data = wqe->wr.ex.imm_data; @@ -213,9 +221,9 @@ int hfi1_make_uc_req(struct hfi1_qp *qp) middle = HFI1_CAP_IS_KSET(SDMA_AHG); break; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { qp->s_state = OP(RDMA_WRITE_LAST); - else { + } else { qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ @@ -231,19 +239,28 @@ int hfi1_make_uc_req(struct hfi1_qp *qp) } qp->s_len -= len; qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; qp->s_cur_sge = &qp->s_sge; qp->s_cur_size = len; hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), - mask_psn(qp->s_next_psn++), middle); -done: - ret = 1; - goto unlock; + mask_psn(qp->s_psn++), middle, ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; bail: - qp->s_flags &= ~HFI1_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); - return ret; + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; } /** @@ -266,7 +283,7 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) u32 rcv_flags = packet->rcv_flags; void *data = packet->ebuf; u32 tlen = packet->tlen; - struct hfi1_qp *qp = packet->qp; + struct rvt_qp *qp = packet->qp; struct hfi1_other_headers *ohdr = packet->ohdr; u32 bth0, opcode; u32 hdrsize = packet->hlen; @@ -291,14 +308,14 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) u16 rlid = be16_to_cpu(hdr->lrh[3]); u8 sl, sc5; - lqpn = bth1 & HFI1_QPN_MASK; + lqpn = bth1 & RVT_QPN_MASK; rqpn = qp->remote_qpn; sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; sl = ibp->sc_to_sl[sc5]; process_becn(ppd, sl, rlid, lqpn, rqpn, - IB_CC_SVCTYPE_UC); + IB_CC_SVCTYPE_UC); } if (bth1 & HFI1_FECN_SMASK) { @@ -331,10 +348,11 @@ void hfi1_uc_rcv(struct hfi1_packet *packet) inv: if (qp->r_state == OP(SEND_FIRST) || qp->r_state == OP(SEND_MIDDLE)) { - set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags); + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); qp->r_sge.num_sge = 0; - } else - hfi1_put_ss(&qp->r_sge); + } else { + rvt_put_ss(&qp->r_sge); + } qp->r_state = OP(SEND_LAST); switch (opcode) { case OP(SEND_FIRST): @@ -381,7 +399,7 @@ inv: goto inv; } - if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST)) + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) qp_comm_est(qp); /* OK, process the packet. */ @@ -390,10 +408,10 @@ inv: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): send_first: - if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { qp->r_sge = qp->s_rdma_read_sge; - else { - ret = hfi1_get_rwqe(qp, 0); + } else { + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) goto op_err; if (!ret) @@ -417,7 +435,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, 0); + hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -442,8 +460,8 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, 0); - hfi1_put_ss(&qp->s_rdma_read_sge); + hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0); + rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -468,9 +486,9 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - (ohdr->bth[0] & - cpu_to_be32(IB_BTH_SOLICITED)) != 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); break; case OP(RDMA_WRITE_FIRST): @@ -491,8 +509,8 @@ rdma_first: int ok; /* Check rkey */ - ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, - vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) goto drop; qp->r_sge.num_sge = 1; @@ -503,9 +521,9 @@ rdma_first: qp->r_sge.sge.length = 0; qp->r_sge.sge.sge_length = 0; } - if (opcode == OP(RDMA_WRITE_ONLY)) + if (opcode == OP(RDMA_WRITE_ONLY)) { goto rdma_last; - else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { wc.ex.imm_data = ohdr->u.rc.imm_data; goto rdma_last_imm; } @@ -517,7 +535,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, 1); + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -535,10 +553,10 @@ rdma_last_imm: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) - hfi1_put_ss(&qp->s_rdma_read_sge); - else { - ret = hfi1_get_rwqe(qp, 1); + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + rvt_put_ss(&qp->s_rdma_read_sge); + } else { + ret = hfi1_rvt_get_rwqe(qp, 1); if (ret < 0) goto op_err; if (!ret) @@ -546,8 +564,8 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, 1); - hfi1_put_ss(&qp->r_sge); + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0); + rvt_put_ss(&qp->r_sge); goto last_imm; case OP(RDMA_WRITE_LAST): @@ -562,8 +580,8 @@ rdma_last: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, 1); - hfi1_put_ss(&qp->r_sge); + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0); + rvt_put_ss(&qp->r_sge); break; default: @@ -575,14 +593,12 @@ rdma_last: return; rewind: - set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags); + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); qp->r_sge.num_sge = 0; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; return; op_err: hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); - return; - } diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c index bd1b402c1e14..ae8a70f703eb 100644 --- a/drivers/staging/rdma/hfi1/ud.c +++ b/drivers/staging/rdma/hfi1/ud.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -53,6 +50,7 @@ #include "hfi.h" #include "mad.h" +#include "verbs_txreq.h" #include "qp.h" /** @@ -65,24 +63,25 @@ * Note that the receive interrupt handler may be calling hfi1_ud_rcv() * while this is being called. */ -static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) +static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) { struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); struct hfi1_pportdata *ppd; - struct hfi1_qp *qp; + struct rvt_qp *qp; struct ib_ah_attr *ah_attr; unsigned long flags; - struct hfi1_sge_state ssge; - struct hfi1_sge *sge; + struct rvt_sge_state ssge; + struct rvt_sge *sge; struct ib_wc wc; u32 length; enum ib_qp_type sqptype, dqptype; rcu_read_lock(); - qp = hfi1_lookup_qpn(ibp, swqe->ud_wr.remote_qpn); + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + swqe->ud_wr.remote_qpn); if (!qp) { - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; rcu_read_unlock(); return; } @@ -93,12 +92,12 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) IB_QPT_UD : qp->ibqp.qp_type; if (dqptype != sqptype || - !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) { - ibp->n_pkt_drops++; + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; goto drop; } - ah_attr = &to_iah(swqe->ud_wr.ah)->attr; + ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -161,35 +160,36 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) /* * Get the next work request entry to find where to put the data. */ - if (qp->r_flags & HFI1_R_REUSE_SGE) - qp->r_flags &= ~HFI1_R_REUSE_SGE; - else { + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { int ret; - ret = hfi1_get_rwqe(qp, 0); + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) { hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto bail_unlock; } if (!ret) { if (qp->ibqp.qp_num == 0) - ibp->n_vl15_dropped++; + ibp->rvp.n_vl15_dropped++; goto bail_unlock; } } /* Silently drop packets which are too big. */ if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_flags |= HFI1_R_REUSE_SGE; - ibp->n_pkt_drops++; + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; goto bail_unlock; } if (ah_attr->ah_flags & IB_AH_GRH) { hfi1_copy_sge(&qp->r_sge, &ah_attr->grh, - sizeof(struct ib_grh), 1); + sizeof(struct ib_grh), 1, 0); wc.wc_flags |= IB_WC_GRH; - } else + } else { hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + } ssge.sg_list = swqe->sg_list + 1; ssge.sge = *swqe->sg_list; ssge.num_sge = swqe->wr.num_sge; @@ -202,7 +202,7 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -210,7 +210,7 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) if (--ssge.num_sge) *sge = *ssge.sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= HFI1_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -222,8 +222,8 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) } length -= len; } - hfi1_put_ss(&qp->r_sge); - if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) goto bail_unlock; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -242,14 +242,14 @@ static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1)); /* Check for loopback when the port lid is not set */ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) - wc.slid = HFI1_PERMISSIVE_LID; + wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); wc.sl = ah_attr->sl; wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - swqe->wr.send_flags & IB_SEND_SOLICITED); - ibp->n_loop_pkts++; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); drop: @@ -260,47 +260,53 @@ drop: * hfi1_make_ud_req - construct a UD request packet * @qp: the QP * + * Assume s_lock is held. + * * Return 1 if constructed; otherwise, return 0. */ -int hfi1_make_ud_req(struct hfi1_qp *qp) +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_other_headers *ohdr; struct ib_ah_attr *ah_attr; struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; - struct hfi1_swqe *wqe; - unsigned long flags; + struct rvt_swqe *wqe; u32 nwords; u32 extra_bytes; u32 bth0; u16 lrh0; u16 lid; - int ret = 0; int next_cur; u8 sc5; - spin_lock_irqsave(&qp->s_lock, flags); + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; - if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) { - if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) goto bail; /* We are in the error state, flush the work request. */ - if (qp->s_last == qp->s_head) + smp_read_barrier_depends(); /* see post_one_send */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) goto bail; /* If DMAs are in progress, we can't flush immediately. */ - if (atomic_read(&qp->s_iowait.sdma_busy)) { - qp->s_flags |= HFI1_S_WAIT_DMA; + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); - goto done; + goto done_free_tx; } - if (qp->s_cur == qp->s_head) + /* see post_one_send() */ + smp_read_barrier_depends(); + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) goto bail; - wqe = get_swqe_ptr(qp, qp->s_cur); + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); next_cur = qp->s_cur + 1; if (next_cur >= qp->s_size) next_cur = 0; @@ -308,13 +314,15 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &to_iah(wqe->ud_wr.ah)->attr; - if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE || - ah_attr->dlid == HFI1_PERMISSIVE_LID) { + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); - if (unlikely(!loopback && (lid == ppd->lid || - (lid == HFI1_PERMISSIVE_LID && - qp->ibqp.qp_type == IB_QPT_GSI)))) { + if (unlikely(!loopback && + (lid == ppd->lid || + (lid == be16_to_cpu(IB_LID_PERMISSIVE) && + qp->ibqp.qp_type == IB_QPT_GSI)))) { + unsigned long flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -322,16 +330,17 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) * Instead of waiting, we could queue a * zero length descriptor so we get a callback. */ - if (atomic_read(&qp->s_iowait.sdma_busy)) { - qp->s_flags |= HFI1_S_WAIT_DMA; + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; goto bail; } qp->s_cur = next_cur; + local_irq_save(flags); spin_unlock_irqrestore(&qp->s_lock, flags); ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, flags); hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); - goto done; + goto done_free_tx; } } @@ -353,11 +362,12 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) if (ah_attr->ah_flags & IB_AH_GRH) { /* Header size in 32-bit words. */ - qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh, - &ah_attr->grh, - qp->s_hdrwords, nwords); + qp->s_hdrwords += hfi1_make_grh(ibp, + &ps->s_txreq->phdr.hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); lrh0 = HFI1_LRH_GRH; - ohdr = &qp->s_hdr->ibh.u.l.oth; + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; /* * Don't worry about sending to locally attached multicast * QPs. It is unspecified by the spec. what happens. @@ -365,37 +375,42 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) } else { /* Header size in 32-bit words. */ lrh0 = HFI1_LRH_BTH; - ohdr = &qp->s_hdr->ibh.u.oth; + ohdr = &ps->s_txreq->phdr.hdr.u.oth; } if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { qp->s_hdrwords++; ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else + } else { bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } sc5 = ibp->sl_to_sc[ah_attr->sl]; lrh0 |= (ah_attr->sl & 0xf) << 4; if (qp->ibqp.qp_type == IB_QPT_SMI) { lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ - qp->s_sc = 0xf; + priv->s_sc = 0xf; } else { lrh0 |= (sc5 & 0xf) << 12; - qp->s_sc = sc5; + priv->s_sc = sc5; } - qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc); - qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ - qp->s_hdr->ibh.lrh[2] = + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + ps->s_txreq->sde = priv->s_sde; + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + ps->s_txreq->psc = priv->s_sendcontext; + ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); + ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); + ps->s_txreq->phdr.hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) - qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE; - else { + if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; + } else { lid = ppd->lid; if (lid) { lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); - qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid); - } else - qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE; + ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid); + } else { + ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; + } } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= IB_BTH_SOLICITED; @@ -406,7 +421,7 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); - ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++)); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). @@ -415,20 +430,28 @@ int hfi1_make_ud_req(struct hfi1_qp *qp) qp->qkey : wqe->ud_wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ - qp->s_hdr->ahgcount = 0; - qp->s_hdr->ahgidx = 0; - qp->s_hdr->tx_flags = 0; - qp->s_hdr->sde = NULL; + priv->s_hdr->ahgcount = 0; + priv->s_hdr->ahgidx = 0; + priv->s_hdr->tx_flags = 0; + priv->s_hdr->sde = NULL; + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + + return 1; -done: - ret = 1; - goto unlock; +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; bail: - qp->s_flags &= ~HFI1_S_BUSY; -unlock: - spin_unlock_irqrestore(&qp->s_lock, flags); - return ret; + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; } /* @@ -476,7 +499,7 @@ int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) return -1; } -void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn, +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh) { @@ -550,7 +573,7 @@ void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn, * opa_smp_check() returns 0 if all checks succeed, 1 otherwise. */ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, - struct hfi1_qp *qp, u16 slid, struct opa_smp *smp) + struct rvt_qp *qp, u16 slid, struct opa_smp *smp) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); @@ -607,7 +630,7 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, case IB_MGMT_METHOD_TRAP: case IB_MGMT_METHOD_GET_RESP: case IB_MGMT_METHOD_REPORT_RESP: - if (ibp->port_cap_flags & IB_PORT_SM) + if (ibp->rvp.port_cap_flags & IB_PORT_SM) return 0; if (pkey == FULL_MGMT_P_KEY) { smp->status |= IB_SMP_UNSUP_METHOD; @@ -624,7 +647,6 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, return 0; } - /** * hfi1_ud_rcv - receive an incoming UD packet * @ibp: the port the packet came in on @@ -654,7 +676,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) u32 rcv_flags = packet->rcv_flags; void *data = packet->ebuf; u32 tlen = packet->tlen; - struct hfi1_qp *qp = packet->qp; + struct rvt_qp *qp = packet->qp; bool has_grh = rcv_flags & HFI1_HAS_GRH; bool sc4_bit = has_sc4_bit(packet); u8 sc; @@ -663,18 +685,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct ib_grh *grh = NULL; qkey = be32_to_cpu(ohdr->u.ud.deth[0]); - src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK; + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; dlid = be16_to_cpu(hdr->lrh[1]); - is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) && - (dlid != HFI1_PERMISSIVE_LID); + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); bth1 = be32_to_cpu(ohdr->bth[1]); if (unlikely(bth1 & HFI1_BECN_SMASK)) { /* * In pre-B0 h/w the CNP_OPCODE is handled via an - * error path (errata 291394). + * error path. */ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; u8 sl, sc5; sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; @@ -750,7 +772,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); if (mgmt_pkey_idx < 0) goto drop; - } if (unlikely(qkey != qp->qkey)) { hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, @@ -788,7 +809,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); if (mgmt_pkey_idx < 0) goto drop; - } if (qp->ibqp.qp_num > 1 && @@ -799,8 +819,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { wc.ex.imm_data = 0; wc.wc_flags = 0; - } else + } else { goto drop; + } /* * A GRH is expected to precede the data even if not @@ -811,36 +832,38 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) /* * Get the next work request entry to find where to put the data. */ - if (qp->r_flags & HFI1_R_REUSE_SGE) - qp->r_flags &= ~HFI1_R_REUSE_SGE; - else { + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { int ret; - ret = hfi1_get_rwqe(qp, 0); + ret = hfi1_rvt_get_rwqe(qp, 0); if (ret < 0) { hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; } if (!ret) { if (qp->ibqp.qp_num == 0) - ibp->n_vl15_dropped++; + ibp->rvp.n_vl15_dropped++; return; } } /* Silently drop packets which are too big. */ if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_flags |= HFI1_R_REUSE_SGE; + qp->r_flags |= RVT_R_REUSE_SGE; goto drop; } if (has_grh) { hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, - sizeof(struct ib_grh), 1); + sizeof(struct ib_grh), 1, 0); wc.wc_flags |= IB_WC_GRH; - } else + } else { hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); - hfi1_put_ss(&qp->r_sge); - if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + } + hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + 1, 0); + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; @@ -862,8 +885,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } } wc.pkey_index = (unsigned)mgmt_pkey_idx; - } else + } else { wc.pkey_index = 0; + } wc.slid = be16_to_cpu(hdr->lrh[3]); sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; @@ -873,15 +897,15 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 : + wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, - (ohdr->bth[0] & - cpu_to_be32(IB_BTH_SOLICITED)) != 0); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); return; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; } diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c new file mode 100644 index 000000000000..0861e095df8d --- /dev/null +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c @@ -0,0 +1,1044 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <asm/page.h> + +#include "user_exp_rcv.h" +#include "trace.h" +#include "mmu_rb.h" + +struct tid_group { + struct list_head list; + unsigned base; + u8 size; + u8 used; + u8 map; +}; + +struct tid_rb_node { + struct mmu_rb_node mmu; + unsigned long phys; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned npages; + struct page *pages[0]; +}; + +struct tid_pageset { + u16 idx; + u16 count; +}; + +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + +#define num_user_pages(vaddr, len) \ + (1 + (((((unsigned long)(vaddr) + \ + (unsigned long)(len) - 1) & PAGE_MASK) - \ + ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) + +static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, + struct rb_root *); +static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); +static int set_rcvarray_entry(struct file *, unsigned long, u32, + struct tid_group *, struct page **, unsigned); +static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); +static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *, bool); +static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int program_rcvarray(struct file *, unsigned long, struct tid_group *, + struct tid_pageset *, unsigned, u16, struct page **, + u32 *, unsigned *, unsigned *); +static int unprogram_rcvarray(struct file *, u32, struct tid_group **); +static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *); + +static struct mmu_rb_ops tid_rb_ops = { + .insert = mmu_rb_insert, + .remove = mmu_rb_remove, + .invalidate = mmu_rb_invalidate +}; + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} + +static inline void exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +/* + * Initialize context and file private data needed for Expected + * receive caching. This needs to be done after the context has + * been configured with the eager/expected RcvEntry counts. + */ +int hfi1_user_exp_rcv_init(struct file *fp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned tidbase; + int i, ret = 0; + + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->tid_rb_root = RB_ROOT; + + if (!uctxt->subctxt_cnt || !fd->subctxt) { + exp_tid_group_init(&uctxt->tid_group_list); + exp_tid_group_init(&uctxt->tid_used_list); + exp_tid_group_init(&uctxt->tid_full_list); + + tidbase = uctxt->expected_base; + for (i = 0; i < uctxt->expected_count / + dd->rcv_entries.group_size; i++) { + struct tid_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) { + /* + * If we fail here, the groups already + * allocated will be freed by the close + * call. + */ + ret = -ENOMEM; + goto done; + } + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &uctxt->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + } + + fd->entry_to_rb = kcalloc(uctxt->expected_count, + sizeof(struct rb_node *), + GFP_KERNEL); + if (!fd->entry_to_rb) + return -ENOMEM; + + if (!HFI1_CAP_IS_USET(TID_UNMAP)) { + fd->invalid_tid_idx = 0; + fd->invalid_tids = kzalloc(uctxt->expected_count * + sizeof(u32), GFP_KERNEL); + if (!fd->invalid_tids) { + ret = -ENOMEM; + goto done; + } + + /* + * Register MMU notifier callbacks. If the registration + * fails, continue but turn off the TID caching for + * all user contexts. + */ + ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops); + if (ret) { + dd_dev_info(dd, + "Failed MMU notifier registration %d\n", + ret); + HFI1_CAP_USET(TID_UNMAP); + ret = 0; + } + } + + /* + * PSM does not have a good way to separate, count, and + * effectively enforce a limit on RcvArray entries used by + * subctxts (when context sharing is used) when TID caching + * is enabled. To help with that, we calculate a per-process + * RcvArray entry share and enforce that. + * If TID caching is not in use, PSM deals with usage on its + * own. In that case, we allow any subctxt to take all of the + * entries. + * + * Make sure that we set the tid counts only after successful + * init. + */ + spin_lock(&fd->tid_lock); + if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { + u16 remainder; + + fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; + remainder = uctxt->expected_count % uctxt->subctxt_cnt; + if (remainder && fd->subctxt < remainder) + fd->tid_limit++; + } else { + fd->tid_limit = uctxt->expected_count; + } + spin_unlock(&fd->tid_lock); +done: + return ret; +} + +int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_group *grp, *gptr; + + /* + * The notifier would have been removed when the process'es mm + * was freed. + */ + if (!HFI1_CAP_IS_USET(TID_UNMAP)) + hfi1_mmu_rb_unregister(&fd->tid_rb_root); + + kfree(fd->invalid_tids); + + if (!uctxt->cnt) { + if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) + unlock_exp_tids(uctxt, &uctxt->tid_full_list, + &fd->tid_rb_root); + if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) + unlock_exp_tids(uctxt, &uctxt->tid_used_list, + &fd->tid_rb_root); + list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, + list) { + list_del_init(&grp->list); + kfree(grp); + } + hfi1_clear_tids(uctxt); + } + + kfree(fd->entry_to_rb); + return 0; +} + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) + writeq(0, dd->rcvarray_wc + (index * 8)); +} + +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ +int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) +{ + int ret = 0, need_group = 0, pinned; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, + tididx = 0, mapped, mapped_pages = 0; + unsigned long vaddr = tinfo->vaddr; + struct page **pages = NULL; + u32 *tidlist = NULL; + struct tid_pageset *pagesets = NULL; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tinfo->length); + if (!npages) + return -EINVAL; + + if (npages > uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + + pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), + GFP_KERNEL); + if (!pagesets) + return -ENOMEM; + + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto bail; + } + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) + return -ENOMEM; + pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + if (pinned <= 0) { + ret = pinned; + goto bail; + } + fd->tid_n_pinned += npages; + + /* Find sets of physically contiguous pages */ + npagesets = find_phys_blocks(pages, pinned, pagesets); + + /* + * We don't need to access this under a lock since tid_used is per + * process and the same process cannot be in hfi1_user_exp_rcv_clear() + * and hfi1_user_exp_rcv_setup() at the same time. + */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + npagesets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = npagesets; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) + goto bail; + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto nomem; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. + */ + mutex_lock(&uctxt->exp_lock); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, dd->rcv_entries.group_size, + pages, tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + pageidx += ret; + mapped_pages += mapped; + } + + while (pageidx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - pageidx, + grp->size - grp->used); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, use, pages, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + ret = -EFAULT; + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + pageidx += ret; + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (pageidx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_lock); +nomem: + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + if (tididx) { + spin_lock(&fd->tid_lock); + fd->tid_used += tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, + tidlist, sizeof(tidlist[0]) * tididx)) { + /* + * On failure to copy to the user level, we need to undo + * everything done so far so we don't leak resources. + */ + tinfo->tidlist = (unsigned long)&tidlist; + hfi1_user_exp_rcv_clear(fp, tinfo); + tinfo->tidlist = 0; + ret = -EFAULT; + goto bail; + } + } + + /* + * If not everything was mapped (due to insufficient RcvArray entries, + * for example), unpin all unmapped pages so we can pin them nex time. + */ + if (mapped_pages != pinned) { + hfi1_release_user_pages(current->mm, &pages[mapped_pages], + pinned - mapped_pages, + false); + fd->tid_n_pinned -= pinned - mapped_pages; + } +bail: + kfree(pagesets); + kfree(pages); + kfree(tidlist); + return ret > 0 ? 0 : ret; +} + +int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo) +{ + int ret = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + u32 *tidinfo; + unsigned tididx; + + tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL); + if (!tidinfo) + return -ENOMEM; + + if (copy_from_user(tidinfo, (void __user *)(unsigned long) + tinfo->tidlist, sizeof(tidinfo[0]) * + tinfo->tidcnt)) { + ret = -EFAULT; + goto done; + } + + mutex_lock(&uctxt->exp_lock); + for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { + ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL); + if (ret) { + hfi1_cdbg(TID, "Failed to unprogram rcv array %d", + ret); + break; + } + } + spin_lock(&fd->tid_lock); + fd->tid_used -= tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + mutex_unlock(&uctxt->exp_lock); +done: + kfree(tidinfo); + return ret; +} + +int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + unsigned long *ev = uctxt->dd->events + + (((uctxt->ctxt - uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt); + u32 *array; + int ret = 0; + + if (!fd->invalid_tids) + return -EINVAL; + + /* + * copy_to_user() can sleep, which will leave the invalid_lock + * locked and cause the MMU notifier to be blocked on the lock + * for a long time. + * Copy the data to a local buffer so we can release the lock. + */ + array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL); + if (!array) + return -EFAULT; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx) { + memcpy(array, fd->invalid_tids, sizeof(*array) * + fd->invalid_tid_idx); + memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) * + fd->invalid_tid_idx); + tinfo->tidcnt = fd->invalid_tid_idx; + fd->invalid_tid_idx = 0; + /* + * Reset the user flag while still holding the lock. + * Otherwise, PSM can miss events. + */ + clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } else { + tinfo->tidcnt = 0; + } + spin_unlock(&fd->invalid_lock); + + if (tinfo->tidcnt) { + if (copy_to_user((void __user *)tinfo->tidlist, + array, sizeof(*array) * tinfo->tidcnt)) + ret = -EFAULT; + } + kfree(array); + + return ret; +} + +static u32 find_phys_blocks(struct page **pages, unsigned npages, + struct tid_pageset *list) +{ + unsigned pagecount, pageidx, setcount = 0, i; + unsigned long pfn, this_pfn; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + pfn = page_to_pfn(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_pfn = i < npages ? page_to_pfn(pages[i]) : 0; + + /* + * If the pfn's are not sequential, pages are not physically + * contiguous. + */ + if (this_pfn != ++pfn) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + pfn = this_pfn; + } else { + pagecount++; + } + } + return setcount; +} + +/** + * program_rcvarray() - program an RcvArray group with receive buffers + * @fp: file pointer + * @vaddr: starting user virtual address + * @grp: RcvArray group + * @sets: array of struct tid_pageset holding information on physically + * contiguous chunks from the user buffer + * @start: starting index into sets array + * @count: number of struct tid_pageset's to program + * @pages: an array of struct page * for the user buffer + * @tidlist: the array of u32 elements when the information about the + * programmed RcvArray entries is to be encoded. + * @tididx: starting offset into tidlist + * @pmapped: (output parameter) number of pages programmed into the RcvArray + * entries. + * + * This function will program up to 'count' number of RcvArray entries from the + * group 'grp'. To make best use of write-combining writes, the function will + * perform writes to the unused RcvArray entries which will be ignored by the + * HW. Each RcvArray entry will be programmed with a physically contiguous + * buffer chunk from the user's virtual buffer. + * + * Return: + * -EINVAL if the requested count is larger than the size of the group, + * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or + * number of RcvArray entries programmed. + */ +static int program_rcvarray(struct file *fp, unsigned long vaddr, + struct tid_group *grp, + struct tid_pageset *sets, + unsigned start, u16 count, struct page **pages, + u32 *tidlist, unsigned *tididx, unsigned *pmapped) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + u16 idx; + u32 tidinfo = 0, rcventry, useidx = 0; + int mapped = 0; + + /* Count should never be larger than the group size */ + if (count > grp->size) + return -EINVAL; + + /* Find the first unused entry in the group */ + for (idx = 0; idx < grp->size; idx++) { + if (!(grp->map & (1 << idx))) { + useidx = idx; + break; + } + rcv_array_wc_fill(dd, grp->base + idx); + } + + idx = 0; + while (idx < count) { + u16 npages, pageidx, setidx = start + idx; + int ret = 0; + + /* + * If this entry in the group is used, move to the next one. + * If we go past the end of the group, exit the loop. + */ + if (useidx >= grp->size) { + break; + } else if (grp->map & (1 << useidx)) { + rcv_array_wc_fill(dd, grp->base + useidx); + useidx++; + continue; + } + + rcventry = grp->base + useidx; + npages = sets[setidx].count; + pageidx = sets[setidx].idx; + + ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE), + rcventry, grp, pages + pageidx, + npages); + if (ret) + return ret; + mapped += npages; + + tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | + EXP_TID_SET(LEN, npages); + tidlist[(*tididx)++] = tidinfo; + grp->used++; + grp->map |= 1 << useidx++; + idx++; + } + + /* Fill the rest of the group with "blank" writes */ + for (; useidx < grp->size; useidx++) + rcv_array_wc_fill(dd, grp->base + useidx); + *pmapped = mapped; + return idx; +} + +static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, + u32 rcventry, struct tid_group *grp, + struct page **pages, unsigned npages) +{ + int ret; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_rb_node *node; + struct hfi1_devdata *dd = uctxt->dd; + struct rb_root *root = &fd->tid_rb_root; + dma_addr_t phys; + + /* + * Allocate the node first so we can handle a potential + * failure before we've programmed anything. + */ + node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages), + GFP_KERNEL); + if (!node) + return -ENOMEM; + + phys = pci_map_single(dd->pcidev, + __va(page_to_phys(pages[0])), + npages * PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&dd->pcidev->dev, phys)) { + dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n", + phys); + kfree(node); + return -EFAULT; + } + + node->mmu.addr = vaddr; + node->mmu.len = npages * PAGE_SIZE; + node->phys = page_to_phys(pages[0]); + node->npages = npages; + node->rcventry = rcventry; + node->dma_addr = phys; + node->grp = grp; + node->freed = false; + memcpy(node->pages, pages, sizeof(struct page *) * npages); + + if (HFI1_CAP_IS_USET(TID_UNMAP)) + ret = mmu_rb_insert(root, &node->mmu); + else + ret = hfi1_mmu_rb_insert(root, &node->mmu); + + if (ret) { + hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", + node->rcventry, node->mmu.addr, node->phys, ret); + pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE, + PCI_DMA_FROMDEVICE); + kfree(node); + return -EFAULT; + } + hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1); + trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages, + node->mmu.addr, node->phys, phys); + return 0; +} + +static int unprogram_rcvarray(struct file *fp, u32 tidinfo, + struct tid_group **grp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + struct tid_rb_node *node; + u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; + + if (tididx >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + tididx, uctxt->ctxt); + return -EINVAL; + } + + if (tidctrl == 0x3) + return -EINVAL; + + rcventry = tididx + (tidctrl - 1); + + node = fd->entry_to_rb[rcventry]; + if (!node || node->rcventry != (uctxt->expected_base + rcventry)) + return -EBADF; + if (HFI1_CAP_IS_USET(TID_UNMAP)) + mmu_rb_remove(&fd->tid_rb_root, &node->mmu, false); + else + hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); + + if (grp) + *grp = node->grp; + clear_tid_node(fd, fd->subctxt, node); + return 0; +} + +static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, + struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, + node->npages, node->mmu.addr, node->phys, + node->dma_addr); + + hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0); + /* + * Make sure device has seen the write before we unpin the + * pages. + */ + flush_wc(); + + pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len, + PCI_DMA_FROMDEVICE); + hfi1_release_user_pages(current->mm, node->pages, node->npages, true); + fd->tid_n_pinned -= node->npages; + + node->grp->used--; + node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); + + if (node->grp->used == node->grp->size - 1) + tid_group_move(node->grp, &uctxt->tid_full_list, + &uctxt->tid_used_list); + else if (!node->grp->used) + tid_group_move(node->grp, &uctxt->tid_used_list, + &uctxt->tid_group_list); + kfree(node); +} + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, struct rb_root *root) +{ + struct tid_group *grp, *ptr; + struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata, + tid_rb_root); + int i; + + list_for_each_entry_safe(grp, ptr, &set->list, list) { + list_del_init(&grp->list); + + for (i = 0; i < grp->size; i++) { + if (grp->map & (1 << i)) { + u16 rcventry = grp->base + i; + struct tid_rb_node *node; + + node = fd->entry_to_rb[rcventry - + uctxt->expected_base]; + if (!node || node->rcventry != rcventry) + continue; + if (HFI1_CAP_IS_USET(TID_UNMAP)) + mmu_rb_remove(&fd->tid_rb_root, + &node->mmu, false); + else + hfi1_mmu_rb_remove(&fd->tid_rb_root, + &node->mmu); + clear_tid_node(fd, -1, node); + } + } + } +} + +static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct tid_rb_node *node = + container_of(mnode, struct tid_rb_node, mmu); + + if (node->freed) + return 0; + + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr, + node->rcventry, node->npages, node->dma_addr); + node->freed = true; + + spin_lock(&fdata->invalid_lock); + if (fdata->invalid_tid_idx < uctxt->expected_count) { + fdata->invalid_tids[fdata->invalid_tid_idx] = + rcventry2tidinfo(node->rcventry - uctxt->expected_base); + fdata->invalid_tids[fdata->invalid_tid_idx] |= + EXP_TID_SET(LEN, node->npages); + if (!fdata->invalid_tid_idx) { + unsigned long *ev; + + /* + * hfi1_set_uevent_bits() sets a user event flag + * for all processes. Because calling into the + * driver to process TID cache invalidations is + * expensive and TID cache invalidations are + * handled on a per-process basis, we can + * optimize this to set the flag only for the + * process in question. + */ + ev = uctxt->dd->events + + (((uctxt->ctxt - uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fdata->subctxt); + set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } + fdata->invalid_tid_idx++; + } + spin_unlock(&fdata->invalid_lock); + return 0; +} + +static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = tnode; + return 0; +} + +static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node, + bool notifier) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = NULL; +} diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h index 4f4876e1d353..9bc8d9fba87e 100644 --- a/drivers/staging/rdma/hfi1/user_exp_rcv.h +++ b/drivers/staging/rdma/hfi1/user_exp_rcv.h @@ -1,14 +1,13 @@ #ifndef _HFI1_USER_EXP_RCV_H #define _HFI1_USER_EXP_RCV_H /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -20,8 +19,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -50,6 +47,8 @@ * */ +#include "hfi.h" + #define EXP_TID_TIDLEN_MASK 0x7FFULL #define EXP_TID_TIDLEN_SHIFT 0 #define EXP_TID_TIDCTRL_MASK 0x3ULL @@ -71,4 +70,10 @@ (tid) |= EXP_TID_SET(field, (value)); \ } while (0) +int hfi1_user_exp_rcv_init(struct file *); +int hfi1_user_exp_rcv_free(struct hfi1_filedata *); +int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *); + #endif /* _HFI1_USER_EXP_RCV_H */ diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c index 692de658f0dc..88e10b5f55f1 100644 --- a/drivers/staging/rdma/hfi1/user_pages.c +++ b/drivers/staging/rdma/hfi1/user_pages.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -51,36 +48,62 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/device.h> +#include <linux/module.h> #include "hfi.h" -/** - * hfi1_map_page - a safety wrapper around pci_map_page() +static unsigned long cache_size = 256; +module_param(cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); + +/* + * Determine whether the caller can pin pages. + * + * This function should be used in the implementation of buffer caches. + * The cache implementation should call this function prior to attempting + * to pin buffer pages in order to determine whether they should do so. + * The function computes cache limits based on the configured ulimit and + * cache size. Use of this function is especially important for caches + * which are not limited in any other way (e.g. by HW resources) and, thus, + * could keeping caching buffers. * */ -dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) { - dma_addr_t phys; + unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, + size = (cache_size * (1UL << 20)); /* convert to bytes */ + unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt; + bool can_lock = capable(CAP_IPC_LOCK); - phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * Calculate per-cache size. The calculation below uses only a quarter + * of the available per-context limit. This leaves space for other + * pinning. Should we worry about shared ctxts? + */ + cache_limit = (ulimit / usr_ctxts) / 4; - return phys; -} + /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */ + if (ulimit != (-1UL) && size > cache_limit) + size = cache_limit; -int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, - struct page **pages) -{ - unsigned long pinned, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - bool can_lock = capable(CAP_IPC_LOCK); - int ret; + /* Convert to number of pages */ + size = DIV_ROUND_UP(size, PAGE_SIZE); down_read(¤t->mm->mmap_sem); pinned = current->mm->pinned_vm; up_read(¤t->mm->mmap_sem); - if (pinned + npages > lock_limit && !can_lock) - return -ENOMEM; + /* First, check the absolute limit against all pinned pages. */ + if (pinned + npages >= ulimit && !can_lock) + return false; + + return ((nlocked + npages) <= size) || can_lock; +} + +int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, + struct page **pages) +{ + int ret; ret = get_user_pages_fast(vaddr, npages, writable, pages); if (ret < 0) @@ -93,7 +116,8 @@ int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, return ret; } -void hfi1_release_user_pages(struct page **p, size_t npages, bool dirty) +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty) { size_t i; @@ -103,9 +127,9 @@ void hfi1_release_user_pages(struct page **p, size_t npages, bool dirty) put_page(p[i]); } - if (current->mm) { /* during close after signal, mm can be NULL */ - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm -= npages; - up_write(¤t->mm->mmap_sem); + if (mm) { /* during close after signal, mm can be NULL */ + down_write(&mm->mmap_sem); + mm->pinned_vm -= npages; + up_write(&mm->mmap_sem); } } diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c index d3de771a0770..46e254d52dad 100644 --- a/drivers/staging/rdma/hfi1/user_sdma.c +++ b/drivers/staging/rdma/hfi1/user_sdma.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -71,6 +68,7 @@ #include "verbs.h" /* for the headers */ #include "common.h" /* for struct hfi1_tid_info */ #include "trace.h" +#include "mmu_rb.h" static uint hfi1_sdma_comp_ring_size = 128; module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); @@ -147,7 +145,6 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12 /* Last packet in the request */ #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0) -#define TXREQ_FLAGS_IOVEC_LAST_PKT BIT(0) #define SDMA_REQ_IN_USE 0 #define SDMA_REQ_FOR_THREAD 1 @@ -171,16 +168,28 @@ static unsigned initial_pkt_count = 8; #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ struct user_sdma_iovec { + struct list_head list; struct iovec iov; /* number of pages in this vector */ unsigned npages; /* array of pinned pages for this vector */ struct page **pages; - /* offset into the virtual address space of the vector at - * which we last left off. */ + /* + * offset into the virtual address space of the vector at + * which we last left off. + */ u64 offset; }; +struct sdma_mmu_node { + struct mmu_rb_node rb; + struct list_head list; + struct hfi1_user_sdma_pkt_q *pq; + atomic_t refcount; + struct page **pages; + unsigned npages; +}; + struct user_sdma_request { struct sdma_req_info info; struct hfi1_user_sdma_pkt_q *pq; @@ -214,15 +223,6 @@ struct user_sdma_request { */ u8 omfactor; /* - * pointer to the user's mm_struct. We are going to - * get a reference to it so it doesn't get freed - * since we might not be in process context when we - * are processing the iov's. - * Using this mm_struct, we can get vma based on the - * iov's address (find_vma()). - */ - struct mm_struct *user_mm; - /* * We copy the iovs for this request (based on * info.iovcnt). These are only the data vectors */ @@ -239,13 +239,12 @@ struct user_sdma_request { u16 tididx; u32 sent; u64 seqnum; + u64 seqcomp; + u64 seqsubmitted; struct list_head txps; - spinlock_t txcmp_lock; /* protect txcmp list */ - struct list_head txcmp; unsigned long flags; /* status of the last txreq completed */ int status; - struct work_struct worker; }; /* @@ -260,11 +259,6 @@ struct user_sdma_txreq { struct sdma_txreq txreq; struct list_head list; struct user_sdma_request *req; - struct { - struct user_sdma_iovec *vec; - u8 flags; - } iovecs[3]; - int idx; u16 flags; unsigned busycount; u64 seqnum; @@ -280,21 +274,21 @@ struct user_sdma_txreq { static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); static int num_user_pages(const struct iovec *); -static void user_sdma_txreq_cb(struct sdma_txreq *, int, int); -static void user_sdma_delayed_completion(struct work_struct *); -static void user_sdma_free_request(struct user_sdma_request *); +static void user_sdma_txreq_cb(struct sdma_txreq *, int); +static inline void pq_update(struct hfi1_user_sdma_pkt_q *); +static void user_sdma_free_request(struct user_sdma_request *, bool); static int pin_vector_pages(struct user_sdma_request *, struct user_sdma_iovec *); -static void unpin_vector_pages(struct user_sdma_request *, - struct user_sdma_iovec *); +static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned); static int check_header_template(struct user_sdma_request *, struct hfi1_pkt_header *, u32, u32); static int set_txreq_header(struct user_sdma_request *, struct user_sdma_txreq *, u32); static int set_txreq_header_ahg(struct user_sdma_request *, struct user_sdma_txreq *, u32); -static inline void set_comp_state(struct user_sdma_request *, - enum hfi1_sdma_comp_state, int); +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *, + struct hfi1_user_sdma_comp_q *, + u16, enum hfi1_sdma_comp_state, int); static inline u32 set_pkt_bth_psn(__be32, u8, u32); static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); @@ -304,6 +298,17 @@ static int defer_packet_queue( struct sdma_txreq *, unsigned seq); static void activate_packet_queue(struct iowait *, int); +static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); +static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *); +static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *, bool); +static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *); + +static struct mmu_rb_ops sdma_rb_ops = { + .filter = sdma_rb_filter, + .insert = sdma_rb_insert, + .remove = sdma_rb_remove, + .invalidate = sdma_rb_invalidate +}; static int defer_packet_queue( struct sdma_engine *sde, @@ -381,7 +386,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) goto pq_nomem; memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size; - pq->reqs = kmalloc(memsize, GFP_KERNEL); + pq->reqs = kzalloc(memsize, GFP_KERNEL); if (!pq->reqs) goto pq_reqs_nomem; @@ -393,9 +398,12 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); + pq->sdma_rb_root = RB_ROOT; + INIT_LIST_HEAD(&pq->evict); + spin_lock_init(&pq->evict_lock); iowait_init(&pq->busy, 0, NULL, defer_packet_queue, - activate_packet_queue); + activate_packet_queue, NULL); pq->reqidx = 0; snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, fd->subctxt); @@ -423,6 +431,12 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) cq->nentries = hfi1_sdma_comp_ring_size; fd->cq = cq; + ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops); + if (ret) { + dd_dev_err(dd, "Failed to register with MMU %d", ret); + goto done; + } + spin_lock_irqsave(&uctxt->sdma_qlock, flags); list_add(&pq->list, &uctxt->sdma_queues); spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); @@ -452,6 +466,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, uctxt->ctxt, fd->subctxt); pq = fd->pq; + hfi1_mmu_rb_unregister(&pq->sdma_rb_root); if (pq) { spin_lock_irqsave(&uctxt->sdma_qlock, flags); if (!list_empty(&pq->list)) @@ -479,7 +494,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, unsigned long dim, unsigned long *count) { - int ret = 0, i = 0, sent; + int ret = 0, i = 0; struct hfi1_filedata *fd = fp->private_data; struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_user_sdma_pkt_q *pq = fd->pq; @@ -505,9 +520,11 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, dd->unit, uctxt->ctxt, fd->subctxt, ret); return -EFAULT; } + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, (u16 *)&info); - if (cq->comps[info.comp_idx].status == QUEUED) { + if (cq->comps[info.comp_idx].status == QUEUED || + test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); @@ -534,10 +551,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, req->cq = cq; req->status = -1; INIT_LIST_HEAD(&req->txps); - INIT_LIST_HEAD(&req->txcmp); - INIT_WORK(&req->worker, user_sdma_delayed_completion); - spin_lock_init(&req->txcmp_lock); memcpy(&req->info, &info, sizeof(info)); if (req_opcode(info.ctrl) == EXPECTED) @@ -596,8 +610,10 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, } req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); - /* Calculate the initial TID offset based on the values of - KDETH.OFFSET and KDETH.OM that are passed in. */ + /* + * Calculate the initial TID offset based on the values of + * KDETH.OFFSET and KDETH.OM that are passed in. + */ req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? KDETH_OM_LARGE : KDETH_OM_SMALL); @@ -606,8 +622,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, /* Save all the IO vector structures */ while (i < req->data_iovs) { + INIT_LIST_HEAD(&req->iovs[i].list); memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); - req->iovs[i].offset = 0; + ret = pin_vector_pages(req, &req->iovs[i]); + if (ret) { + req->status = ret; + goto free_req; + } req->data_len += req->iovs[i++].iov.iov_len; } SDMA_DBG(req, "total data length %u", req->data_len); @@ -671,52 +692,59 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, } } - set_comp_state(req, QUEUED, 0); + set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); + atomic_inc(&pq->n_reqs); /* Send the first N packets in the request to buy us some time */ - sent = user_sdma_send_pkts(req, pcount); - if (unlikely(sent < 0)) { - if (sent != -EBUSY) { - req->status = sent; - set_comp_state(req, ERROR, req->status); - return sent; - } else - sent = 0; + ret = user_sdma_send_pkts(req, pcount); + if (unlikely(ret < 0 && ret != -EBUSY)) { + req->status = ret; + goto free_req; } - atomic_inc(&pq->n_reqs); - xchg(&pq->state, SDMA_PKT_Q_ACTIVE); - if (sent < req->info.npkts) { - /* - * This is a somewhat blocking send implementation. - * The driver will block the caller until all packets of the - * request have been submitted to the SDMA engine. However, it - * will not wait for send completions. - */ - while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { - ret = user_sdma_send_pkts(req, pcount); - if (ret < 0) { - if (ret != -EBUSY) { - req->status = ret; - return ret; - } - wait_event_interruptible_timeout( - pq->busy.wait_dma, - (pq->state == SDMA_PKT_Q_ACTIVE), - msecs_to_jiffies( - SDMA_IOWAIT_TIMEOUT)); + /* + * It is possible that the SDMA engine would have processed all the + * submitted packets by the time we get here. Therefore, only set + * packet queue state to ACTIVE if there are still uncompleted + * requests. + */ + if (atomic_read(&pq->n_reqs)) + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + + /* + * This is a somewhat blocking send implementation. + * The driver will block the caller until all packets of the + * request have been submitted to the SDMA engine. However, it + * will not wait for send completions. + */ + while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { + ret = user_sdma_send_pkts(req, pcount); + if (ret < 0) { + if (ret != -EBUSY) { + req->status = ret; + set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + if (ACCESS_ONCE(req->seqcomp) == + req->seqsubmitted - 1) + goto free_req; + return ret; } + wait_event_interruptible_timeout( + pq->busy.wait_dma, + (pq->state == SDMA_PKT_Q_ACTIVE), + msecs_to_jiffies( + SDMA_IOWAIT_TIMEOUT)); } - } *count += idx; return 0; free_req: - user_sdma_free_request(req); + user_sdma_free_request(req, true); + pq_update(pq); + set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); return ret; } static inline u32 compute_data_length(struct user_sdma_request *req, - struct user_sdma_txreq *tx) + struct user_sdma_txreq *tx) { /* * Determine the proper size of the packet data. @@ -734,8 +762,10 @@ static inline u32 compute_data_length(struct user_sdma_request *req, } else if (req_opcode(req->info.ctrl) == EXPECTED) { u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * PAGE_SIZE; - /* Get the data length based on the remaining space in the - * TID pair. */ + /* + * Get the data length based on the remaining space in the + * TID pair. + */ len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); /* If we've filled up the TID pair, move to the next one. */ if (unlikely(!len) && ++req->tididx < req->n_tids && @@ -745,12 +775,15 @@ static inline u32 compute_data_length(struct user_sdma_request *req, req->tidoffset = 0; len = min_t(u32, tidlen, req->info.fragsize); } - /* Since the TID pairs map entire pages, make sure that we + /* + * Since the TID pairs map entire pages, make sure that we * are not going to try to send more data that we have - * remaining. */ + * remaining. + */ len = min(len, req->data_len - req->sent); - } else + } else { len = min(req->data_len - req->sent, (u32)req->info.fragsize); + } SDMA_DBG(req, "Data Length = %u", len); return len; } @@ -813,9 +846,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) tx->flags = 0; tx->req = req; tx->busycount = 0; - tx->idx = -1; INIT_LIST_HEAD(&tx->list); - memset(tx->iovecs, 0, sizeof(tx->iovecs)); if (req->seqnum == req->info.npkts - 1) tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT; @@ -836,18 +867,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) WARN_ON(iovec->offset); } - /* - * This request might include only a header and no user - * data, so pin pages only if there is data and it the - * pages have not been pinned already. - */ - if (unlikely(!iovec->pages && iovec->iov.iov_len)) { - ret = pin_vector_pages(req, iovec); - if (ret) - goto free_tx; - } - - tx->iovecs[++tx->idx].vec = iovec; datalen = compute_data_length(req, tx); if (!datalen) { SDMA_DBG(req, @@ -937,16 +956,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) iovec->pages[pageidx], offset, len); if (ret) { - int i; - SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); - /* Mark all assigned vectors as complete so they - * are unpinned in the callback. */ - for (i = tx->idx; i >= 0; i--) { - tx->iovecs[i].flags |= - TXREQ_FLAGS_IOVEC_LAST_PKT; - } goto free_txreq; } iov_offset += len; @@ -954,19 +965,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) data_sent += len; if (unlikely(queued < datalen && pageidx == iovec->npages && - req->iov_idx < req->data_iovs - 1 && - tx->idx < ARRAY_SIZE(tx->iovecs))) { + req->iov_idx < req->data_iovs - 1)) { iovec->offset += iov_offset; - tx->iovecs[tx->idx].flags |= - TXREQ_FLAGS_IOVEC_LAST_PKT; iovec = &req->iovs[++req->iov_idx]; - if (!iovec->pages) { - ret = pin_vector_pages(req, iovec); - if (ret) - goto free_txreq; - } iov_offset = 0; - tx->iovecs[++tx->idx].vec = iovec; } } /* @@ -977,28 +979,21 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (req_opcode(req->info.ctrl) == EXPECTED) req->tidoffset += datalen; req->sent += data_sent; - if (req->data_len) { - tx->iovecs[tx->idx].vec->offset += iov_offset; - /* If we've reached the end of the io vector, mark it - * so the callback can unpin the pages and free it. */ - if (tx->iovecs[tx->idx].vec->offset == - tx->iovecs[tx->idx].vec->iov.iov_len) - tx->iovecs[tx->idx].flags |= - TXREQ_FLAGS_IOVEC_LAST_PKT; - } - + if (req->data_len) + iovec->offset += iov_offset; + list_add_tail(&tx->txreq.list, &req->txps); /* * It is important to increment this here as it is used to * generate the BTH.PSN and, therefore, can't be bulk-updated * outside of the loop. */ tx->seqnum = req->seqnum++; - list_add_tail(&tx->txreq.list, &req->txps); npkts++; } dosend: ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps); - if (list_empty(&req->txps)) + if (list_empty(&req->txps)) { + req->seqsubmitted = req->seqnum; if (req->seqnum == req->info.npkts) { set_bit(SDMA_REQ_SEND_DONE, &req->flags); /* @@ -1010,6 +1005,10 @@ dosend: if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) sdma_ahg_free(req->sde, req->ahg_idx); } + } else if (ret > 0) { + req->seqsubmitted += ret; + ret = 0; + } return ret; free_txreq: @@ -1024,7 +1023,7 @@ free_tx: */ static inline int num_user_pages(const struct iovec *iov) { - const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long addr = (unsigned long)iov->iov_base; const unsigned long len = iov->iov_len; const unsigned long spage = addr & PAGE_MASK; const unsigned long epage = (addr + len - 1) & PAGE_MASK; @@ -1032,64 +1031,129 @@ static inline int num_user_pages(const struct iovec *iov) return 1 + ((epage - spage) >> PAGE_SHIFT); } -static int pin_vector_pages(struct user_sdma_request *req, - struct user_sdma_iovec *iovec) { - int pinned, npages; +/* Caller must hold pq->evict_lock */ +static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) +{ + u32 cleared = 0; + struct sdma_mmu_node *node, *ptr; - npages = num_user_pages(&iovec->iov); - iovec->pages = kcalloc(npages, sizeof(*iovec->pages), GFP_KERNEL); - if (!iovec->pages) { - SDMA_DBG(req, "Failed page array alloc"); - return -ENOMEM; + list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) { + /* Make sure that no one is still using the node. */ + if (!atomic_read(&node->refcount)) { + /* + * Need to use the page count now as the remove callback + * will free the node. + */ + cleared += node->npages; + spin_unlock(&pq->evict_lock); + hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); + spin_lock(&pq->evict_lock); + if (cleared >= npages) + break; + } } + return cleared; +} - /* - * Get a reference to the process's mm so we can use it when - * unpinning the io vectors. - */ - req->pq->user_mm = get_task_mm(current); +static int pin_vector_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec) { + int ret = 0, pinned, npages, cleared; + struct page **pages; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct sdma_mmu_node *node = NULL; + struct mmu_rb_node *rb_node; + + rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root, + (unsigned long)iovec->iov.iov_base, + iovec->iov.iov_len); + if (rb_node) + node = container_of(rb_node, struct sdma_mmu_node, rb); + + if (!node) { + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; - pinned = hfi1_acquire_user_pages((unsigned long)iovec->iov.iov_base, - npages, 0, iovec->pages); + node->rb.addr = (unsigned long)iovec->iov.iov_base; + node->rb.len = iovec->iov.iov_len; + node->pq = pq; + atomic_set(&node->refcount, 0); + INIT_LIST_HEAD(&node->list); + } - if (pinned < 0) - return pinned; + npages = num_user_pages(&iovec->iov); + if (node->npages < npages) { + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + SDMA_DBG(req, "Failed page array alloc"); + ret = -ENOMEM; + goto bail; + } + memcpy(pages, node->pages, node->npages * sizeof(*pages)); + + npages -= node->npages; +retry: + if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) { + spin_lock(&pq->evict_lock); + cleared = sdma_cache_evict(pq, npages); + spin_unlock(&pq->evict_lock); + if (cleared >= npages) + goto retry; + } + pinned = hfi1_acquire_user_pages( + ((unsigned long)iovec->iov.iov_base + + (node->npages * PAGE_SIZE)), npages, 0, + pages + node->npages); + if (pinned < 0) { + kfree(pages); + ret = pinned; + goto bail; + } + if (pinned != npages) { + unpin_vector_pages(current->mm, pages, pinned); + ret = -EFAULT; + goto bail; + } + kfree(node->pages); + node->pages = pages; + node->npages += pinned; + npages = node->npages; + spin_lock(&pq->evict_lock); + if (!rb_node) + list_add(&node->list, &pq->evict); + else + list_move(&node->list, &pq->evict); + pq->n_locked += pinned; + spin_unlock(&pq->evict_lock); + } + iovec->pages = node->pages; + iovec->npages = npages; - iovec->npages = pinned; - if (pinned != npages) { - SDMA_DBG(req, "Failed to pin pages (%d/%u)", pinned, npages); - unpin_vector_pages(req, iovec); - return -EFAULT; + if (!rb_node) { + ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); + if (ret) { + spin_lock(&pq->evict_lock); + list_del(&node->list); + pq->n_locked -= node->npages; + spin_unlock(&pq->evict_lock); + ret = 0; + goto bail; + } + } else { + atomic_inc(&node->refcount); } return 0; +bail: + if (!rb_node) + kfree(node); + return ret; } -static void unpin_vector_pages(struct user_sdma_request *req, - struct user_sdma_iovec *iovec) +static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, + unsigned npages) { - /* - * Unpinning is done through the workqueue so use the - * process's mm if we have a reference to it. - */ - if ((current->flags & PF_KTHREAD) && req->pq->user_mm) - use_mm(req->pq->user_mm); - - hfi1_release_user_pages(iovec->pages, iovec->npages, 0); - - /* - * Unuse the user's mm (see above) and release the - * reference to it. - */ - if (req->pq->user_mm) { - if (current->flags & PF_KTHREAD) - unuse_mm(req->pq->user_mm); - mmput(req->pq->user_mm); - } - - kfree(iovec->pages); - iovec->pages = NULL; - iovec->npages = 0; - iovec->offset = 0; + hfi1_release_user_pages(mm, pages, npages, 0); + kfree(pages); } static int check_header_template(struct user_sdma_request *req, @@ -1212,7 +1276,6 @@ static int set_txreq_header(struct user_sdma_request *req, if (ret) return ret; goto done; - } hdr->bth[2] = cpu_to_be32( @@ -1222,7 +1285,7 @@ static int set_txreq_header(struct user_sdma_request *req, /* Set ACK request on last packet */ if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) - hdr->bth[2] |= cpu_to_be32(1UL<<31); + hdr->bth[2] |= cpu_to_be32(1UL << 31); /* Set the new offset */ hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); @@ -1236,8 +1299,10 @@ static int set_txreq_header(struct user_sdma_request *req, if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * PAGE_SIZE)) { req->tidoffset = 0; - /* Since we don't copy all the TIDs, all at once, - * we have to check again. */ + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ if (++req->tididx > req->n_tids - 1 || !req->tids[req->tididx]) { return -EINVAL; @@ -1318,8 +1383,10 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * PAGE_SIZE)) { req->tidoffset = 0; - /* Since we don't copy all the TIDs, all at once, - * we have to check again. */ + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ if (++req->tididx > req->n_tids - 1 || !req->tids[req->tididx]) { return -EINVAL; @@ -1343,8 +1410,9 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, INTR) >> 16); val &= cpu_to_le16(~(1U << 13)); AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); - } else + } else { AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val); + } } trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, @@ -1359,113 +1427,62 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, * tx request have been processed by the DMA engine. Called in * interrupt context. */ -static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status, - int drain) +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) { struct user_sdma_txreq *tx = container_of(txreq, struct user_sdma_txreq, txreq); struct user_sdma_request *req; - bool defer; - int i; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + u16 idx; if (!tx->req) return; req = tx->req; - /* - * If this is the callback for the last packet of the request, - * queue up the request for clean up. - */ - defer = (tx->seqnum == req->info.npkts - 1); - - /* - * If we have any io vectors associated with this txreq, - * check whether they need to be 'freed'. We can't free them - * here because the unpin function needs to be able to sleep. - */ - for (i = tx->idx; i >= 0; i--) { - if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT) { - defer = true; - break; - } - } + pq = req->pq; + cq = req->cq; - req->status = status; if (status != SDMA_TXREQ_S_OK) { SDMA_DBG(req, "SDMA completion with error %d", status); set_bit(SDMA_REQ_HAS_ERROR, &req->flags); - defer = true; } - /* - * Defer the clean up of the iovectors and the request until later - * so it can be done outside of interrupt context. - */ - if (defer) { - spin_lock(&req->txcmp_lock); - list_add_tail(&tx->list, &req->txcmp); - spin_unlock(&req->txcmp_lock); - schedule_work(&req->worker); + req->seqcomp = tx->seqnum; + kmem_cache_free(pq->txreq_cache, tx); + tx = NULL; + + idx = req->info.comp_idx; + if (req->status == -1 && status == SDMA_TXREQ_S_OK) { + if (req->seqcomp == req->info.npkts - 1) { + req->status = 0; + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, COMPLETE, 0); + } } else { - kmem_cache_free(req->pq->txreq_cache, tx); + if (status != SDMA_TXREQ_S_OK) + req->status = status; + if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && + (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || + test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, ERROR, req->status); + } } } -static void user_sdma_delayed_completion(struct work_struct *work) +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) { - struct user_sdma_request *req = - container_of(work, struct user_sdma_request, worker); - struct hfi1_user_sdma_pkt_q *pq = req->pq; - struct user_sdma_txreq *tx = NULL; - unsigned long flags; - u64 seqnum; - int i; - - while (1) { - spin_lock_irqsave(&req->txcmp_lock, flags); - if (!list_empty(&req->txcmp)) { - tx = list_first_entry(&req->txcmp, - struct user_sdma_txreq, list); - list_del(&tx->list); - } - spin_unlock_irqrestore(&req->txcmp_lock, flags); - if (!tx) - break; - - for (i = tx->idx; i >= 0; i--) - if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT) - unpin_vector_pages(req, tx->iovecs[i].vec); - - seqnum = tx->seqnum; - kmem_cache_free(pq->txreq_cache, tx); - tx = NULL; - - if (req->status != SDMA_TXREQ_S_OK) { - if (seqnum == ACCESS_ONCE(req->seqnum) && - test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) { - atomic_dec(&pq->n_reqs); - set_comp_state(req, ERROR, req->status); - user_sdma_free_request(req); - break; - } - } else { - if (seqnum == req->info.npkts - 1) { - atomic_dec(&pq->n_reqs); - set_comp_state(req, COMPLETE, 0); - user_sdma_free_request(req); - break; - } - } - } - - if (!atomic_read(&pq->n_reqs)) { + if (atomic_dec_and_test(&pq->n_reqs)) { xchg(&pq->state, SDMA_PKT_Q_INACTIVE); wake_up(&pq->wait); } } -static void user_sdma_free_request(struct user_sdma_request *req) +static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) { if (!list_empty(&req->txps)) { struct sdma_txreq *t, *p; @@ -1479,25 +1496,87 @@ static void user_sdma_free_request(struct user_sdma_request *req) } } if (req->data_iovs) { + struct sdma_mmu_node *node; + struct mmu_rb_node *mnode; int i; - for (i = 0; i < req->data_iovs; i++) - if (req->iovs[i].npages && req->iovs[i].pages) - unpin_vector_pages(req, &req->iovs[i]); + for (i = 0; i < req->data_iovs; i++) { + mnode = hfi1_mmu_rb_search( + &req->pq->sdma_rb_root, + (unsigned long)req->iovs[i].iov.iov_base, + req->iovs[i].iov.iov_len); + if (!mnode) + continue; + + node = container_of(mnode, struct sdma_mmu_node, rb); + if (unpin) + hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, + &node->rb); + else + atomic_dec(&node->refcount); + } } kfree(req->tids); clear_bit(SDMA_REQ_IN_USE, &req->flags); } -static inline void set_comp_state(struct user_sdma_request *req, - enum hfi1_sdma_comp_state state, - int ret) +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret) { - SDMA_DBG(req, "Setting completion status %u %d", state, ret); - req->cq->comps[req->info.comp_idx].status = state; + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", + pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); + cq->comps[idx].status = state; if (state == ERROR) - req->cq->comps[req->info.comp_idx].errcode = -ret; - trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt, - req->pq->subctxt, req->info.comp_idx, - state, ret); + cq->comps[idx].errcode = -ret; + trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, + idx, state, ret); +} + +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len) +{ + return (bool)(node->addr == addr); +} + +static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + atomic_inc(&node->refcount); + return 0; +} + +static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode, + bool notifier) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + spin_lock(&node->pq->evict_lock); + list_del(&node->list); + node->pq->n_locked -= node->npages; + spin_unlock(&node->pq->evict_lock); + + unpin_vector_pages(notifier ? NULL : current->mm, node->pages, + node->npages); + /* + * If called by the MMU notifier, we have to adjust the pinned + * page count ourselves. + */ + if (notifier) + current->mm->pinned_vm -= node->npages; + kfree(node); +} + +static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + if (!atomic_read(&node->refcount)) + return 1; + return 0; } diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h index 0afa28508a8a..b9240e351161 100644 --- a/drivers/staging/rdma/hfi1/user_sdma.h +++ b/drivers/staging/rdma/hfi1/user_sdma.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -69,7 +66,11 @@ struct hfi1_user_sdma_pkt_q { struct iowait busy; unsigned state; wait_queue_head_t wait; - struct mm_struct *user_mm; + unsigned long unpinned; + struct rb_root sdma_rb_root; + u32 n_locked; + struct list_head evict; + spinlock_t evict_lock; /* protect evict and n_locked */ }; struct hfi1_user_sdma_comp_q { diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c index 09b8d412ee90..89f2aad45c1b 100644 --- a/drivers/staging/rdma/hfi1/verbs.c +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -63,9 +60,9 @@ #include "device.h" #include "trace.h" #include "qp.h" -#include "sdma.h" +#include "verbs_txreq.h" -unsigned int hfi1_lkey_table_size = 16; +static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, S_IRUGO); MODULE_PARM_DESC(lkey_table_size, @@ -124,45 +121,181 @@ unsigned int hfi1_max_srq_wrs = 0x1FFFF; module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); +unsigned short piothreshold = 256; +module_param(piothreshold, ushort, S_IRUGO); +MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); + +#define COPY_CACHELESS 1 +#define COPY_ADAPTIVE 2 +static unsigned int sge_copy_mode; +module_param(sge_copy_mode, uint, S_IRUGO); +MODULE_PARM_DESC(sge_copy_mode, + "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS"); + static void verbs_sdma_complete( struct sdma_txreq *cookie, - int status, - int drained); + int status); + +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag); /* Length of buffer to create verbs txreq cache name */ #define TXREQ_NAME_LEN 24 +static uint wss_threshold; +module_param(wss_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); +static uint wss_clean_period = 256; +module_param(wss_clean_period, uint, S_IRUGO); +MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); + +/* memory working set size */ +struct hfi1_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; +}; + +static struct hfi1_wss wss; + +int hfi1_wss_init(void) +{ + long llc_size; + long llc_bits; + long table_size; + long table_bits; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss.pages_mask = table_bits - 1; + wss.num_entries = table_bits / BITS_PER_LONG; + + wss.threshold = (llc_bits * wss_threshold) / 100; + if (wss.threshold == 0) + wss.threshold = 1; + + atomic_set(&wss.clean_counter, wss_clean_period); + + wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), + GFP_KERNEL); + if (!wss.entries) { + hfi1_wss_exit(); + return -ENOMEM; + } + + return 0; +} + +void hfi1_wss_exit(void) +{ + /* coded to handle partially initialized and repeat callers */ + kfree(wss.entries); + wss.entries = NULL; +} + /* - * Note that it is OK to post send work requests in the SQE and ERR - * states; hfi1_do_send() will process them and generate error - * completions as per IB 1.2 C10-96. + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. */ -const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = 0, - [IB_QPS_INIT] = HFI1_POST_RECV_OK, - [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK, - [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | - HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK | - HFI1_PROCESS_NEXT_SEND_OK, - [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | - HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK, - [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | - HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, - [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV | - HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, -}; +static void wss_advance_clean_counter(void) +{ + int entry; + int weight; + unsigned long bits; -struct hfi1_ucontext { - struct ib_ucontext ibucontext; -}; + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss.clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss.clean_counter, wss_clean_period); -static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext - *ibucontext) + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss.clean_entry) - 1) + & (wss.num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss.entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss.total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(void *address) { - return container_of(ibucontext, struct hfi1_ucontext, ibucontext); + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss.entries[entry])) + atomic_inc(&wss.total_count); + + wss_advance_clean_counter(); } -static inline void _hfi1_schedule_send(struct hfi1_qp *qp); +/* + * Is the working set larger than the threshold? + */ +static inline int wss_exceeds_threshold(void) +{ + return atomic_read(&wss.total_count) >= wss.threshold; +} /* * Translate ib_wr_opcode into ib_wc_opcode. @@ -274,14 +407,47 @@ __be64 ib_hfi1_sys_image_guid; * @ss: the SGE state * @data: the data to copy * @length: the length of the data + * @copy_last: do a separate copy of the last 8 bytes */ void hfi1_copy_sge( - struct hfi1_sge_state *ss, + struct rvt_sge_state *ss, void *data, u32 length, - int release) + int release, + int copy_last) { - struct hfi1_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; + int in_last = 0; + int i; + int cacheless_copy = 0; + + if (sge_copy_mode == COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. + */ + wss_insert(sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(sge->vaddr + PAGE_SIZE); + + cacheless_copy = wss_exceeds_threshold(); + } else { + wss_advance_clean_counter(); + } + } + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = 0; + in_last = 1; + } + } +again: while (length) { u32 len = sge->length; @@ -290,17 +456,25 @@ void hfi1_copy_sge( if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - memcpy(sge->vaddr, data, len); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } sge->vaddr += len; sge->length -= len; sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= HFI1_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -313,6 +487,13 @@ void hfi1_copy_sge( data += len; length -= len; } + + if (copy_last) { + copy_last = 0; + in_last = 1; + length = 8; + goto again; + } } /** @@ -320,9 +501,9 @@ void hfi1_copy_sge( * @ss: the SGE state * @length: the number of bytes to skip */ -void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release) +void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release) { - struct hfi1_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; while (length) { u32 len = sge->length; @@ -337,11 +518,11 @@ void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release) sge->sge_length -= len; if (sge->sge_length == 0) { if (release) - hfi1_put_mr(sge->mr); + rvt_put_mr(sge->mr); if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= HFI1_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) break; sge->n = 0; @@ -355,231 +536,6 @@ void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release) } } -/** - * post_one_send - post one RC, UC, or UD send work request - * @qp: the QP to post on - * @wr: the work request to send - */ -static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr) -{ - struct hfi1_swqe *wqe; - u32 next; - int i; - int j; - int acc; - struct hfi1_lkey_table *rkt; - struct hfi1_pd *pd; - struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - struct hfi1_pportdata *ppd; - struct hfi1_ibport *ibp; - - /* IB spec says that num_sge == 0 is OK. */ - if (unlikely(wr->num_sge > qp->s_max_sge)) - return -EINVAL; - - ppd = &dd->pport[qp->port_num - 1]; - ibp = &ppd->ibport_data; - - /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. - */ - if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) - return -EINVAL; - } else if (qp->ibqp.qp_type != IB_QPT_RC) { - /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ - if (wr->opcode != IB_WR_SEND && - wr->opcode != IB_WR_SEND_WITH_IMM) - return -EINVAL; - /* Check UD destination address PD */ - if (qp->ibqp.pd != ud_wr(wr)->ah->pd) - return -EINVAL; - } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) - return -EINVAL; - else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && - (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) - return -EINVAL; - else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) - return -EINVAL; - - next = qp->s_head + 1; - if (next >= qp->s_size) - next = 0; - if (next == qp->s_last) - return -ENOMEM; - - rkt = &to_idev(qp->ibqp.device)->lk_table; - pd = to_ipd(qp->ibqp.pd); - wqe = get_swqe_ptr(qp, qp->s_head); - - - if (qp->ibqp.qp_type != IB_QPT_UC && - qp->ibqp.qp_type != IB_QPT_RC) - memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr)); - else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || - wr->opcode == IB_WR_RDMA_WRITE || - wr->opcode == IB_WR_RDMA_READ) - memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr)); - else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr)); - else - memcpy(&wqe->wr, wr, sizeof(wqe->wr)); - - wqe->length = 0; - j = 0; - if (wr->num_sge) { - acc = wr->opcode >= IB_WR_RDMA_READ ? - IB_ACCESS_LOCAL_WRITE : 0; - for (i = 0; i < wr->num_sge; i++) { - u32 length = wr->sg_list[i].length; - int ok; - - if (length == 0) - continue; - ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j], - &wr->sg_list[i], acc); - if (!ok) - goto bail_inval_free; - wqe->length += length; - j++; - } - wqe->wr.num_sge = j; - } - if (qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_RC) { - if (wqe->length > 0x80000000U) - goto bail_inval_free; - } else { - struct hfi1_ah *ah = to_iah(ud_wr(wr)->ah); - - atomic_inc(&ah->refcount); - } - wqe->ssn = qp->s_ssn++; - qp->s_head = next; - - return 0; - -bail_inval_free: - /* release mr holds */ - while (j) { - struct hfi1_sge *sge = &wqe->sg_list[--j]; - - hfi1_put_mr(sge->mr); - } - return -EINVAL; -} - -/** - * post_send - post a send on a QP - * @ibqp: the QP to post the send on - * @wr: the list of work requests to post - * @bad_wr: the first bad WR is put here - * - * This may be called from interrupt context. - */ -static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) -{ - struct hfi1_qp *qp = to_iqp(ibqp); - int err = 0; - int call_send; - unsigned long flags; - unsigned nreq = 0; - - spin_lock_irqsave(&qp->s_lock, flags); - - /* Check that state is OK to post send. */ - if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) { - spin_unlock_irqrestore(&qp->s_lock, flags); - return -EINVAL; - } - - /* sq empty and not list -> call send */ - call_send = qp->s_head == qp->s_last && !wr->next; - - for (; wr; wr = wr->next) { - err = post_one_send(qp, wr); - if (unlikely(err)) { - *bad_wr = wr; - goto bail; - } - nreq++; - } -bail: - spin_unlock_irqrestore(&qp->s_lock, flags); - if (nreq && !call_send) - _hfi1_schedule_send(qp); - if (nreq && call_send) - hfi1_do_send(&qp->s_iowait.iowork); - return err; -} - -/** - * post_receive - post a receive on a QP - * @ibqp: the QP to post the receive on - * @wr: the WR to post - * @bad_wr: the first bad WR is put here - * - * This may be called from interrupt context. - */ -static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) -{ - struct hfi1_qp *qp = to_iqp(ibqp); - struct hfi1_rwq *wq = qp->r_rq.wq; - unsigned long flags; - int ret; - - /* Check that state is OK to post receive. */ - if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - for (; wr; wr = wr->next) { - struct hfi1_rwqe *wqe; - u32 next; - int i; - - if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - - spin_lock_irqsave(&qp->r_rq.lock, flags); - next = wq->head + 1; - if (next >= qp->r_rq.size) - next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - *bad_wr = wr; - ret = -ENOMEM; - goto bail; - } - - wqe = get_rwqe_ptr(&qp->r_rq, wq->head); - wqe->wr_id = wr->wr_id; - wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; - /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - } - ret = 0; - -bail: - return ret; -} - /* * Make sure the QP is ready and able to accept the given opcode. */ @@ -587,18 +543,17 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet) { struct hfi1_ibport *ibp; - if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK)) + if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) goto dropit; if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) || (opcode == IB_OPCODE_CNP)) return 1; dropit: ibp = &packet->rcd->ppd->ibport_data; - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; return 0; } - /** * hfi1_ib_rcv - process an incoming packet * @packet: data packet information @@ -614,6 +569,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) u32 tlen = packet->tlen; struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; unsigned long flags; u32 qp_num; int lnh; @@ -622,9 +578,9 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) /* Check for GRH */ lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == HFI1_LRH_BTH) + if (lnh == HFI1_LRH_BTH) { packet->ohdr = &hdr->u.oth; - else if (lnh == HFI1_LRH_GRH) { + } else if (lnh == HFI1_LRH_GRH) { u32 vtf; packet->ohdr = &hdr->u.l.oth; @@ -634,8 +590,9 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) goto drop; packet->rcv_flags |= HFI1_HAS_GRH; - } else + } else { goto drop; + } trace_input_ibhdr(rcd->dd, hdr); @@ -643,17 +600,17 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) inc_opstats(tlen, &rcd->opstats->stats[opcode]); /* Get the destination QP number. */ - qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK; + qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK; lid = be16_to_cpu(hdr->lrh[1]); - if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) && - (lid != HFI1_PERMISSIVE_LID))) { - struct hfi1_mcast *mcast; - struct hfi1_mcast_qp *p; + if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) { + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; if (lnh != HFI1_LRH_GRH) goto drop; - mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid); - if (mcast == NULL) + mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid); + if (!mcast) goto drop; list_for_each_entry_rcu(p, &mcast->qp_list, list) { packet->qp = p->qp; @@ -663,14 +620,14 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) spin_unlock_irqrestore(&packet->qp->r_lock, flags); } /* - * Notify hfi1_multicast_detach() if it is waiting for us + * Notify rvt_multicast_detach() if it is waiting for us * to finish. */ if (atomic_dec_return(&mcast->refcount) <= 1) wake_up(&mcast->wait); } else { rcu_read_lock(); - packet->qp = hfi1_lookup_qpn(ibp, qp_num); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); if (!packet->qp) { rcu_read_unlock(); goto drop; @@ -684,7 +641,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet) return; drop: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; } /* @@ -695,15 +652,17 @@ static void mem_timer(unsigned long data) { struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data; struct list_head *list = &dev->memwait; - struct hfi1_qp *qp = NULL; + struct rvt_qp *qp = NULL; struct iowait *wait; unsigned long flags; + struct hfi1_qp_priv *priv; write_seqlock_irqsave(&dev->iowait_lock, flags); if (!list_empty(list)) { wait = list_first_entry(list, struct iowait, list); - qp = container_of(wait, struct hfi1_qp, s_iowait); - list_del_init(&qp->s_iowait.list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); /* refcount held until actual wake up */ if (!list_empty(list)) mod_timer(&dev->mem_timer, jiffies + 1); @@ -711,12 +670,12 @@ static void mem_timer(unsigned long data) write_sequnlock_irqrestore(&dev->iowait_lock, flags); if (qp) - hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM); + hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM); } -void update_sge(struct hfi1_sge_state *ss, u32 length) +void update_sge(struct rvt_sge_state *ss, u32 length) { - struct hfi1_sge *sge = &ss->sge; + struct rvt_sge *sge = &ss->sge; sge->vaddr += length; sge->length -= length; @@ -725,7 +684,7 @@ void update_sge(struct hfi1_sge_state *ss, u32 length) if (--ss->num_sge) *sge = *ss->sg_list++; } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= HFI1_SEGSZ) { + if (++sge->n >= RVT_SEGSZ) { if (++sge->m >= sge->mr->mapsz) return; sge->n = 0; @@ -735,143 +694,55 @@ void update_sge(struct hfi1_sge_state *ss, u32 length) } } -static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, - struct hfi1_qp *qp) -{ - struct verbs_txreq *tx; - unsigned long flags; - - tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); - if (!tx) { - spin_lock_irqsave(&qp->s_lock, flags); - write_seqlock(&dev->iowait_lock); - if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK && - list_empty(&qp->s_iowait.list)) { - dev->n_txwait++; - qp->s_flags |= HFI1_S_WAIT_TX; - list_add_tail(&qp->s_iowait.list, &dev->txwait); - trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX); - atomic_inc(&qp->refcount); - } - qp->s_flags &= ~HFI1_S_BUSY; - write_sequnlock(&dev->iowait_lock); - spin_unlock_irqrestore(&qp->s_lock, flags); - tx = ERR_PTR(-EBUSY); - } - return tx; -} - -static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, - struct hfi1_qp *qp) -{ - struct verbs_txreq *tx; - - tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); - if (!tx) { - /* call slow path to get the lock */ - tx = __get_txreq(dev, qp); - if (IS_ERR(tx)) - return tx; - } - tx->qp = qp; - return tx; -} - -void hfi1_put_txreq(struct verbs_txreq *tx) -{ - struct hfi1_ibdev *dev; - struct hfi1_qp *qp; - unsigned long flags; - unsigned int seq; - - qp = tx->qp; - dev = to_idev(qp->ibqp.device); - - if (tx->mr) { - hfi1_put_mr(tx->mr); - tx->mr = NULL; - } - sdma_txclean(dd_from_dev(dev), &tx->txreq); - - /* Free verbs_txreq and return to slab cache */ - kmem_cache_free(dev->verbs_txreq_cache, tx); - - do { - seq = read_seqbegin(&dev->iowait_lock); - if (!list_empty(&dev->txwait)) { - struct iowait *wait; - - write_seqlock_irqsave(&dev->iowait_lock, flags); - /* Wake up first QP wanting a free struct */ - wait = list_first_entry(&dev->txwait, struct iowait, - list); - qp = container_of(wait, struct hfi1_qp, s_iowait); - list_del_init(&qp->s_iowait.list); - /* refcount held until actual wake up */ - write_sequnlock_irqrestore(&dev->iowait_lock, flags); - hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX); - break; - } - } while (read_seqretry(&dev->iowait_lock, seq)); -} - /* * This is called with progress side lock held. */ /* New API */ static void verbs_sdma_complete( struct sdma_txreq *cookie, - int status, - int drained) + int status) { struct verbs_txreq *tx = container_of(cookie, struct verbs_txreq, txreq); - struct hfi1_qp *qp = tx->qp; + struct rvt_qp *qp = tx->qp; spin_lock(&qp->s_lock); - if (tx->wqe) + if (tx->wqe) { hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); - else if (qp->ibqp.qp_type == IB_QPT_RC) { + } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_ib_header *hdr; hdr = &tx->phdr.hdr; hfi1_rc_send_complete(qp, hdr); } - if (drained) { - /* - * This happens when the send engine notes - * a QP in the error state and cannot - * do the flush work until that QP's - * sdma work has finished. - */ - if (qp->s_flags & HFI1_S_WAIT_DMA) { - qp->s_flags &= ~HFI1_S_WAIT_DMA; - hfi1_schedule_send(qp); - } - } spin_unlock(&qp->s_lock); hfi1_put_txreq(tx); } -static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp) +static int wait_kmem(struct hfi1_ibdev *dev, + struct rvt_qp *qp, + struct hfi1_pkt_state *ps) { + struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); - if (list_empty(&qp->s_iowait.list)) { + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= HFI1_S_WAIT_KMEM; - list_add_tail(&qp->s_iowait.list, &dev->memwait); - trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); atomic_inc(&qp->refcount); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~HFI1_S_BUSY; + qp->s_flags &= ~RVT_S_BUSY; ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -884,14 +755,14 @@ static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp) * * Add failures will revert the sge cursor */ -static int build_verbs_ulp_payload( +static noinline int build_verbs_ulp_payload( struct sdma_engine *sde, - struct hfi1_sge_state *ss, + struct rvt_sge_state *ss, u32 length, struct verbs_txreq *tx) { - struct hfi1_sge *sg_list = ss->sg_list; - struct hfi1_sge sge = ss->sge; + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; u8 num_sge = ss->num_sge; u32 len; int ret = 0; @@ -928,23 +799,21 @@ bail_txadd: * NOTE: DMA mapping is held in the tx until completed in the ring or * the tx desc is freed without having been submitted to the ring * - * This routine insures the following all the helper routine - * calls succeed. + * This routine ensures all the helper routine calls succeed. */ /* New API */ static int build_verbs_tx_desc( struct sdma_engine *sde, - struct hfi1_sge_state *ss, + struct rvt_sge_state *ss, u32 length, struct verbs_txreq *tx, struct ahg_ib_header *ahdr, u64 pbc) { int ret = 0; - struct hfi1_pio_header *phdr; + struct hfi1_pio_header *phdr = &tx->phdr; u16 hdrbytes = tx->hdr_dwords << 2; - phdr = &tx->phdr; if (!ahdr->ahgcount) { ret = sdma_txinit_ahg( &tx->txreq, @@ -958,29 +827,14 @@ static int build_verbs_tx_desc( if (ret) goto bail_txadd; phdr->pbc = cpu_to_le64(pbc); - memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc)); - /* add the header */ ret = sdma_txadd_kvaddr( sde->dd, &tx->txreq, - &tx->phdr, - tx->hdr_dwords << 2); + phdr, + hdrbytes); if (ret) goto bail_txadd; } else { - struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth; - struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth; - - /* needed in rc_send_complete() */ - phdr->hdr.lrh[0] = ahdr->ibh.lrh[0]; - if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) { - sohdr = &ahdr->ibh.u.l.oth; - dohdr = &phdr->hdr.u.l.oth; - } - /* opcode */ - dohdr->bth[0] = sohdr->bth[0]; - /* PSN/ACK */ - dohdr->bth[2] = sohdr->bth[2]; ret = sdma_txinit_ahg( &tx->txreq, ahdr->tx_flags, @@ -1001,80 +855,75 @@ bail_txadd: return ret; } -int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { - struct ahg_ib_header *ahdr = qp->s_hdr; + struct hfi1_qp_priv *priv = qp->priv; + struct ahg_ib_header *ahdr = priv->s_hdr; u32 hdrwords = qp->s_hdrwords; - struct hfi1_sge_state *ss = qp->s_cur_sge; + struct rvt_sge_state *ss = qp->s_cur_sge; u32 len = qp->s_cur_size; u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */ struct hfi1_ibdev *dev = ps->dev; struct hfi1_pportdata *ppd = ps->ppd; struct verbs_txreq *tx; - struct sdma_txreq *stx; u64 pbc_flags = 0; - u8 sc5 = qp->s_sc; + u8 sc5 = priv->s_sc; + int ret; - if (!list_empty(&qp->s_iowait.tx_head)) { - stx = list_first_entry( - &qp->s_iowait.tx_head, - struct sdma_txreq, - list); - list_del_init(&stx->list); - tx = container_of(stx, struct verbs_txreq, txreq); - ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx); - if (unlikely(ret == -ECOMM)) + tx = ps->s_txreq; + if (!sdma_txreq_built(&tx->txreq)) { + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + + pbc = create_pbc(ppd, + pbc_flags, + qp->srate_mbps, + vl, + plen); + } + tx->wqe = qp->s_wqe; + ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc); + if (unlikely(ret)) + goto bail_build; + } + ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq); + if (unlikely(ret < 0)) { + if (ret == -ECOMM) goto bail_ecomm; return ret; } - - tx = get_txreq(dev, qp); - if (IS_ERR(tx)) - goto bail_tx; - - tx->sde = qp->s_sde; - - if (likely(pbc == 0)) { - u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); - /* No vl15 here */ - /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ - pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; - - pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); - } - tx->wqe = qp->s_wqe; - tx->mr = qp->s_rdma_mr; - if (qp->s_rdma_mr) - qp->s_rdma_mr = NULL; - tx->hdr_dwords = hdrwords + 2; - ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc); - if (unlikely(ret)) - goto bail_build; - trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh); - ret = sdma_send_txreq(tx->sde, &qp->s_iowait, &tx->txreq); - if (unlikely(ret == -ECOMM)) - goto bail_ecomm; + trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr); return ret; bail_ecomm: /* The current one got "sent" */ return 0; bail_build: - /* kmalloc or mapping fail */ - hfi1_put_txreq(tx); - return wait_kmem(dev, qp); -bail_tx: - return PTR_ERR(tx); + ret = wait_kmem(dev, qp, ps); + if (!ret) { + /* free txreq - bad state */ + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + } + return ret; } /* * If we are now in the error state, return zero to flush the * send work request. */ -static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc) +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_devdata *dd = sc->dd; struct hfi1_ibdev *dev = &dd->verbs_dev; unsigned long flags; @@ -1087,74 +936,89 @@ static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc) * enabling the PIO avail interrupt. */ spin_lock_irqsave(&qp->s_lock, flags); - if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); - if (list_empty(&qp->s_iowait.list)) { + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; + dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); + dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); dev->n_piowait++; - qp->s_flags |= HFI1_S_WAIT_PIO; + qp->s_flags |= flag; was_empty = list_empty(&sc->piowait); - list_add_tail(&qp->s_iowait.list, &sc->piowait); - trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO); + list_add_tail(&priv->s_iowait.list, &sc->piowait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); atomic_inc(&qp->refcount); /* counting: only call wantpiobuf_intr if first user */ if (was_empty) hfi1_sc_wantpiobuf_intr(sc, 1); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~HFI1_S_BUSY; + qp->s_flags &= ~RVT_S_BUSY; ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } -struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5) +static void verbs_pio_complete(void *arg, int code) { - struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1); - u8 vl; + struct rvt_qp *qp = (struct rvt_qp *)arg; + struct hfi1_qp_priv *priv = qp->priv; - vl = sc_to_vlt(dd, sc5); - if (vl >= ppd->vls_supported && vl != 15) - return NULL; - return dd->vld[vl].sc; + if (iowait_pio_dec(&priv->s_iowait)) + iowait_drain_wakeup(&priv->s_iowait); } -int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc) { - struct ahg_ib_header *ahdr = qp->s_hdr; + struct hfi1_qp_priv *priv = qp->priv; u32 hdrwords = qp->s_hdrwords; - struct hfi1_sge_state *ss = qp->s_cur_sge; + struct rvt_sge_state *ss = qp->s_cur_sge; u32 len = qp->s_cur_size; u32 dwords = (len + 3) >> 2; u32 plen = hdrwords + dwords + 2; /* includes pbc */ struct hfi1_pportdata *ppd = ps->ppd; - u32 *hdr = (u32 *)&ahdr->ibh; + u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr; u64 pbc_flags = 0; - u32 sc5; + u8 sc5; unsigned long flags = 0; struct send_context *sc; struct pio_buf *pbuf; int wc_status = IB_WC_SUCCESS; + int ret = 0; + pio_release_cb cb = NULL; + + /* only RC/UC use complete */ + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + cb = verbs_pio_complete; + break; + default: + break; + } /* vl15 special case taken care of in ud.c */ - sc5 = qp->s_sc; - sc = qp_to_send_context(qp, sc5); + sc5 = priv->s_sc; + sc = ps->s_txreq->psc; - if (!sc) - return -EINVAL; if (likely(pbc == 0)) { - u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); } - pbuf = sc_buffer_alloc(sc, plen, NULL, NULL); - if (unlikely(pbuf == NULL)) { + if (cb) + iowait_pio_inc(&priv->s_iowait); + pbuf = sc_buffer_alloc(sc, plen, cb, qp); + if (unlikely(!pbuf)) { + if (cb) + verbs_pio_complete(qp, 0); if (ppd->host_link_state != HLS_UP_ACTIVE) { /* * If we have filled the PIO buffers to capacity and are @@ -1174,7 +1038,12 @@ int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, * so lets continue to queue the request. */ hfi1_cdbg(PIO, "alloc failed. state active, queuing"); - return no_bufs_available(qp, sc); + ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO); + if (!ret) + /* txreq not queued - free */ + goto bail; + /* tx consumed in wait */ + return ret; } } @@ -1182,7 +1051,7 @@ int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); } else { if (ss) { - seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4); + seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4); while (len) { void *addr = ss->sge.vaddr; u32 slen = ss->sge.length; @@ -1197,12 +1066,8 @@ int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, } } - trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh); - - if (qp->s_rdma_mr) { - hfi1_put_mr(qp->s_rdma_mr); - qp->s_rdma_mr = NULL; - } + trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr); pio_bail: if (qp->s_wqe) { @@ -1211,10 +1076,15 @@ pio_bail: spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_rc_send_complete(qp, &ahdr->ibh); + hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); spin_unlock_irqrestore(&qp->s_lock, flags); } - return 0; + + ret = 0; + +bail: + hfi1_put_txreq(ps->s_txreq); + return ret; } /* @@ -1247,13 +1117,14 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) */ static inline int egress_pkey_check(struct hfi1_pportdata *ppd, struct hfi1_ib_header *hdr, - struct hfi1_qp *qp) + struct rvt_qp *qp) { + struct hfi1_qp_priv *priv = qp->priv; struct hfi1_other_headers *ohdr; struct hfi1_devdata *dd; int i = 0; u16 pkey; - u8 lnh, sc5 = qp->s_sc; + u8 lnh, sc5 = priv->s_sc; if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) return 0; @@ -1271,14 +1142,14 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd, if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) goto bad; - /* Is the pkey = 0x0, or 0x8000? */ if ((pkey & PKEY_LOW_15_MASK) == 0) goto bad; /* The most likely matching pkey has index qp->s_pkey_index */ if (unlikely(!egress_pkey_matches_entry(pkey, - ppd->pkeys[qp->s_pkey_index]))) { + ppd->pkeys + [qp->s_pkey_index]))) { /* no match - try the entire table */ for (; i < MAX_PKEY_VALUES; i++) { if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) @@ -1302,31 +1173,65 @@ bad: } /** + * get_send_routine - choose an egress routine + * + * Choose an egress routine based on QP type + * and size + */ +static inline send_routine get_send_routine(struct rvt_qp *qp, + struct verbs_txreq *tx) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ib_header *h = &tx->phdr.hdr; + + if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) + return dd->process_pio_send; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return dd->process_pio_send; + case IB_QPT_GSI: + case IB_QPT_UD: + break; + case IB_QPT_RC: + if (piothreshold && + qp->s_cur_size <= min(piothreshold, qp->pmtu) && + (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + case IB_QPT_UC: + if (piothreshold && + qp->s_cur_size <= min(piothreshold, qp->pmtu) && + (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + default: + break; + } + return dd->process_dma_send; +} + +/** * hfi1_verbs_send - send a packet * @qp: the QP to send on * @ps: the state of the packet to send * * Return zero if packet is sent or queued OK. - * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. */ -int hfi1_verbs_send(struct hfi1_qp *qp, struct hfi1_pkt_state *ps) +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - struct ahg_ib_header *ahdr = qp->s_hdr; + struct hfi1_qp_priv *priv = qp->priv; + send_routine sr; int ret; - int pio = 0; - unsigned long flags = 0; - - /* - * VL15 packets (IB_QPT_SMI) will always use PIO, so we - * can defer SDMA restart until link goes ACTIVE without - * worrying about just how we got there. - */ - if ((qp->ibqp.qp_type == IB_QPT_SMI) || - !(dd->flags & HFI1_HAS_SEND_DMA)) - pio = 1; - ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp); + sr = get_send_routine(qp, ps->s_txreq); + ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp); if (unlikely(ret)) { /* * The value we are returning here does not get propagated to @@ -1336,7 +1241,9 @@ int hfi1_verbs_send(struct hfi1_qp *qp, struct hfi1_pkt_state *ps) * mechanism for handling the errors. So for SDMA we can just * return. */ - if (pio) { + if (sr == dd->process_pio_send) { + unsigned long flags; + hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); @@ -1345,71 +1252,57 @@ int hfi1_verbs_send(struct hfi1_qp *qp, struct hfi1_pkt_state *ps) } return -EINVAL; } - - if (pio) { - ret = dd->process_pio_send(qp, ps, 0); - } else { -#ifdef CONFIG_SDMA_VERBOSITY - dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n", - slashstrip(__FILE__), __LINE__, __func__); - dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", qp->s_hdrwords, - qp->s_cur_size); -#endif - ret = dd->process_dma_send(qp, ps, 0); - } - - return ret; + if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait)) + return pio_wait(qp, + ps->s_txreq->psc, + ps, + RVT_S_WAIT_PIO_DRAIN); + return sr(qp, ps, 0); } -static int query_device(struct ib_device *ibdev, - struct ib_device_attr *props, - struct ib_udata *uhw) +/** + * hfi1_fill_device_attr - Fill in rvt dev info device attributes. + * @dd: the device data structure + */ +static void hfi1_fill_device_attr(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - struct hfi1_ibdev *dev = to_idev(ibdev); - - if (uhw->inlen || uhw->outlen) - return -EINVAL; - memset(props, 0, sizeof(*props)); - - props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | - IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | - IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; - - props->page_size_cap = PAGE_SIZE; - props->vendor_id = - dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; - props->vendor_part_id = dd->pcidev->device; - props->hw_ver = dd->minrev; - props->sys_image_guid = ib_hfi1_sys_image_guid; - props->max_mr_size = ~0ULL; - props->max_qp = hfi1_max_qps; - props->max_qp_wr = hfi1_max_qp_wrs; - props->max_sge = hfi1_max_sges; - props->max_sge_rd = hfi1_max_sges; - props->max_cq = hfi1_max_cqs; - props->max_ah = hfi1_max_ahs; - props->max_cqe = hfi1_max_cqes; - props->max_mr = dev->lk_table.max; - props->max_fmr = dev->lk_table.max; - props->max_map_per_fmr = 32767; - props->max_pd = hfi1_max_pds; - props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; - props->max_qp_init_rd_atom = 255; - /* props->max_res_rd_atom */ - props->max_srq = hfi1_max_srqs; - props->max_srq_wr = hfi1_max_srq_wrs; - props->max_srq_sge = hfi1_max_srq_sges; - /* props->local_ca_ack_delay */ - props->atomic_cap = IB_ATOMIC_GLOB; - props->max_pkeys = hfi1_get_npkeys(dd); - props->max_mcast_grp = hfi1_max_mcast_grps; - props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached; - props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * - props->max_mcast_grp; - - return 0; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; + rdi->dparms.props.vendor_part_id = dd->pcidev->device; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; + rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_qp = hfi1_max_qps; + rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_sge = hfi1_max_sges; + rdi->dparms.props.max_sge_rd = hfi1_max_sges; + rdi->dparms.props.max_cq = hfi1_max_cqs; + rdi->dparms.props.max_ah = hfi1_max_ahs; + rdi->dparms.props.max_cqe = hfi1_max_cqes; + rdi->dparms.props.max_mr = rdi->lkey_table.max; + rdi->dparms.props.max_fmr = rdi->lkey_table.max; + rdi->dparms.props.max_map_per_fmr = 32767; + rdi->dparms.props.max_pd = hfi1_max_pds; + rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = hfi1_max_srqs; + rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs; + rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; } static inline u16 opa_speed_to_ib(u16 in) @@ -1443,33 +1336,24 @@ static inline u16 opa_width_to_ib(u16 in) } } -static int query_port(struct ib_device *ibdev, u8 port, +static int query_port(struct rvt_dev_info *rdi, u8 port_num, struct ib_port_attr *props) { - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - struct hfi1_ibport *ibp = to_iport(ibdev, port); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; u16 lid = ppd->lid; - memset(props, 0, sizeof(*props)); props->lid = lid ? lid : 0; props->lmc = ppd->lmc; - props->sm_lid = ibp->sm_lid; - props->sm_sl = ibp->sm_sl; /* OPA logical states match IB logical states */ props->state = driver_lstate(ppd); props->phys_state = hfi1_ibphys_portstate(ppd); - props->port_cap_flags = ibp->port_cap_flags; props->gid_tbl_len = HFI1_GUIDS_PER_PORT; - props->max_msg_sz = 0x80000000; - props->pkey_tbl_len = hfi1_get_npkeys(dd); - props->bad_pkey_cntr = ibp->pkey_violations; - props->qkey_viol_cntr = ibp->qkey_violations; props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); /* see rate_show() in ib core/sysfs.c */ props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); props->max_vl_num = ppd->vls_supported; - props->init_type_reply = 0; /* Once we are a "first class" citizen and have added the OPA MTUs to * the core we can advertise the larger MTU enum to the ULPs, for now @@ -1483,27 +1367,6 @@ static int query_port(struct ib_device *ibdev, u8 port, 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_2048); - props->subnet_timeout = ibp->subnet_timeout; - - return 0; -} - -static int port_immutable(struct ib_device *ibdev, u8 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - err = query_port(ibdev, port_num, &attr); - if (err) - return err; - - memset(immutable, 0, sizeof(*immutable)); - - immutable->pkey_tbl_len = attr.pkey_tbl_len; - immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA; - immutable->max_mad_size = OPA_MGMT_MAD_SIZE; return 0; } @@ -1547,102 +1410,31 @@ bail: return ret; } -static int modify_port(struct ib_device *ibdev, u8 port, - int port_modify_mask, struct ib_port_modify *props) -{ - struct hfi1_ibport *ibp = to_iport(ibdev, port); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - int ret = 0; - - ibp->port_cap_flags |= props->set_port_cap_mask; - ibp->port_cap_flags &= ~props->clr_port_cap_mask; - if (props->set_port_cap_mask || props->clr_port_cap_mask) - hfi1_cap_mask_chg(ibp); - if (port_modify_mask & IB_PORT_SHUTDOWN) { - set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, - OPA_LINKDOWN_REASON_UNKNOWN); - ret = set_link_state(ppd, HLS_DN_DOWNDEF); - } - if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) - ibp->qkey_violations = 0; - return ret; -} - -static int query_gid(struct ib_device *ibdev, u8 port, - int index, union ib_gid *gid) +static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num) { - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - int ret = 0; - - if (!port || port > dd->num_pports) - ret = -EINVAL; - else { - struct hfi1_ibport *ibp = to_iport(ibdev, port); - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - - gid->global.subnet_prefix = ibp->gid_prefix; - if (index == 0) - gid->global.interface_id = cpu_to_be64(ppd->guid); - else if (index < HFI1_GUIDS_PER_PORT) - gid->global.interface_id = ibp->guids[index - 1]; - else - ret = -EINVAL; - } - - return ret; -} - -static struct ib_pd *alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) -{ - struct hfi1_ibdev *dev = to_idev(ibdev); - struct hfi1_pd *pd; - struct ib_pd *ret; - - /* - * This is actually totally arbitrary. Some correctness tests - * assume there's a maximum number of PDs that can be allocated. - * We don't actually have this limit, but we fail the test if - * we allow allocations of more than we report for this value. - */ - - pd = kmalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - spin_lock(&dev->n_pds_lock); - if (dev->n_pds_allocated == hfi1_max_pds) { - spin_unlock(&dev->n_pds_lock); - kfree(pd); - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - dev->n_pds_allocated++; - spin_unlock(&dev->n_pds_lock); - - /* ib_alloc_pd() will initialize pd->ibpd. */ - pd->user = udata != NULL; - - ret = &pd->ibpd; + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + int ret; -bail: + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, + OPA_LINKDOWN_REASON_UNKNOWN); + ret = set_link_state(ppd, HLS_DN_DOWNDEF); return ret; } -static int dealloc_pd(struct ib_pd *ibpd) +static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) { - struct hfi1_pd *pd = to_ipd(ibpd); - struct hfi1_ibdev *dev = to_idev(ibpd->device); - - spin_lock(&dev->n_pds_lock); - dev->n_pds_allocated--; - spin_unlock(&dev->n_pds_lock); + struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - kfree(pd); + if (guid_index == 0) + *guid = cpu_to_be64(ppd->guid); + else if (guid_index < HFI1_GUIDS_PER_PORT) + *guid = ibp->guids[guid_index - 1]; + else + return -EINVAL; return 0; } @@ -1657,101 +1449,57 @@ u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah) return ibp->sl_to_sc[ah->sl]; } -int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) +static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) { struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; - /* A multicast address requires a GRH (see ch. 8.4.1). */ - if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE && - ah_attr->dlid != HFI1_PERMISSIVE_LID && - !(ah_attr->ah_flags & IB_AH_GRH)) - goto bail; - if ((ah_attr->ah_flags & IB_AH_GRH) && - ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT) - goto bail; - if (ah_attr->dlid == 0) - goto bail; - if (ah_attr->port_num < 1 || - ah_attr->port_num > ibdev->phys_port_cnt) - goto bail; - if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && - ib_rate_to_mbps(ah_attr->static_rate) < 0) - goto bail; - if (ah_attr->sl >= OPA_MAX_SLS) - goto bail; /* test the mapping for validity */ ibp = to_iport(ibdev, ah_attr->port_num); ppd = ppd_from_ibp(ibp); sc5 = ibp->sl_to_sc[ah_attr->sl]; dd = dd_from_ppd(ppd); if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) - goto bail; + return -EINVAL; return 0; -bail: - return -EINVAL; } -/** - * create_ah - create an address handle - * @pd: the protection domain - * @ah_attr: the attributes of the AH - * - * This may be called from interrupt context. - */ -static struct ib_ah *create_ah(struct ib_pd *pd, - struct ib_ah_attr *ah_attr) +static void hfi1_notify_new_ah(struct ib_device *ibdev, + struct ib_ah_attr *ah_attr, + struct rvt_ah *ah) { - struct hfi1_ah *ah; - struct ib_ah *ret; - struct hfi1_ibdev *dev = to_idev(pd->device); - unsigned long flags; - - if (hfi1_check_ah(pd->device, ah_attr)) { - ret = ERR_PTR(-EINVAL); - goto bail; - } - - ah = kmalloc(sizeof(*ah), GFP_ATOMIC); - if (!ah) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - spin_lock_irqsave(&dev->n_ahs_lock, flags); - if (dev->n_ahs_allocated == hfi1_max_ahs) { - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - kfree(ah); - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - dev->n_ahs_allocated++; - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - - /* ib_create_ah() will initialize ah->ibah. */ - ah->attr = *ah_attr; - atomic_set(&ah->refcount, 0); + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; - ret = &ah->ibah; + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ -bail: - return ret; + ibp = to_iport(ibdev, ah_attr->port_num); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[ah->attr.sl]; + dd = dd_from_ppd(ppd); + ah->vl = sc_to_vlt(dd, sc5); + if (ah->vl < num_vls || ah->vl == 15) + ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); } struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) { struct ib_ah_attr attr; struct ib_ah *ah = ERR_PTR(-EINVAL); - struct hfi1_qp *qp0; + struct rvt_qp *qp0; memset(&attr, 0, sizeof(attr)); attr.dlid = dlid; attr.port_num = ppd_from_ibp(ibp)->port; rcu_read_lock(); - qp0 = rcu_dereference(ibp->qp[0]); + qp0 = rcu_dereference(ibp->rvp.qp[0]); if (qp0) ah = ib_create_ah(qp0->ibqp.pd, &attr); rcu_read_unlock(); @@ -1759,51 +1507,6 @@ struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) } /** - * destroy_ah - destroy an address handle - * @ibah: the AH to destroy - * - * This may be called from interrupt context. - */ -static int destroy_ah(struct ib_ah *ibah) -{ - struct hfi1_ibdev *dev = to_idev(ibah->device); - struct hfi1_ah *ah = to_iah(ibah); - unsigned long flags; - - if (atomic_read(&ah->refcount) != 0) - return -EBUSY; - - spin_lock_irqsave(&dev->n_ahs_lock, flags); - dev->n_ahs_allocated--; - spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - - kfree(ah); - - return 0; -} - -static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) -{ - struct hfi1_ah *ah = to_iah(ibah); - - if (hfi1_check_ah(ibah->device, ah_attr)) - return -EINVAL; - - ah->attr = *ah_attr; - - return 0; -} - -static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) -{ - struct hfi1_ah *ah = to_iah(ibah); - - *ah_attr = ah->attr; - - return 0; -} - -/** * hfi1_get_npkeys - return the size of the PKEY table for context 0 * @dd: the hfi1_ib device */ @@ -1812,54 +1515,6 @@ unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) return ARRAY_SIZE(dd->pport[0].pkeys); } -static int query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) -{ - struct hfi1_devdata *dd = dd_from_ibdev(ibdev); - int ret; - - if (index >= hfi1_get_npkeys(dd)) { - ret = -EINVAL; - goto bail; - } - - *pkey = hfi1_get_pkey(to_iport(ibdev, port), index); - ret = 0; - -bail: - return ret; -} - -/** - * alloc_ucontext - allocate a ucontest - * @ibdev: the infiniband device - * @udata: not used by the driver - */ - -static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) -{ - struct hfi1_ucontext *context; - struct ib_ucontext *ret; - - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - ret = &context->ibucontext; - -bail: - return ret; -} - -static int dealloc_ucontext(struct ib_ucontext *context) -{ - kfree(to_iucontext(context)); - return 0; -} - static void init_ibport(struct hfi1_pportdata *ppd) { struct hfi1_ibport *ibp = &ppd->ibport_data; @@ -1871,28 +1526,21 @@ static void init_ibport(struct hfi1_pportdata *ppd) ibp->sc_to_sl[i] = i; } - spin_lock_init(&ibp->lock); + spin_lock_init(&ibp->rvp.lock); /* Set the prefix to the default value (see ch. 4.1.1) */ - ibp->gid_prefix = IB_DEFAULT_GID_PREFIX; - ibp->sm_lid = 0; + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = 0; /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ - ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP | + ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | IB_PORT_CAP_MASK_NOTICE_SUP; - ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; - ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; - ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; - ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; - ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; - - RCU_INIT_POINTER(ibp->qp[0], NULL); - RCU_INIT_POINTER(ibp->qp[1], NULL); -} - -static void verbs_txreq_kmem_cache_ctor(void *obj) -{ - struct verbs_txreq *tx = obj; - - memset(tx, 0, sizeof(*tx)); + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } /** @@ -1903,76 +1551,26 @@ static void verbs_txreq_kmem_cache_ctor(void *obj) int hfi1_register_ib_device(struct hfi1_devdata *dd) { struct hfi1_ibdev *dev = &dd->verbs_dev; - struct ib_device *ibdev = &dev->ibdev; + struct ib_device *ibdev = &dev->rdi.ibdev; struct hfi1_pportdata *ppd = dd->pport; - unsigned i, lk_tab_size; + unsigned i; int ret; size_t lcpysz = IB_DEVICE_NAME_MAX; - u16 descq_cnt; - char buf[TXREQ_NAME_LEN]; - - ret = hfi1_qp_init(dev); - if (ret) - goto err_qp_init; - for (i = 0; i < dd->num_pports; i++) init_ibport(ppd + i); /* Only need to initialize non-zero fields. */ - spin_lock_init(&dev->n_pds_lock); - spin_lock_init(&dev->n_ahs_lock); - spin_lock_init(&dev->n_cqs_lock); - spin_lock_init(&dev->n_qps_lock); - spin_lock_init(&dev->n_srqs_lock); - spin_lock_init(&dev->n_mcast_grps_lock); - init_timer(&dev->mem_timer); - dev->mem_timer.function = mem_timer; - dev->mem_timer.data = (unsigned long) dev; - /* - * The top hfi1_lkey_table_size bits are used to index the - * table. The lower 8 bits can be owned by the user (copied from - * the LKEY). The remaining bits act as a generation number or tag. - */ - spin_lock_init(&dev->lk_table.lock); - dev->lk_table.max = 1 << hfi1_lkey_table_size; - /* ensure generation is at least 4 bits (keys.c) */ - if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) { - dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n", - hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS); - hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS; - } - lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); - dev->lk_table.table = (struct hfi1_mregion __rcu **) - vmalloc(lk_tab_size); - if (dev->lk_table.table == NULL) { - ret = -ENOMEM; - goto err_lk; - } - RCU_INIT_POINTER(dev->dma_mr, NULL); - for (i = 0; i < dev->lk_table.max; i++) - RCU_INIT_POINTER(dev->lk_table.table[i], NULL); - INIT_LIST_HEAD(&dev->pending_mmaps); - spin_lock_init(&dev->pending_lock); + setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev); + seqlock_init(&dev->iowait_lock); - dev->mmap_offset = PAGE_SIZE; - spin_lock_init(&dev->mmap_offset_lock); INIT_LIST_HEAD(&dev->txwait); INIT_LIST_HEAD(&dev->memwait); - descq_cnt = sdma_get_descq_cnt(); - - snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit); - /* SLAB_HWCACHE_ALIGN for AHG */ - dev->verbs_txreq_cache = kmem_cache_create(buf, - sizeof(struct verbs_txreq), - 0, SLAB_HWCACHE_ALIGN, - verbs_txreq_kmem_cache_ctor); - if (!dev->verbs_txreq_cache) { - ret = -ENOMEM; + ret = verbs_txreq_init(dev); + if (ret) goto err_verbs_txreq; - } /* * The system image GUID is supposed to be the same for all @@ -1985,142 +1583,119 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz); ibdev->owner = THIS_MODULE; ibdev->node_guid = cpu_to_be64(ppd->guid); - ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION; - ibdev->uverbs_cmd_mask = - (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | - (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | - (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | - (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | - (1ull << IB_USER_VERBS_CMD_CREATE_AH) | - (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | - (1ull << IB_USER_VERBS_CMD_QUERY_AH) | - (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | - (1ull << IB_USER_VERBS_CMD_REG_MR) | - (1ull << IB_USER_VERBS_CMD_DEREG_MR) | - (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | - (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | - (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | - (1ull << IB_USER_VERBS_CMD_POLL_CQ) | - (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | - (1ull << IB_USER_VERBS_CMD_CREATE_QP) | - (1ull << IB_USER_VERBS_CMD_QUERY_QP) | - (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | - (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | - (1ull << IB_USER_VERBS_CMD_POST_SEND) | - (1ull << IB_USER_VERBS_CMD_POST_RECV) | - (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | - (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | - (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | - (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | - (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); - ibdev->node_type = RDMA_NODE_IB_CA; ibdev->phys_port_cnt = dd->num_pports; - ibdev->num_comp_vectors = 1; ibdev->dma_device = &dd->pcidev->dev; - ibdev->query_device = query_device; ibdev->modify_device = modify_device; - ibdev->query_port = query_port; - ibdev->modify_port = modify_port; - ibdev->query_pkey = query_pkey; - ibdev->query_gid = query_gid; - ibdev->alloc_ucontext = alloc_ucontext; - ibdev->dealloc_ucontext = dealloc_ucontext; - ibdev->alloc_pd = alloc_pd; - ibdev->dealloc_pd = dealloc_pd; - ibdev->create_ah = create_ah; - ibdev->destroy_ah = destroy_ah; - ibdev->modify_ah = modify_ah; - ibdev->query_ah = query_ah; - ibdev->create_srq = hfi1_create_srq; - ibdev->modify_srq = hfi1_modify_srq; - ibdev->query_srq = hfi1_query_srq; - ibdev->destroy_srq = hfi1_destroy_srq; - ibdev->create_qp = hfi1_create_qp; - ibdev->modify_qp = hfi1_modify_qp; - ibdev->query_qp = hfi1_query_qp; - ibdev->destroy_qp = hfi1_destroy_qp; - ibdev->post_send = post_send; - ibdev->post_recv = post_receive; - ibdev->post_srq_recv = hfi1_post_srq_receive; - ibdev->create_cq = hfi1_create_cq; - ibdev->destroy_cq = hfi1_destroy_cq; - ibdev->resize_cq = hfi1_resize_cq; - ibdev->poll_cq = hfi1_poll_cq; - ibdev->req_notify_cq = hfi1_req_notify_cq; - ibdev->get_dma_mr = hfi1_get_dma_mr; - ibdev->reg_user_mr = hfi1_reg_user_mr; - ibdev->dereg_mr = hfi1_dereg_mr; - ibdev->alloc_mr = hfi1_alloc_mr; - ibdev->alloc_fmr = hfi1_alloc_fmr; - ibdev->map_phys_fmr = hfi1_map_phys_fmr; - ibdev->unmap_fmr = hfi1_unmap_fmr; - ibdev->dealloc_fmr = hfi1_dealloc_fmr; - ibdev->attach_mcast = hfi1_multicast_attach; - ibdev->detach_mcast = hfi1_multicast_detach; + + /* keep process mad in the driver */ ibdev->process_mad = hfi1_process_mad; - ibdev->mmap = hfi1_mmap; - ibdev->dma_ops = &hfi1_dma_mapping_ops; - ibdev->get_port_immutable = port_immutable; strncpy(ibdev->node_desc, init_utsname()->nodename, sizeof(ibdev->node_desc)); - ret = ib_register_device(ibdev, hfi1_create_port_files); - if (ret) - goto err_reg; - - ret = hfi1_create_agents(dev); + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; + dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name; + dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; + dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; + dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be; + dd->verbs_dev.rdi.driver_f.query_port_state = query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg; + /* + * Fill in rvt info device attributes. + */ + hfi1_fill_device_attr(dd); + + /* queue pair */ + dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 0; + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; + dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; + dd->verbs_dev.rdi.dparms.qpn_res_end = + dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; + dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA; + dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; + + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send; + dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue; + dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; + dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; + dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + + /* completeion queue */ + snprintf(dd->verbs_dev.rdi.dparms.cq_name, + sizeof(dd->verbs_dev.rdi.dparms.cq_name), + "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.dparms.node = dd->node; + + /* misc settings */ + dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */ + dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + ppd->pkeys); + + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) - goto err_agents; + goto err_verbs_txreq; ret = hfi1_verbs_register_sysfs(dd); if (ret) goto err_class; - goto bail; + return ret; err_class: - hfi1_free_agents(dev); -err_agents: - ib_unregister_device(ibdev); -err_reg: + rvt_unregister_device(&dd->verbs_dev.rdi); err_verbs_txreq: - kmem_cache_destroy(dev->verbs_txreq_cache); - vfree(dev->lk_table.table); -err_lk: - hfi1_qp_exit(dev); -err_qp_init: + verbs_txreq_exit(dev); dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); -bail: return ret; } void hfi1_unregister_ib_device(struct hfi1_devdata *dd) { struct hfi1_ibdev *dev = &dd->verbs_dev; - struct ib_device *ibdev = &dev->ibdev; hfi1_verbs_unregister_sysfs(dd); - hfi1_free_agents(dev); - - ib_unregister_device(ibdev); + rvt_unregister_device(&dd->verbs_dev.rdi); if (!list_empty(&dev->txwait)) dd_dev_err(dd, "txwait list not empty!\n"); if (!list_empty(&dev->memwait)) dd_dev_err(dd, "memwait list not empty!\n"); - if (dev->dma_mr) - dd_dev_err(dd, "DMA MR not NULL!\n"); - hfi1_qp_exit(dev); del_timer_sync(&dev->mem_timer); - kmem_cache_destroy(dev->verbs_txreq_cache); - vfree(dev->lk_table.table); + verbs_txreq_exit(dev); } void hfi1_cnp_rcv(struct hfi1_packet *packet) @@ -2128,7 +1703,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_ib_header *hdr = packet->hdr; - struct hfi1_qp *qp = packet->qp; + struct rvt_qp *qp = packet->qp; u32 lqpn, rqpn = 0; u16 rlid = 0; u8 sl, sc5, sc4_bit, svc_type; @@ -2151,7 +1726,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet) svc_type = IB_CC_SVCTYPE_UD; break; default: - ibp->n_pkt_drops++; + ibp->rvp.n_pkt_drops++; return; } diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 286e468b0479..6c4670fffdbb 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -1,12 +1,11 @@ /* + * Copyright(c) 2015, 2016 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 Intel Corporation. - * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. @@ -18,8 +17,6 @@ * * BSD LICENSE * - * Copyright(c) 2015 Intel Corporation. - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -59,9 +56,13 @@ #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/completion.h> +#include <linux/slab.h> #include <rdma/ib_pack.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_mad.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> +#include <rdma/rdmavt_cq.h> struct hfi1_ctxtdata; struct hfi1_pportdata; @@ -79,12 +80,6 @@ struct hfi1_packet; */ #define HFI1_UVERBS_ABI_VERSION 2 -/* - * Define an ib_cq_notify value that is not valid so we know when CQ - * notifications are armed. - */ -#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) - #define IB_SEQ_NAK (3 << 29) /* AETH NAK opcode values */ @@ -95,17 +90,6 @@ struct hfi1_packet; #define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 #define IB_NAK_INVALID_RD_REQUEST 0x64 -/* Flags for checking QP state (see ib_hfi1_state_ops[]) */ -#define HFI1_POST_SEND_OK 0x01 -#define HFI1_POST_RECV_OK 0x02 -#define HFI1_PROCESS_RECV_OK 0x04 -#define HFI1_PROCESS_SEND_OK 0x08 -#define HFI1_PROCESS_NEXT_SEND_OK 0x10 -#define HFI1_FLUSH_SEND 0x20 -#define HFI1_FLUSH_RECV 0x40 -#define HFI1_PROCESS_OR_FLUSH_SEND \ - (HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND) - /* IB Performance Manager status values */ #define IB_PMA_SAMPLE_STATUS_DONE 0x00 #define IB_PMA_SAMPLE_STATUS_STARTED 0x01 @@ -208,341 +192,18 @@ struct hfi1_pio_header { } __packed; /* - * used for force cacheline alignment for AHG - */ -struct tx_pio_header { - struct hfi1_pio_header phdr; -} ____cacheline_aligned; - -/* - * There is one struct hfi1_mcast for each multicast GID. - * All attached QPs are then stored as a list of - * struct hfi1_mcast_qp. - */ -struct hfi1_mcast_qp { - struct list_head list; - struct hfi1_qp *qp; -}; - -struct hfi1_mcast { - struct rb_node rb_node; - union ib_gid mgid; - struct list_head qp_list; - wait_queue_head_t wait; - atomic_t refcount; - int n_attached; -}; - -/* Protection domain */ -struct hfi1_pd { - struct ib_pd ibpd; - int user; /* non-zero if created from user space */ -}; - -/* Address Handle */ -struct hfi1_ah { - struct ib_ah ibah; - struct ib_ah_attr attr; - atomic_t refcount; -}; - -/* - * This structure is used by hfi1_mmap() to validate an offset - * when an mmap() request is made. The vm_area_struct then uses - * this as its vm_private_data. - */ -struct hfi1_mmap_info { - struct list_head pending_mmaps; - struct ib_ucontext *context; - void *obj; - __u64 offset; - struct kref ref; - unsigned size; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and completion queue entries as a single memory allocation so - * it can be mmap'ed into user space. - */ -struct hfi1_cq_wc { - u32 head; /* index of next entry to fill */ - u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; -}; - -/* - * The completion queue structure. - */ -struct hfi1_cq { - struct ib_cq ibcq; - struct kthread_work comptask; - struct hfi1_devdata *dd; - spinlock_t lock; /* protect changes in this struct */ - u8 notify; - u8 triggered; - struct hfi1_cq_wc *queue; - struct hfi1_mmap_info *ip; -}; - -/* - * A segment is a linear region of low physical memory. - * Used by the verbs layer. - */ -struct hfi1_seg { - void *vaddr; - size_t length; -}; - -/* The number of hfi1_segs that fit in a page. */ -#define HFI1_SEGSZ (PAGE_SIZE / sizeof(struct hfi1_seg)) - -struct hfi1_segarray { - struct hfi1_seg segs[HFI1_SEGSZ]; -}; - -struct hfi1_mregion { - struct ib_pd *pd; /* shares refcnt of ibmr.pd */ - u64 user_base; /* User's address for this region */ - u64 iova; /* IB start address of this region */ - size_t length; - u32 lkey; - u32 offset; /* offset (bytes) to start of region */ - int access_flags; - u32 max_segs; /* number of hfi1_segs in all the arrays */ - u32 mapsz; /* size of the map array */ - u8 page_shift; /* 0 - non unform/non powerof2 sizes */ - u8 lkey_published; /* in global table */ - struct completion comp; /* complete when refcount goes to zero */ - atomic_t refcount; - struct hfi1_segarray *map[0]; /* the segments */ -}; - -/* - * These keep track of the copy progress within a memory region. - * Used by the verbs layer. - */ -struct hfi1_sge { - struct hfi1_mregion *mr; - void *vaddr; /* kernel virtual address of segment */ - u32 sge_length; /* length of the SGE */ - u32 length; /* remaining length of the segment */ - u16 m; /* current index: mr->map[m] */ - u16 n; /* current index: mr->map[m]->segs[n] */ -}; - -/* Memory region */ -struct hfi1_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - struct hfi1_mregion mr; /* must be last */ -}; - -/* - * Send work request queue entry. - * The size of the sg_list is determined when the QP is created and stored - * in qp->s_max_sge. - */ -struct hfi1_swqe { - union { - struct ib_send_wr wr; /* don't use wr.sg_list */ - struct ib_rdma_wr rdma_wr; - struct ib_atomic_wr atomic_wr; - struct ib_ud_wr ud_wr; - }; - u32 psn; /* first packet sequence number */ - u32 lpsn; /* last packet sequence number */ - u32 ssn; /* send sequence number */ - u32 length; /* total length of data in sg_list */ - struct hfi1_sge sg_list[0]; -}; - -/* - * Receive work request queue entry. - * The size of the sg_list is determined when the QP (or SRQ) is created - * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). - */ -struct hfi1_rwqe { - u64 wr_id; - u8 num_sge; - struct ib_sge sg_list[0]; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and receive work queue entries as a single memory allocation so - * it can be mmap'ed into user space. - * Note that the wq array elements are variable size so you can't - * just index into the array to get the N'th element; - * use get_rwqe_ptr() instead. + * hfi1 specific data structures that will be hidden from rvt after the queue + * pair is made common */ -struct hfi1_rwq { - u32 head; /* new work requests posted to the head */ - u32 tail; /* receives pull requests from here. */ - struct hfi1_rwqe wq[0]; -}; - -struct hfi1_rq { - struct hfi1_rwq *wq; - u32 size; /* size of RWQE array */ - u8 max_sge; - /* protect changes in this struct */ - spinlock_t lock ____cacheline_aligned_in_smp; -}; - -struct hfi1_srq { - struct ib_srq ibsrq; - struct hfi1_rq rq; - struct hfi1_mmap_info *ip; - /* send signal when number of RWQEs < limit */ - u32 limit; -}; - -struct hfi1_sge_state { - struct hfi1_sge *sg_list; /* next SGE to be used if any */ - struct hfi1_sge sge; /* progress state for the current SGE */ - u32 total_len; - u8 num_sge; -}; - -/* - * This structure holds the information that the send tasklet needs - * to send a RDMA read response or atomic operation. - */ -struct hfi1_ack_entry { - u8 opcode; - u8 sent; - u32 psn; - u32 lpsn; - union { - struct hfi1_sge rdma_sge; - u64 atomic_data; - }; -}; - -/* - * Variables prefixed with s_ are for the requester (sender). - * Variables prefixed with r_ are for the responder (receiver). - * Variables prefixed with ack_ are for responder replies. - * - * Common variables are protected by both r_rq.lock and s_lock in that order - * which only happens in modify_qp() or changing the QP 'state'. - */ -struct hfi1_qp { - struct ib_qp ibqp; - /* read mostly fields above and below */ - struct ib_ah_attr remote_ah_attr; - struct ib_ah_attr alt_ah_attr; - struct hfi1_qp __rcu *next; /* link list for QPN hash table */ - struct hfi1_swqe *s_wq; /* send work queue */ - struct hfi1_mmap_info *ip; - struct ahg_ib_header *s_hdr; /* next packet header to send */ - /* sc for UC/RC QPs - based on ah for UD */ - u8 s_sc; - unsigned long timeout_jiffies; /* computed from timeout */ - - enum ib_mtu path_mtu; - int srate_mbps; /* s_srate (below) converted to Mbit/s */ - u32 remote_qpn; - u32 pmtu; /* decoded from path_mtu */ - u32 qkey; /* QKEY for this QP (for UD or RD) */ - u32 s_size; /* send work queue size */ - u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ - u32 s_ahgpsn; /* set to the psn in the copy of the header */ - - u8 state; /* QP state */ - u8 allowed_ops; /* high order bits of allowed opcodes */ - u8 qp_access_flags; - u8 alt_timeout; /* Alternate path timeout for this QP */ - u8 timeout; /* Timeout for this QP */ - u8 s_srate; - u8 s_mig_state; - u8 port_num; - u8 s_pkey_index; /* PKEY index to use */ - u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ - u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ - u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ - u8 s_retry_cnt; /* number of times to retry */ - u8 s_rnr_retry_cnt; - u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ - u8 s_max_sge; /* size of s_wq->sg_list */ - u8 s_draining; - - /* start of read/write fields */ - atomic_t refcount ____cacheline_aligned_in_smp; - wait_queue_head_t wait; - - - struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1] - ____cacheline_aligned_in_smp; - struct hfi1_sge_state s_rdma_read_sge; - - spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ - unsigned long r_aflags; - u64 r_wr_id; /* ID for current receive WQE */ - u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ - u32 r_len; /* total length of r_sge */ - u32 r_rcv_len; /* receive data len processed */ - u32 r_psn; /* expected rcv packet sequence number */ - u32 r_msn; /* message sequence number */ - - u8 r_adefered; /* number of acks defered */ - u8 r_state; /* opcode of last packet received */ - u8 r_flags; - u8 r_head_ack_queue; /* index into s_ack_queue[] */ - - struct list_head rspwait; /* link for waiting to respond */ - - struct hfi1_sge_state r_sge; /* current receive data */ - struct hfi1_rq r_rq; /* receive work queue */ - - spinlock_t s_lock ____cacheline_aligned_in_smp; - struct hfi1_sge_state *s_cur_sge; - u32 s_flags; - struct hfi1_swqe *s_wqe; - struct hfi1_sge_state s_sge; /* current send request data */ - struct hfi1_mregion *s_rdma_mr; - struct sdma_engine *s_sde; /* current sde */ - u32 s_cur_size; /* size of send packet in bytes */ - u32 s_len; /* total length of s_sge */ - u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ - u32 s_next_psn; /* PSN for next request */ - u32 s_last_psn; /* last response PSN processed */ - u32 s_sending_psn; /* lowest PSN that is being sent */ - u32 s_sending_hpsn; /* highest PSN that is being sent */ - u32 s_psn; /* current packet sequence number */ - u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ - u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ - u32 s_head; /* new entries added here */ - u32 s_tail; /* next entry to process */ - u32 s_cur; /* current work queue entry */ - u32 s_acked; /* last un-ACK'ed entry */ - u32 s_last; /* last completed entry */ - u32 s_ssn; /* SSN of tail entry */ - u32 s_lsn; /* limit sequence number (credit) */ - u16 s_hdrwords; /* size of s_hdr in 32 bit words */ - u16 s_rdma_ack_cnt; - s8 s_ahgidx; - u8 s_state; /* opcode of last packet sent */ - u8 s_ack_state; /* opcode of packet to ACK */ - u8 s_nak_state; /* non-zero if NAK is pending */ - u8 r_nak_state; /* non-zero if NAK is pending */ - u8 s_retry; /* requester retry counter */ - u8 s_rnr_retry; /* requester RNR retry counter */ - u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ - u8 s_tail_ack_queue; /* index into s_ack_queue[] */ - - struct hfi1_sge_state s_ack_rdma_sge; - struct timer_list s_timer; - +struct hfi1_qp_priv { + struct ahg_ib_header *s_hdr; /* next header to send */ + struct sdma_engine *s_sde; /* current sde */ + struct send_context *s_sendcontext; /* current sendcontext */ + u8 s_sc; /* SC[0..4] for next packet */ + u8 r_adefered; /* number of acks defered */ struct iowait s_iowait; - - struct hfi1_sge r_sg_list[0] /* verified SGEs */ - ____cacheline_aligned_in_smp; + struct timer_list s_rnr_timer; + struct rvt_qp *owner; }; /* @@ -553,123 +214,11 @@ struct hfi1_pkt_state { struct hfi1_ibdev *dev; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; + struct verbs_txreq *s_txreq; }; -/* - * Atomic bit definitions for r_aflags. - */ -#define HFI1_R_WRID_VALID 0 -#define HFI1_R_REWIND_SGE 1 - -/* - * Bit definitions for r_flags. - */ -#define HFI1_R_REUSE_SGE 0x01 -#define HFI1_R_RDMAR_SEQ 0x02 -/* defer ack until end of interrupt session */ -#define HFI1_R_RSP_DEFERED_ACK 0x04 -/* relay ack to send engine */ -#define HFI1_R_RSP_SEND 0x08 -#define HFI1_R_COMM_EST 0x10 - -/* - * Bit definitions for s_flags. - * - * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled - * HFI1_S_BUSY - send tasklet is processing the QP - * HFI1_S_TIMER - the RC retry timer is active - * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics - * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs - * before processing the next SWQE - * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete - * before processing the next SWQE - * HFI1_S_WAIT_RNR - waiting for RNR timeout - * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE - * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating - * next send completion entry not via send DMA - * HFI1_S_WAIT_PIO - waiting for a send buffer to be available - * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available - * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available - * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available - * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue - * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests - * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK - * HFI1_S_ECN - a BECN was queued to the send engine - */ -#define HFI1_S_SIGNAL_REQ_WR 0x0001 -#define HFI1_S_BUSY 0x0002 -#define HFI1_S_TIMER 0x0004 -#define HFI1_S_RESP_PENDING 0x0008 -#define HFI1_S_ACK_PENDING 0x0010 -#define HFI1_S_WAIT_FENCE 0x0020 -#define HFI1_S_WAIT_RDMAR 0x0040 -#define HFI1_S_WAIT_RNR 0x0080 -#define HFI1_S_WAIT_SSN_CREDIT 0x0100 -#define HFI1_S_WAIT_DMA 0x0200 -#define HFI1_S_WAIT_PIO 0x0400 -#define HFI1_S_WAIT_TX 0x0800 -#define HFI1_S_WAIT_DMA_DESC 0x1000 -#define HFI1_S_WAIT_KMEM 0x2000 -#define HFI1_S_WAIT_PSN 0x4000 -#define HFI1_S_WAIT_ACK 0x8000 -#define HFI1_S_SEND_ONE 0x10000 -#define HFI1_S_UNLIMITED_CREDIT 0x20000 -#define HFI1_S_AHG_VALID 0x40000 -#define HFI1_S_AHG_CLEAR 0x80000 -#define HFI1_S_ECN 0x100000 - -/* - * Wait flags that would prevent any packet type from being sent. - */ -#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \ - HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM) - -/* - * Wait flags that would prevent send work requests from making progress. - */ -#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \ - HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \ - HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK) - -#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND) - #define HFI1_PSN_CREDIT 16 -/* - * Since struct hfi1_swqe is not a fixed size, we can't simply index into - * struct hfi1_qp.s_wq. This function does the array index computation. - */ -static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp, - unsigned n) -{ - return (struct hfi1_swqe *)((char *)qp->s_wq + - (sizeof(struct hfi1_swqe) + - qp->s_max_sge * - sizeof(struct hfi1_sge)) * n); -} - -/* - * Since struct hfi1_rwqe is not a fixed size, we can't simply index into - * struct hfi1_rwq.wq. This function does the array index computation. - */ -static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n) -{ - return (struct hfi1_rwqe *) - ((char *) rq->wq->wq + - (sizeof(struct hfi1_rwqe) + - rq->max_sge * sizeof(struct ib_sge)) * n); -} - -#define MAX_LKEY_TABLE_BITS 23 - -struct hfi1_lkey_table { - spinlock_t lock; /* protect changes in this struct */ - u32 next; /* next unused index (speeds search) */ - u32 gen; /* generation count */ - u32 max; /* size of the table */ - struct hfi1_mregion __rcu **table; -}; - struct hfi1_opcode_stats { u64 n_packets; /* number of packets */ u64 n_bytes; /* total number of bytes */ @@ -690,75 +239,20 @@ static inline void inc_opstats( } struct hfi1_ibport { - struct hfi1_qp __rcu *qp[2]; - struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ - struct hfi1_ah *sm_ah; - struct hfi1_ah *smi_ah; - struct rb_root mcast_tree; - spinlock_t lock; /* protect changes in this struct */ - - /* non-zero when timer is set */ - unsigned long mkey_lease_timeout; - unsigned long trap_timeout; - __be64 gid_prefix; /* in network order */ - __be64 mkey; + struct rvt_qp __rcu *qp[2]; + struct rvt_ibport rvp; + __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */ - u64 tid; /* TID for traps */ - u64 n_rc_resends; - u64 n_seq_naks; - u64 n_rdma_seq; - u64 n_rnr_naks; - u64 n_other_naks; - u64 n_loop_pkts; - u64 n_pkt_drops; - u64 n_vl15_dropped; - u64 n_rc_timeouts; - u64 n_dmawait; - u64 n_unaligned; - u64 n_rc_dupreq; - u64 n_rc_seqnak; - - /* Hot-path per CPU counters to avoid cacheline trading to update */ - u64 z_rc_acks; - u64 z_rc_qacks; - u64 z_rc_delayed_comp; - u64 __percpu *rc_acks; - u64 __percpu *rc_qacks; - u64 __percpu *rc_delayed_comp; - - u32 port_cap_flags; - u32 pma_sample_start; - u32 pma_sample_interval; - __be16 pma_counter_select[5]; - u16 pma_tag; - u16 pkey_violations; - u16 qkey_violations; - u16 mkey_violations; - u16 mkey_lease_period; - u16 sm_lid; - u16 repress_traps; - u8 sm_sl; - u8 mkeyprot; - u8 subnet_timeout; - u8 vl_high_limit; + /* the first 16 entries are sl_to_vl for !OPA */ u8 sl_to_sc[32]; u8 sc_to_sl[32]; }; - -struct hfi1_qp_ibdev; struct hfi1_ibdev { - struct ib_device ibdev; - struct list_head pending_mmaps; - spinlock_t mmap_offset_lock; /* protect mmap_offset */ - u32 mmap_offset; - struct hfi1_mregion __rcu *dma_mr; - - struct hfi1_qp_ibdev *qp_dev; + struct rvt_dev_info rdi; /* Must be first */ /* QP numbers are shared by all IB ports */ - struct hfi1_lkey_table lk_table; /* protect wait lists */ seqlock_t iowait_lock; struct list_head txwait; /* list for wait verbs_txreq */ @@ -767,26 +261,11 @@ struct hfi1_ibdev { struct kmem_cache *verbs_txreq_cache; struct timer_list mem_timer; - /* other waiters */ - spinlock_t pending_lock; - u64 n_piowait; + u64 n_piodrain; u64 n_txwait; u64 n_kmem_wait; - u64 n_send_schedule; - - u32 n_pds_allocated; /* number of PDs allocated for device */ - spinlock_t n_pds_lock; - u32 n_ahs_allocated; /* number of AHs allocated for device */ - spinlock_t n_ahs_lock; - u32 n_cqs_allocated; /* number of CQs allocated for device */ - spinlock_t n_cqs_lock; - u32 n_qps_allocated; /* number of QPs allocated for device */ - spinlock_t n_qps_lock; - u32 n_srqs_allocated; /* number of SRQs allocated for device */ - spinlock_t n_srqs_lock; - u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ - spinlock_t n_mcast_grps_lock; + #ifdef CONFIG_DEBUG_FS /* per HFI debugfs */ struct dentry *hfi1_ibdev_dbg; @@ -795,66 +274,31 @@ struct hfi1_ibdev { #endif }; -struct hfi1_verbs_counters { - u64 symbol_error_counter; - u64 link_error_recovery_counter; - u64 link_downed_counter; - u64 port_rcv_errors; - u64 port_rcv_remphys_errors; - u64 port_xmit_discards; - u64 port_xmit_data; - u64 port_rcv_data; - u64 port_xmit_packets; - u64 port_rcv_packets; - u32 local_link_integrity_errors; - u32 excessive_buffer_overrun_errors; - u32 vl15_dropped; -}; - -static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr) -{ - return container_of(ibmr, struct hfi1_mr, ibmr); -} - -static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd) -{ - return container_of(ibpd, struct hfi1_pd, ibpd); -} - -static inline struct hfi1_ah *to_iah(struct ib_ah *ibah) -{ - return container_of(ibah, struct hfi1_ah, ibah); -} - -static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq) +static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) { - return container_of(ibcq, struct hfi1_cq, ibcq); -} + struct rvt_dev_info *rdi; -static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq) -{ - return container_of(ibsrq, struct hfi1_srq, ibsrq); + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct hfi1_ibdev, rdi); } -static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp) +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) { - return container_of(ibqp, struct hfi1_qp, ibqp); -} + struct hfi1_qp_priv *priv; -static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) -{ - return container_of(ibdev, struct hfi1_ibdev, ibdev); + priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait); + return priv->owner; } /* * Send if not busy or waiting for I/O and either * a RC response is pending or we can process send work requests. */ -static inline int hfi1_send_ok(struct hfi1_qp *qp) +static inline int hfi1_send_ok(struct rvt_qp *qp) { - return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) && - (qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) || - !(qp->s_flags & HFI1_S_ANY_WAIT_SEND)); + return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); } /* @@ -862,7 +306,7 @@ static inline int hfi1_send_ok(struct hfi1_qp *qp) */ void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, u32 qp1, u32 qp2, u16 lid1, u16 lid2); -void hfi1_cap_mask_chg(struct hfi1_ibport *ibp); +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); void hfi1_node_desc_chg(struct hfi1_ibport *ibp); int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, @@ -870,8 +314,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, const struct ib_mad_hdr *in_mad, size_t in_mad_size, struct ib_mad_hdr *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); -int hfi1_create_agents(struct hfi1_ibdev *dev); -void hfi1_free_agents(struct hfi1_ibdev *dev); /* * The PSN_MASK and PSN_SHIFT allow for @@ -901,7 +343,7 @@ void hfi1_free_agents(struct hfi1_ibdev *dev); */ static inline int cmp_msn(u32 a, u32 b) { - return (((int) a) - ((int) b)) << 8; + return (((int)a) - ((int)b)) << 8; } /* @@ -910,7 +352,7 @@ static inline int cmp_msn(u32 a, u32 b) */ static inline int cmp_psn(u32 a, u32 b) { - return (((int) a) - ((int) b)) << PSN_SHIFT; + return (((int)a) - ((int)b)) << PSN_SHIFT; } /* @@ -929,23 +371,15 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } -struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid); - -int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); - -int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); - -int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp); - struct verbs_txreq; void hfi1_put_txreq(struct verbs_txreq *tx); -int hfi1_verbs_send(struct hfi1_qp *qp, struct hfi1_pkt_state *ps); +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length, - int release); +void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, + int release, int copy_last); -void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release); +void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release); void hfi1_cnp_rcv(struct hfi1_packet *packet); @@ -957,147 +391,75 @@ void hfi1_rc_hdrerr( struct hfi1_ctxtdata *rcd, struct hfi1_ib_header *hdr, u32 rcv_flags, - struct hfi1_qp *qp); + struct rvt_qp *qp); u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); -int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); - struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); void hfi1_rc_rnr_retry(unsigned long arg); +void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to); +void hfi1_rc_timeout(unsigned long arg); +void hfi1_del_timers_sync(struct rvt_qp *qp); +void hfi1_stop_rc_timers(struct rvt_qp *qp); -void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr); +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr); -void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err); +void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err); void hfi1_ud_rcv(struct hfi1_packet *packet); int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); -int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region); - -void hfi1_free_lkey(struct hfi1_mregion *mr); - -int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd, - struct hfi1_sge *isge, struct ib_sge *sge, int acc); - -int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge, - u32 len, u64 vaddr, u32 rkey, int acc); - -int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); - -struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, - struct ib_srq_init_attr *srq_init_attr, - struct ib_udata *udata); - -int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask, - struct ib_udata *udata); - -int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); - -int hfi1_destroy_srq(struct ib_srq *ibsrq); - -int hfi1_cq_init(struct hfi1_devdata *dd); - -void hfi1_cq_exit(struct hfi1_devdata *dd); - -void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig); - -int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); - -struct ib_cq *hfi1_create_cq( - struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_ucontext *context, - struct ib_udata *udata); - -int hfi1_destroy_cq(struct ib_cq *ibcq); - -int hfi1_req_notify_cq( - struct ib_cq *ibcq, - enum ib_cq_notify_flags notify_flags); - -int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); - -struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc); - -struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt_addr, int mr_access_flags, - struct ib_udata *udata); +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only); -int hfi1_dereg_mr(struct ib_mr *ibmr); +void hfi1_migrate_qp(struct rvt_qp *qp); -struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, - u32 max_entries); +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); -struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); -int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova); +int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); -int hfi1_unmap_fmr(struct list_head *fmr_list); +extern const u32 rc_only_opcode; +extern const u32 uc_only_opcode; -int hfi1_dealloc_fmr(struct ib_fmr *ibfmr); - -static inline void hfi1_get_mr(struct hfi1_mregion *mr) -{ - atomic_inc(&mr->refcount); -} - -static inline void hfi1_put_mr(struct hfi1_mregion *mr) +static inline u8 get_opcode(struct hfi1_ib_header *h) { - if (unlikely(atomic_dec_and_test(&mr->refcount))) - complete(&mr->comp); -} + u16 lnh = be16_to_cpu(h->lrh[0]) & 3; -static inline void hfi1_put_ss(struct hfi1_sge_state *ss) -{ - while (ss->num_sge) { - hfi1_put_mr(ss->sge.mr); - if (--ss->num_sge) - ss->sge = *ss->sg_list++; - } + if (lnh == IB_LNH_IBA_LOCAL) + return be32_to_cpu(h->u.oth.bth[0]) >> 24; + else + return be32_to_cpu(h->u.l.oth.bth[0]) >> 24; } -void hfi1_release_mmap_info(struct kref *ref); - -struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size, - struct ib_ucontext *context, - void *obj); - -void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip, - u32 size, void *obj); - -int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); - -int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only); - int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, - int has_grh, struct hfi1_qp *qp, u32 bth0); + int has_grh, struct rvt_qp *qp, u32 bth0); u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, struct ib_global_route *grh, u32 hwords, u32 nwords); -void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, - u32 bth0, u32 bth2, int middle); +void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps); -void hfi1_do_send(struct work_struct *work); +void _hfi1_do_send(struct work_struct *work); -void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe, +void hfi1_do_send(struct rvt_qp *qp); + +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status); -void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn); +void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn); -int hfi1_make_rc_req(struct hfi1_qp *qp); +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -int hfi1_make_uc_req(struct hfi1_qp *qp); +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -int hfi1_make_ud_req(struct hfi1_qp *qp); +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); int hfi1_register_ib_device(struct hfi1_devdata *); @@ -1107,24 +469,42 @@ void hfi1_ib_rcv(struct hfi1_packet *packet); unsigned hfi1_get_npkeys(struct hfi1_devdata *); -int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct hfi1_pkt_state *ps, +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5); +int hfi1_wss_init(void); +void hfi1_wss_exit(void); + +/* platform specific: return the lowest level cache (llc) size, in KiB */ +static inline int wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static inline void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. + */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; extern const u8 hdr_len_by_opcode[]; -extern const int ib_hfi1_state_ops[]; +extern const int ib_rvt_state_ops[]; extern __be64 ib_hfi1_sys_image_guid; /* in network order */ -extern unsigned int hfi1_lkey_table_size; - extern unsigned int hfi1_max_cqes; extern unsigned int hfi1_max_cqs; @@ -1145,8 +525,8 @@ extern unsigned int hfi1_max_srq_sges; extern unsigned int hfi1_max_srq_wrs; -extern const u32 ib_hfi1_rnr_table[]; +extern unsigned short piothreshold; -extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops; +extern const u32 ib_hfi1_rnr_table[]; #endif /* HFI1_VERBS_H */ diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.c b/drivers/staging/rdma/hfi1/verbs_txreq.c new file mode 100644 index 000000000000..bc95c4112c61 --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs_txreq.c @@ -0,0 +1,149 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" +#include "trace.h" + +#define TXREQ_LEN 24 + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct rvt_qp *qp; + unsigned long flags; + unsigned int seq; + struct hfi1_qp_priv *priv; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) + rvt_put_mr(tx->mr); + + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + hfi1_qp_wakeup(qp, RVT_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); +} + +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) +{ + struct verbs_txreq *tx = ERR_PTR(-EBUSY); + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + write_seqlock(&dev->iowait_lock); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + struct hfi1_qp_priv *priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (tx) + goto out; + priv = qp->priv; + if (list_empty(&priv->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->s_iowait.list, &dev->txwait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX); + atomic_inc(&qp->refcount); + } + qp->s_flags &= ~RVT_S_BUSY; + } +out: + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + return tx; +} + +static void verbs_txreq_kmem_cache_ctor(void *obj) +{ + struct verbs_txreq *tx = (struct verbs_txreq *)obj; + + memset(tx, 0, sizeof(*tx)); +} + +int verbs_txreq_init(struct hfi1_ibdev *dev) +{ + char buf[TXREQ_LEN]; + struct hfi1_devdata *dd = dd_from_dev(dev); + + snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit); + dev->verbs_txreq_cache = kmem_cache_create(buf, + sizeof(struct verbs_txreq), + 0, SLAB_HWCACHE_ALIGN, + verbs_txreq_kmem_cache_ctor); + if (!dev->verbs_txreq_cache) + return -ENOMEM; + return 0; +} + +void verbs_txreq_exit(struct hfi1_ibdev *dev) +{ + kmem_cache_destroy(dev->verbs_txreq_cache); + dev->verbs_txreq_cache = NULL; +} diff --git a/drivers/staging/rdma/hfi1/verbs_txreq.h b/drivers/staging/rdma/hfi1/verbs_txreq.h new file mode 100644 index 000000000000..1cf69b2fe4a5 --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs_txreq.h @@ -0,0 +1,116 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_TXREQ_H +#define HFI1_VERBS_TXREQ_H + +#include <linux/types.h> +#include <linux/slab.h> + +#include "verbs.h" +#include "sdma_txreq.h" +#include "iowait.h" + +struct verbs_txreq { + struct hfi1_pio_header phdr; + struct sdma_txreq txreq; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; + struct sdma_engine *sde; + struct send_context *psc; + u16 hdr_dwords; +}; + +struct hfi1_ibdev; +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp); + +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) +{ + struct verbs_txreq *tx; + struct hfi1_qp_priv *priv = qp->priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (unlikely(!tx)) { + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (IS_ERR(tx)) + return tx; + } + tx->qp = qp; + tx->mr = NULL; + tx->sde = priv->s_sde; + tx->psc = priv->s_sendcontext; + /* so that we can test if the sdma decriptors are there */ + tx->txreq.num_desc = 0; + return tx; +} + +static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) +{ + return &tx->txreq; +} + +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +{ + struct sdma_txreq *stx; + struct hfi1_qp_priv *priv = qp->priv; + + stx = iowait_get_txhead(&priv->s_iowait); + if (stx) + return container_of(stx, struct verbs_txreq, txreq); + return NULL; +} + +void hfi1_put_txreq(struct verbs_txreq *tx); +int verbs_txreq_init(struct hfi1_ibdev *dev); +void verbs_txreq_exit(struct hfi1_ibdev *dev); + +#endif /* HFI1_VERBS_TXREQ_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a0e8cc8dcc67..8541a913f6a3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -219,6 +219,7 @@ enum { MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, + MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, }; enum { @@ -1160,6 +1161,8 @@ enum mlx4_net_trans_promisc_mode { MLX4_FS_REGULAR = 1, MLX4_FS_ALL_DEFAULT, MLX4_FS_MC_DEFAULT, + MLX4_FS_MIRROR_RX_PORT, + MLX4_FS_MIRROR_SX_PORT, MLX4_FS_UC_SNIFFER, MLX4_FS_MC_SNIFFER, MLX4_FS_MODE_NUM, /* should be last */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 987764afa65c..4b531c44b3c7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -105,6 +105,29 @@ __mlx5_mask(typ, fld)) ___t; \ }) +/* Big endian getters */ +#define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\ + __mlx5_64_off(typ, fld))) + +#define MLX5_GET_BE(type_t, typ, p, fld) ({ \ + type_t tmp; \ + switch (sizeof(tmp)) { \ + case sizeof(u8): \ + tmp = (__force type_t)MLX5_GET(typ, p, fld); \ + break; \ + case sizeof(u16): \ + tmp = (__force type_t)cpu_to_be16(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u32): \ + tmp = (__force type_t)cpu_to_be32(MLX5_GET(typ, p, fld)); \ + break; \ + case sizeof(u64): \ + tmp = (__force type_t)MLX5_GET64_BE(typ, p, fld); \ + break; \ + } \ + tmp; \ + }) + enum { MLX5_MAX_COMMANDS = 32, MLX5_CMD_DATA_BLOCK_SIZE = 512, @@ -1196,6 +1219,8 @@ enum mlx5_cap_type { MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, MLX5_CAP_ESWITCH, + MLX5_CAP_RESERVED, + MLX5_CAP_VECTOR_CALC, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1258,6 +1283,10 @@ enum mlx5_cap_type { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) +#define MLX5_CAP_VECTOR_CALC(mdev, cap) \ + MLX5_GET(vector_calc_cap, \ + mdev->hca_caps_cur[MLX5_CAP_VECTOR_CALC], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, @@ -1284,7 +1313,8 @@ enum { MLX5_RFC_3635_COUNTERS_GROUP = 0x3, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5, MLX5_PER_PRIORITY_COUNTERS_GROUP = 0x10, - MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11 + MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11, + MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, }; static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) @@ -1294,6 +1324,11 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } -#define MLX5_BY_PASS_NUM_PRIOS 9 +#define MLX5_BY_PASS_NUM_REGULAR_PRIOS 8 +#define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 8 +#define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 +#define MLX5_BY_PASS_NUM_PRIOS (MLX5_BY_PASS_NUM_REGULAR_PRIOS +\ + MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS +\ + MLX5_BY_PASS_NUM_MULTICAST_PRIOS) #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1e3006dcf35d..e1d987fb49b2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -338,7 +338,7 @@ struct mlx5_core_sig_ctx { u32 sigerr_count; }; -struct mlx5_core_mr { +struct mlx5_core_mkey { u64 iova; u64 size; u32 key; @@ -426,7 +426,7 @@ struct mlx5_srq_table { struct radix_tree_root tree; }; -struct mlx5_mr_table { +struct mlx5_mkey_table { /* protect radix tree */ rwlock_t lock; @@ -484,9 +484,9 @@ struct mlx5_priv { struct mlx5_cq_table cq_table; /* end: cq staff */ - /* start: mr staff */ - struct mlx5_mr_table mr_table; - /* end: mr staff */ + /* start: mkey staff */ + struct mlx5_mkey_table mkey_table; + /* end: mkey staff */ /* start: alloc staff */ /* protect buffer alocation according to numa node */ @@ -613,7 +613,10 @@ struct mlx5_pas { }; enum port_state_policy { - MLX5_AAA_000 + MLX5_POLICY_DOWN = 0, + MLX5_POLICY_UP = 1, + MLX5_POLICY_FOLLOW = 2, + MLX5_POLICY_INVALID = 0xffffffff }; enum phy_port_state { @@ -706,8 +709,7 @@ void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); int mlx5_cmd_status_to_err_v2(void *ptr); -int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type, - enum mlx5_cap_mode cap_mode); +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, @@ -739,16 +741,18 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_query_srq_mbox_out *out); int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); -void mlx5_init_mr_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev); -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +void mlx5_init_mkey_table(struct mlx5_core_dev *dev); +void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, struct mlx5_create_mkey_mbox_in *in, int inlen, mlx5_cmd_cbk_t callback, void *context, struct mlx5_create_mkey_mbox_out *out); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_query_mkey_mbox_out *out, int outlen); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, +int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); @@ -847,6 +851,8 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); +int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + u8 port_num, void *out, size_t sz); static inline int fw_initializing(struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8230caa3fb6e..8dec5508d93d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,10 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +enum { + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, +}; + #define LEFTOVERS_RULE_NUM 2 static inline void build_leftovers_ft_param(int *priority, int *n_ent, @@ -52,6 +56,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, MLX5_FLOW_NAMESPACE_LEFTOVERS, + MLX5_FLOW_NAMESPACE_ANCHOR, MLX5_FLOW_NAMESPACE_FDB, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e540fc2b..bb9e07ca6534 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -458,7 +458,8 @@ struct mlx5_ifc_ads_bits { }; struct mlx5_ifc_flow_table_nic_cap_bits { - u8 reserved_at_0[0x200]; + u8 nic_rx_multi_path_tirs[0x1]; + u8 reserved_at_1[0x1ff]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; @@ -615,6 +616,33 @@ struct mlx5_ifc_odp_cap_bits { u8 reserved_at_e0[0x720]; }; +struct mlx5_ifc_calc_op { + u8 reserved_at_0[0x10]; + u8 reserved_at_10[0x9]; + u8 op_swap_endianness[0x1]; + u8 op_min[0x1]; + u8 op_xor[0x1]; + u8 op_or[0x1]; + u8 op_and[0x1]; + u8 op_max[0x1]; + u8 op_add[0x1]; +}; + +struct mlx5_ifc_vector_calc_cap_bits { + u8 calc_matrix[0x1]; + u8 reserved_at_1[0x1f]; + u8 reserved_at_20[0x8]; + u8 max_vec_count[0x8]; + u8 reserved_at_30[0xd]; + u8 max_chunk_size[0x3]; + struct mlx5_ifc_calc_op calc0; + struct mlx5_ifc_calc_op calc1; + struct mlx5_ifc_calc_op calc2; + struct mlx5_ifc_calc_op calc3; + + u8 reserved_at_e0[0x720]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -736,7 +764,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; - u8 reserved_at_200[0xe]; + u8 reserved_at_200[0x3]; + u8 ipoib_basic_offloads[0x1]; + u8 reserved_at_204[0xa]; u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; @@ -767,10 +797,14 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cd[0x1]; u8 reserved_at_22c[0x1]; u8 apm[0x1]; - u8 reserved_at_22e[0x7]; + u8 vector_calc[0x1]; + u8 reserved_at_22f[0x1]; + u8 imaicl[0x1]; + u8 reserved_at_231[0x4]; u8 qkv[0x1]; u8 pkv[0x1]; - u8 reserved_at_237[0x4]; + u8 set_deth_sqpn[0x1]; + u8 reserved_at_239[0x3]; u8 xrc[0x1]; u8 ud[0x1]; u8 uc[0x1]; @@ -1208,6 +1242,36 @@ struct mlx5_ifc_phys_layer_cntrs_bits { u8 reserved_at_640[0x180]; }; +struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { + u8 symbol_error_counter[0x10]; + + u8 link_error_recovery_counter[0x8]; + + u8 link_downed_counter[0x8]; + + u8 port_rcv_errors[0x10]; + + u8 port_rcv_remote_physical_errors[0x10]; + + u8 port_rcv_switch_relay_errors[0x10]; + + u8 port_xmit_discards[0x10]; + + u8 port_xmit_constraint_errors[0x8]; + + u8 port_rcv_constraint_errors[0x8]; + + u8 reserved_at_70[0x8]; + + u8 link_overrun_errors[0x8]; + + u8 reserved_at_80[0x10]; + + u8 vl_15_dropped[0x10]; + + u8 reserved_at_a0[0xa0]; +}; + struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { u8 transmit_queue_high[0x20]; @@ -1780,7 +1844,7 @@ struct mlx5_ifc_qpc_bits { u8 log_sq_size[0x4]; u8 reserved_at_55[0x6]; u8 rlky[0x1]; - u8 reserved_at_5c[0x4]; + u8 ulp_stateless_offload_mode[0x4]; u8 counter_set_id[0x8]; u8 uar_page[0x18]; @@ -1904,6 +1968,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; struct mlx5_ifc_e_switch_cap_bits e_switch_cap; + struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap; u8 reserved_at_0[0x8000]; }; @@ -2618,6 +2683,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout; struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout; struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -3126,7 +3192,8 @@ struct mlx5_ifc_query_vport_counter_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 reserved_at_41[0xb]; + u8 port_num[0x4]; u8 vport_number[0x10]; u8 reserved_at_60[0x60]; @@ -3629,6 +3696,12 @@ struct mlx5_ifc_query_hca_vport_pkey_in_bits { u8 pkey_index[0x10]; }; +enum { + MLX5_HCA_VPORT_SEL_PORT_GUID = 1 << 0, + MLX5_HCA_VPORT_SEL_NODE_GUID = 1 << 1, + MLX5_HCA_VPORT_SEL_STATE_POLICY = 1 << 2, +}; + struct mlx5_ifc_query_hca_vport_gid_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6954,6 +7027,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_peir_reg_bits peir_reg; struct mlx5_ifc_pelc_reg_bits pelc_reg; struct mlx5_ifc_pfcc_reg_bits pfcc_reg; + struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_pifr_reg_bits pifr_reg; struct mlx5_ifc_pipg_reg_bits pipg_reg; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 5b8c89ffaa58..cf031a3f16c5 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -499,7 +499,8 @@ struct mlx5_qp_context { u8 reserved2[4]; __be32 next_send_psn; __be32 cqn_send; - u8 reserved3[8]; + __be32 deth_sqpn; + u8 reserved3[4]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -621,9 +622,9 @@ static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); } -static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) +static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key) { - return radix_tree_lookup(&dev->priv.mr_table.tree, key); + return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } struct mlx5_page_fault_resume_mbox_in { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 123771003e68..bd93e6323603 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -92,5 +92,12 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + int vf, u8 port_num, void *out, + size_t out_sz); +int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + int vf, + struct mlx5_hca_vport_context *req); #endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5440b7b705eb..7b4ae218b90b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1147,6 +1147,9 @@ struct net_device_ops { struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + int (*ndo_set_vf_guid)(struct net_device *dev, + int vf, u64 guid, + int guid_type); int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 0ff049bd9ad4..37dd534cbeab 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -424,11 +424,11 @@ typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, /** * ib_mad_snoop_handler - Callback handler for snooping sent MADs. * @mad_agent: MAD agent that snooped the MAD. - * @send_wr: Work request information on the sent MAD. + * @send_buf: send MAD data buffer. * @mad_send_wc: Work completion information on the sent MAD. Valid * only for snooping that occurs on a send completion. * - * Clients snooping MADs should not modify data referenced by the @send_wr + * Clients snooping MADs should not modify data referenced by the @send_buf * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 284b00c8fea4..fb2cef4e9747 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -56,6 +56,7 @@ #include <linux/string.h> #include <linux/slab.h> +#include <linux/if_link.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <asm/uaccess.h> @@ -97,6 +98,11 @@ enum rdma_node_type { RDMA_NODE_USNIC_UDP, }; +enum { + /* set the local administered indication */ + IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, +}; + enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, @@ -212,6 +218,8 @@ enum ib_device_cap_flags { IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), + IB_DEVICE_SG_GAPS_REG = (1ULL << 32), + IB_DEVICE_VIRTUAL_FUNCTION = ((u64)1 << 33), }; enum ib_signature_prot_cap { @@ -273,7 +281,7 @@ struct ib_device_attr { u32 hw_ver; int max_qp; int max_qp_wr; - int device_cap_flags; + u64 device_cap_flags; int max_sge; int max_sge_rd; int max_cq; @@ -489,6 +497,7 @@ union rdma_protocol_stats { | RDMA_CORE_CAP_OPA_MAD) struct ib_port_attr { + u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; @@ -508,6 +517,7 @@ struct ib_port_attr { u8 active_width; u8 active_speed; u8 phys_state; + bool grh_required; }; enum ib_device_modify_flags { @@ -613,6 +623,7 @@ enum { }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) +#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 @@ -662,10 +673,15 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); * @IB_MR_TYPE_SIGNATURE: memory region that is used for * signature operations (data-integrity * capable regions) + * @IB_MR_TYPE_SG_GAPS: memory region that is capable to + * register any arbitrary sg lists (without + * the normal mr constraints - see + * ib_map_mr_sg) */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SIGNATURE, + IB_MR_TYPE_SG_GAPS, }; /** @@ -1487,6 +1503,11 @@ enum ib_flow_domain { IB_FLOW_DOMAIN_NUM /* Must be last */ }; +enum ib_flow_flags { + IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ + IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */ +}; + struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; @@ -1808,7 +1829,8 @@ struct ib_device { struct scatterlist *sg, int sg_nents); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, - enum ib_mw_type type); + enum ib_mw_type type, + struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, @@ -1846,6 +1868,16 @@ struct ib_device { int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); + void (*drain_rq)(struct ib_qp *qp); + void (*drain_sq)(struct ib_qp *qp); + int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port, + int state); + int (*get_vf_config)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *ivf); + int (*get_vf_stats)(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); + int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid, + int type); struct ib_dma_mapping_ops *dma_ops; @@ -2289,6 +2321,15 @@ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid, struct ib_gid_attr *attr); +int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, + int state); +int ib_get_vf_config(struct ib_device *device, int vf, u8 port, + struct ifla_vf_info *info); +int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, + struct ifla_vf_stats *stats); +int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, + int type); + int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); @@ -3094,4 +3135,7 @@ int ib_sg_to_pages(struct ib_mr *mr, int sg_nents, int (*set_page)(struct ib_mr *, u64)); +void ib_drain_rq(struct ib_qp *qp); +void ib_drain_sq(struct ib_qp *qp); +void ib_drain_qp(struct ib_qp *qp); #endif /* IB_VERBS_H */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 036bd2772662..6d0065c322b7 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -83,8 +83,10 @@ struct iw_cm_id { iw_cm_handler cm_handler; /* client callback function */ void *context; /* client cb context */ struct ib_device *device; - struct sockaddr_storage local_addr; + struct sockaddr_storage local_addr; /* local addr */ struct sockaddr_storage remote_addr; + struct sockaddr_storage m_local_addr; /* nmapped local addr */ + struct sockaddr_storage m_remote_addr; /* nmapped rem addr */ void *provider_data; /* provider private data */ iw_event_handler event_handler; /* cb for provider events */ @@ -92,6 +94,7 @@ struct iw_cm_id { void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); u8 tos; + bool mapped; }; struct iw_cm_conn_param { @@ -123,6 +126,7 @@ struct iw_cm_verbs { int backlog); int (*destroy_listen)(struct iw_cm_id *cm_id); + char ifname[IFNAMSIZ]; }; /** diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index a0fa975cd1c1..2b95c2c336eb 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -97,7 +97,7 @@ #define OPA_LINKDOWN_REASON_WIDTH_POLICY 41 /* 42-48 reserved */ #define OPA_LINKDOWN_REASON_DISCONNECTED 49 -#define OPA_LINKDOWN_REASONLOCAL_MEDIA_NOT_INSTALLED 50 +#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50 #define OPA_LINKDOWN_REASON_NOT_INSTALLED 51 #define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52 /* 53 reserved */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h new file mode 100644 index 000000000000..a8696551abb1 --- /dev/null +++ b/include/rdma/rdma_vt.h @@ -0,0 +1,481 @@ +#ifndef DEF_RDMA_VT_H +#define DEF_RDMA_VT_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Structure that low level drivers will populate in order to register with the + * rdmavt layer. + */ + +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdmavt_mr.h> +#include <rdma/rdmavt_qp.h> + +#define RVT_MAX_PKEY_VALUES 16 + +struct rvt_ibport { + struct rvt_qp __rcu *qp[2]; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + u64 tid; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 mkey_lease_period; + u16 sm_lid; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + + /* + * Driver is expected to keep these up to date. These + * counters are informational only and not required to be + * completely accurate. + */ + u64 n_rc_resends; + u64 n_seq_naks; + u64 n_rdma_seq; + u64 n_rnr_naks; + u64 n_other_naks; + u64 n_loop_pkts; + u64 n_pkt_drops; + u64 n_vl15_dropped; + u64 n_rc_timeouts; + u64 n_dmawait; + u64 n_unaligned; + u64 n_rc_dupreq; + u64 n_rc_seqnak; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + + /* Hot-path per CPU counters to avoid cacheline trading to update */ + u64 z_rc_acks; + u64 z_rc_qacks; + u64 z_rc_delayed_comp; + u64 __percpu *rc_acks; + u64 __percpu *rc_qacks; + u64 __percpu *rc_delayed_comp; + + void *priv; /* driver private data */ + + /* + * The pkey table is allocated and maintained by the driver. Drivers + * need to have access to this before registering with rdmav. However + * rdmavt will need access to it so drivers need to proviee this during + * the attach port API call. + */ + u16 *pkey_table; + + struct rvt_ah *sm_ah; +}; + +#define RVT_CQN_MAX 16 /* maximum length of cq name */ + +/* + * Things that are driver specific, module parameters in hfi1 and qib + */ +struct rvt_driver_params { + struct ib_device_attr props; + + /* + * Anything driver specific that is not covered by props + * For instance special module parameters. Goes here. + */ + unsigned int lkey_table_size; + unsigned int qp_table_size; + int qpn_start; + int qpn_inc; + int qpn_res_start; + int qpn_res_end; + int nports; + int npkeys; + u8 qos_shift; + char cq_name[RVT_CQN_MAX]; + int node; + int max_rdma_atomic; + int psn_mask; + int psn_shift; + int psn_modify_mask; + u32 core_cap_flags; + u32 max_mad_size; +}; + +/* Protection domain */ +struct rvt_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address handle */ +struct rvt_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; + u8 vl; + u8 log_pmtu; +}; + +struct rvt_dev_info; +struct rvt_swqe; +struct rvt_driver_provided { + /* + * Which functions are required depends on which verbs rdmavt is + * providing and which verbs the driver is overriding. See + * check_support() for details. + */ + + /* Passed to ib core registration. Callback to create syfs files */ + int (*port_callback)(struct ib_device *, u8, struct kobject *); + + /* + * Returns a string to represent the device for which is being + * registered. This is primarily used for error and debug messages on + * the console. + */ + const char * (*get_card_name)(struct rvt_dev_info *rdi); + + /* + * Returns a pointer to the undelying hardware's PCI device. This is + * used to display information as to what hardware is being referenced + * in an output message + */ + struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi); + + /* + * Allocate a private queue pair data structure for driver specific + * information which is opaque to rdmavt. + */ + void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp); + + /* + * Free the driver's private qp structure. + */ + void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp); + + /* + * Inform the driver the particular qp in quesiton has been reset so + * that it can clean up anything it needs to. + */ + void (*notify_qp_reset)(struct rvt_qp *qp); + + /* + * Give the driver a notice that there is send work to do. It is up to + * the driver to generally push the packets out, this just queues the + * work with the driver. There are two variants here. The no_lock + * version requires the s_lock not to be held. The other assumes the + * s_lock is held. + */ + void (*schedule_send)(struct rvt_qp *qp); + void (*schedule_send_no_lock)(struct rvt_qp *qp); + + /* + * Sometimes rdmavt needs to kick the driver's send progress. That is + * done by this call back. + */ + void (*do_send)(struct rvt_qp *qp); + + /* + * Get a path mtu from the driver based on qp attributes. + */ + int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); + + /* + * Notify driver that it needs to flush any outstanding IO requests that + * are waiting on a qp. + */ + void (*flush_qp_waiters)(struct rvt_qp *qp); + + /* + * Notify driver to stop its queue of sending packets. Nothing else + * should be posted to the queue pair after this has been called. + */ + void (*stop_send_queue)(struct rvt_qp *qp); + + /* + * Have the drivr drain any in progress operations + */ + void (*quiesce_qp)(struct rvt_qp *qp); + + /* + * Inform the driver a qp has went to error state. + */ + void (*notify_error_qp)(struct rvt_qp *qp); + + /* + * Get an MTU for a qp. + */ + u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp, + u32 pmtu); + /* + * Convert an mtu to a path mtu + */ + int (*mtu_to_path_mtu)(u32 mtu); + + /* + * Get the guid of a port in big endian byte order + */ + int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid); + + /* + * Query driver for the state of the port. + */ + int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props); + + /* + * Tell driver to shutdown a port + */ + int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num); + + /* Tell driver to send a trap for changed port capabilities */ + void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num); + + /* + * The following functions can be safely ignored completely. Any use of + * these is checked for NULL before blindly calling. Rdmavt should also + * be functional if drivers omit these. + */ + + /* Called to inform the driver that all qps should now be freed. */ + unsigned (*free_all_qps)(struct rvt_dev_info *rdi); + + /* Driver specific AH validation */ + int (*check_ah)(struct ib_device *, struct ib_ah_attr *); + + /* Inform the driver a new AH has been created */ + void (*notify_new_ah)(struct ib_device *, struct ib_ah_attr *, + struct rvt_ah *); + + /* Let the driver pick the next queue pair number*/ + int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, + enum ib_qp_type type, u8 port_num, gfp_t gfp); + + /* Determine if its safe or allowed to modify the qp */ + int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific QP modification/notification-of */ + void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + + /* Driver specific work request checking */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + + /* Notify driver a mad agent has been created */ + void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + + /* Notify driver a mad agent has been removed */ + void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx); + +}; + +struct rvt_dev_info { + struct ib_device ibdev; /* Keep this first. Nothing above here */ + + /* + * Prior to calling for registration the driver will be responsible for + * allocating space for this structure. + * + * The driver will also be responsible for filling in certain members of + * dparms.props. The driver needs to fill in dparms exactly as it would + * want values reported to a ULP. This will be returned to the caller + * in rdmavt's device. The driver should also therefore refrain from + * modifying this directly after registration with rdmavt. + */ + + /* Driver specific properties */ + struct rvt_driver_params dparms; + + struct rvt_mregion __rcu *dma_mr; + struct rvt_lkey_table lkey_table; + + /* Driver specific helper functions */ + struct rvt_driver_provided driver_f; + + /* Internal use */ + int n_pds_allocated; + spinlock_t n_pds_lock; /* Protect pd allocated count */ + + int n_ahs_allocated; + spinlock_t n_ahs_lock; /* Protect ah allocated count */ + + u32 n_srqs_allocated; + spinlock_t n_srqs_lock; /* Protect srqs allocated count */ + + int flags; + struct rvt_ibport **ports; + + /* QP */ + struct rvt_qp_ibdev *qp_dev; + u32 n_qps_allocated; /* number of QPs allocated for device */ + u32 n_rc_qps; /* number of RC QPs allocated for device */ + u32 busy_jiffies; /* timeout scaling based on RC QP count */ + spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */ + + /* memory maps */ + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + spinlock_t pending_lock; /* protect pending mmap list */ + + /* CQ */ + struct kthread_worker *worker; /* per device cq worker */ + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; /* protect count of in use cqs */ + + /* Multicast */ + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + +}; + +static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct rvt_pd, ibpd); +} + +static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah) +{ + return container_of(ibah, struct rvt_ah, ibah); +} + +static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev) +{ + return container_of(ibdev, struct rvt_dev_info, ibdev); +} + +static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct rvt_srq, ibsrq); +} + +static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct rvt_qp, ibqp); +} + +static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) +{ + /* + * All ports have same number of pkeys. + */ + return rdi->dparms.npkeys; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, + int port_index, + unsigned index) +{ + if (index >= rvt_get_npkeys(rdi)) + return 0; + else + return rdi->ports[port_index]->pkey_table[index]; +} + +/** + * rvt_lookup_qpn - return the QP with the given QPN + * @ibp: the ibport + * @qpn: the QP number to look up + * + * The caller must hold the rcu_read_lock(), and keep the lock until + * the returned qp is no longer in use. + */ +/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */ +static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, + struct rvt_ibport *rvp, + u32 qpn) __must_hold(RCU) +{ + struct rvt_qp *qp = NULL; + + if (unlikely(qpn <= 1)) { + qp = rcu_dereference(rvp->qp[qpn]); + } else { + u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits); + + for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) + break; + } + return qp; +} + +struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); +int rvt_register_device(struct rvt_dev_info *rvd); +void rvt_unregister_device(struct rvt_dev_info *rvd); +int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); +int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, + int port_index, u16 *pkey_table); +int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); +int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, + struct rvt_sge *isge, struct ib_sge *sge, int acc); +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid); + +#endif /* DEF_RDMA_VT_H */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h new file mode 100644 index 000000000000..51fd00b243d0 --- /dev/null +++ b/include/rdma/rdmavt_cq.h @@ -0,0 +1,99 @@ +#ifndef DEF_RDMAVT_INCCQ_H +#define DEF_RDMAVT_INCCQ_H + +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2016 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/kthread.h> +#include <rdma/ib_user_verbs.h> + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct rvt_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct rvt_dev_info *rdi; + struct rvt_cq_wc *queue; + struct rvt_mmap_info *ip; +}; + +static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct rvt_cq, ibcq); +} + +void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); + +#endif /* DEF_RDMAVT_INCCQH */ diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h new file mode 100644 index 000000000000..5edffdca8c53 --- /dev/null +++ b/include/rdma/rdmavt_mr.h @@ -0,0 +1,139 @@ +#ifndef DEF_RDMAVT_INCMR_H +#define DEF_RDMAVT_INCMR_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once + * drivers no longer need access to the MR directly. + */ + +/* + * A segment is a linear region of low physical memory. + * Used by the verbs layer. + */ +struct rvt_seg { + void *vaddr; + size_t length; +}; + +/* The number of rvt_segs that fit in a page. */ +#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg)) + +struct rvt_segarray { + struct rvt_seg segs[RVT_SEGSZ]; +}; + +struct rvt_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of rvt_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct completion comp; /* complete when refcount goes to zero */ + atomic_t refcount; + struct rvt_segarray *map[0]; /* the segments */ +}; + +#define RVT_MAX_LKEY_TABLE_BITS 23 + +struct rvt_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct rvt_mregion __rcu **table; +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct rvt_sge { + struct rvt_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +struct rvt_sge_state { + struct rvt_sge *sg_list; /* next SGE to be used if any */ + struct rvt_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +static inline void rvt_put_mr(struct rvt_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + complete(&mr->comp); +} + +static inline void rvt_get_mr(struct rvt_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +static inline void rvt_put_ss(struct rvt_sge_state *ss) +{ + while (ss->num_sge) { + rvt_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + +#endif /* DEF_RDMAVT_INCMRH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h new file mode 100644 index 000000000000..497e59065c2c --- /dev/null +++ b/include/rdma/rdmavt_qp.h @@ -0,0 +1,446 @@ +#ifndef DEF_RDMAVT_INCQP_H +#define DEF_RDMAVT_INCQP_H + +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/rdma_vt.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_verbs.h> +/* + * Atomic bit definitions for r_aflags. + */ +#define RVT_R_WRID_VALID 0 +#define RVT_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define RVT_R_REUSE_SGE 0x01 +#define RVT_R_RDMAR_SEQ 0x02 +#define RVT_R_RSP_NAK 0x04 +#define RVT_R_RSP_SEND 0x08 +#define RVT_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * RVT_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * RVT_S_BUSY - send tasklet is processing the QP + * RVT_S_TIMER - the RC retry timer is active + * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * RVT_S_WAIT_RNR - waiting for RNR timeout + * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * RVT_S_WAIT_PIO - waiting for a send buffer to be available + * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets + * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available + * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * RVT_S_WAIT_KMEM - waiting for kernel memory to be available + * RVT_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK + * RVT_S_ECN - a BECN was queued to the send engine + */ +#define RVT_S_SIGNAL_REQ_WR 0x0001 +#define RVT_S_BUSY 0x0002 +#define RVT_S_TIMER 0x0004 +#define RVT_S_RESP_PENDING 0x0008 +#define RVT_S_ACK_PENDING 0x0010 +#define RVT_S_WAIT_FENCE 0x0020 +#define RVT_S_WAIT_RDMAR 0x0040 +#define RVT_S_WAIT_RNR 0x0080 +#define RVT_S_WAIT_SSN_CREDIT 0x0100 +#define RVT_S_WAIT_DMA 0x0200 +#define RVT_S_WAIT_PIO 0x0400 +#define RVT_S_WAIT_PIO_DRAIN 0x0800 +#define RVT_S_WAIT_TX 0x1000 +#define RVT_S_WAIT_DMA_DESC 0x2000 +#define RVT_S_WAIT_KMEM 0x4000 +#define RVT_S_WAIT_PSN 0x8000 +#define RVT_S_WAIT_ACK 0x10000 +#define RVT_S_SEND_ONE 0x20000 +#define RVT_S_UNLIMITED_CREDIT 0x40000 +#define RVT_S_AHG_VALID 0x80000 +#define RVT_S_AHG_CLEAR 0x100000 +#define RVT_S_ECN 0x200000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define RVT_S_ANY_WAIT_IO (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \ + RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \ + RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \ + RVT_S_WAIT_PSN | RVT_S_WAIT_ACK) + +#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) + +/* Number of bits to pay attention to in the opcode for checking qp type */ +#define RVT_OPCODE_QP_MASK 0xE0 + +/* Flags for checking QP state (see ib_rvt_state_ops[]) */ +#define RVT_POST_SEND_OK 0x01 +#define RVT_POST_RECV_OK 0x02 +#define RVT_PROCESS_RECV_OK 0x04 +#define RVT_PROCESS_SEND_OK 0x08 +#define RVT_PROCESS_NEXT_SEND_OK 0x10 +#define RVT_FLUSH_SEND 0x20 +#define RVT_FLUSH_RECV 0x40 +#define RVT_PROCESS_OR_FLUSH_SEND \ + (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct rvt_swqe { + union { + struct ib_send_wr wr; /* don't use wr.sg_list */ + struct ib_ud_wr ud_wr; + struct ib_reg_wr reg_wr; + struct ib_rdma_wr rdma_wr; + struct ib_atomic_wr atomic_wr; + }; + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct rvt_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct rvt_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct rvt_rwqe wq[0]; +}; + +struct rvt_rq { + struct rvt_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; +}; + +/* + * This structure is used by rvt_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct rvt_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +#define RVT_MAX_RDMA_ATOMIC 16 + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct rvt_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + u32 lpsn; + union { + struct rvt_sge rdma_sge; + u64 atomic_data; + }; +}; + +#define RC_QP_SCALING_INTERVAL 5 + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct rvt_qp { + struct ib_qp ibqp; + void *priv; /* Driver private data */ + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct rvt_qp __rcu *next; /* link list for QPN hash table */ + struct rvt_swqe *s_wq; /* send work queue */ + struct rvt_mmap_info *ip; + + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + int srate_mbps; /* s_srate (below) converted to Mbit/s */ + pid_t pid; /* pid for user mode QPs */ + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_ahgpsn; /* set to the psn in the copy of the header */ + + u16 pmtu; /* decoded from path_mtu */ + u8 log_pmtu; /* shift for pmtu */ + u8 state; /* QP state */ + u8 allowed_ops; /* high order bits of allowed opcodes */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + struct rvt_ack_entry s_ack_queue[RVT_MAX_RDMA_ATOMIC + 1] + ____cacheline_aligned_in_smp; + struct rvt_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + u32 r_psn; /* expected rcv packet sequence number */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waiting to respond */ + + struct rvt_sge_state r_sge; /* current receive data */ + struct rvt_rq r_rq; /* receive work queue */ + + /* post send line */ + spinlock_t s_hlock ____cacheline_aligned_in_smp; + u32 s_head; /* new entries added here */ + u32 s_next_psn; /* PSN for next request */ + u32 s_avail; /* number of entries avail */ + u32 s_ssn; /* SSN of tail entry */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + u32 s_flags; + struct rvt_sge_state *s_cur_sge; + struct rvt_swqe *s_wqe; + struct rvt_sge_state s_sge; /* current send request data */ + struct rvt_mregion *s_rdma_mr; + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + s8 s_ahgidx; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct rvt_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + + /* + * This sge list MUST be last. Do not add anything below here. + */ + struct rvt_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +struct rvt_srq { + struct ib_srq ibsrq; + struct rvt_rq rq; + struct rvt_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +#define RVT_QPN_MAX BIT(24) +#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) +#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) +#define RVT_QPN_MASK 0xFFFFFF + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct rvt_qpn_map { + void *page; +}; + +struct rvt_qpn_table { + spinlock_t lock; /* protect changes to the qp table */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u8 incr; + /* bit map of free QP numbers other than 0/1 */ + struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES]; +}; + +struct rvt_qp_ibdev { + u32 qp_table_size; + u32 qp_table_bits; + struct rvt_qp __rcu **qp_table; + spinlock_t qpt_lock; /* qptable lock */ + struct rvt_qpn_table qpn_table; +}; + +/* + * There is one struct rvt_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct rvt_mcast_qp. + */ +struct rvt_mcast_qp { + struct list_head list; + struct rvt_qp *qp; +}; + +struct rvt_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* + * Since struct rvt_swqe is not a fixed size, we can't simply index into + * struct rvt_qp.s_wq. This function does the array index computation. + */ +static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, + unsigned n) +{ + return (struct rvt_swqe *)((char *)qp->s_wq + + (sizeof(struct rvt_swqe) + + qp->s_max_sge * + sizeof(struct rvt_sge)) * n); +} + +/* + * Since struct rvt_rwqe is not a fixed size, we can't simply index into + * struct rvt_rwq.wq. This function does the array index computation. + */ +static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) +{ + return (struct rvt_rwqe *) + ((char *)rq->wq->wq + + (sizeof(struct rvt_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +extern const int ib_rvt_state_ops[]; + +struct rvt_dev_info; +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); + +#endif /* DEF_RDMAVT_INCQP_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a30b78090594..1d01e8a4e5dd 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -556,6 +556,8 @@ enum { */ IFLA_VF_STATS, /* network device statistics */ IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ __IFLA_VF_MAX, }; @@ -588,6 +590,11 @@ struct ifla_vf_spoofchk { __u32 setting; }; +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + enum { IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ IFLA_VF_LINK_STATE_ENABLE, /* link always up */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 288694e422fb..a533cecab14f 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -66,7 +66,7 @@ * The major version changes when data structures change in an incompatible * way. The driver must be the same for initialization to succeed. */ -#define HFI1_USER_SWMAJOR 4 +#define HFI1_USER_SWMAJOR 5 /* * Minor version differences are always compatible @@ -93,7 +93,7 @@ #define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs*/ #define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */ #define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */ -#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Enable Expected TID caching */ +#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Disable Expected TID caching */ #define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */ #define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */ #define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Enable ctxt integrity checks */ @@ -134,6 +134,7 @@ #define HFI1_CMD_ACK_EVENT 10 /* ack & clear user status bits */ #define HFI1_CMD_SET_PKEY 11 /* set context's pkey */ #define HFI1_CMD_CTXT_RESET 12 /* reset context's HW send context */ +#define HFI1_CMD_TID_INVAL_READ 13 /* read TID cache invalidations */ /* separate EPROM commands from normal PSM commands */ #define HFI1_CMD_EP_INFO 64 /* read EPROM device ID */ #define HFI1_CMD_EP_ERASE_CHIP 65 /* erase whole EPROM */ @@ -147,13 +148,15 @@ #define _HFI1_EVENT_LID_CHANGE_BIT 2 #define _HFI1_EVENT_LMC_CHANGE_BIT 3 #define _HFI1_EVENT_SL2VL_CHANGE_BIT 4 -#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT +#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5 +#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT #define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT) #define HFI1_EVENT_LINKDOWN (1UL << _HFI1_EVENT_LINKDOWN_BIT) #define HFI1_EVENT_LID_CHANGE (1UL << _HFI1_EVENT_LID_CHANGE_BIT) #define HFI1_EVENT_LMC_CHANGE (1UL << _HFI1_EVENT_LMC_CHANGE_BIT) #define HFI1_EVENT_SL2VL_CHANGE (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT) +#define HFI1_EVENT_TID_MMU_NOTIFY (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT) /* * These are the status bits readable (in ASCII form, 64bit value) @@ -238,11 +241,6 @@ struct hfi1_tid_info { __u32 tidcnt; /* length of transfer buffer programmed by this request */ __u32 length; - /* - * pointer to bitmap of TIDs used for this call; - * checked for being large enough at open - */ - __u64 tidmap; }; struct hfi1_cmd { diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4fa418de4075..6e373d151cad 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -5,8 +5,8 @@ enum { RDMA_NL_RDMA_CM = 1, - RDMA_NL_NES, - RDMA_NL_C4IW, + RDMA_NL_IWCM, + RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ RDMA_NL_I40IW, RDMA_NL_NUM_CLIENTS diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 52b4a2f993f2..1852e383afd6 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -109,14 +109,13 @@ struct p9_trans_rdma { /** * p9_rdma_context - Keeps track of in-process WR * - * @wc_op: The original WR op for when the CQE completes in error. * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { - enum ib_wc_opcode wc_op; + struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; @@ -284,9 +283,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) } static void -handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; @@ -295,7 +297,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); - if (status != IB_WC_SUCCESS) + if (wc->status != IB_WC_SUCCESS) goto err_out; err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); @@ -316,21 +318,32 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, req->rc = c->rc; p9_client_cb(client, req, REQ_STATUS_RCVD); + out: + up(&rdma->rq_sem); + kfree(c); return; err_out: - p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status); + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; + goto out; } static void -handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size, DMA_TO_DEVICE); + up(&rdma->sq_sem); + kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) @@ -339,42 +352,6 @@ static void qp_event_handler(struct ib_event *event, void *context) event->event, context); } -static void cq_comp_handler(struct ib_cq *cq, void *cq_context) -{ - struct p9_client *client = cq_context; - struct p9_trans_rdma *rdma = client->trans; - int ret; - struct ib_wc wc; - - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; - - switch (c->wc_op) { - case IB_WC_RECV: - handle_recv(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->rq_sem); - break; - - case IB_WC_SEND: - handle_send(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->sq_sem); - break; - - default: - pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n", - c->wc_op, wc.opcode, wc.status); - break; - } - kfree(c); - } -} - -static void cq_event_handler(struct ib_event *e, void *v) -{ - p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); -} - static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) @@ -387,7 +364,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) - ib_destroy_cq(rdma->cq); + ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); @@ -408,13 +385,14 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; + c->cqe.done = recv_done; + sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_RECV; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, &bad_wr); @@ -499,13 +477,14 @@ dont_need_post_recv: goto send_error; } + c->cqe.done = send_done; + sge.addr = c->busa; sge.length = c->req->tc->size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_SEND; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; @@ -642,7 +621,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); @@ -695,13 +673,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Completion Queue */ - cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; - rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, - cq_event_handler, client, - &cq_attr); + rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e854f916..4b6f3db9f8af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1387,6 +1387,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, + [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, + [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { @@ -1534,6 +1536,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } +static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, + int guid_type) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); +} + +static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) +{ + if (dev->type != ARPHRD_INFINIBAND) + return -EOPNOTSUPP; + + return handle_infiniband_guid(dev, ivt, guid_type); +} + static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1636,6 +1654,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } + if (tb[IFLA_VF_IB_NODE_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); + } + + if (tb[IFLA_VF_IB_PORT_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); + } + return err; } |