diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 17:38:19 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 17:38:19 +0300 |
commit | da19a102ce87bf3e0a7fe277a659d1fc35330d6d (patch) | |
tree | a6c1d40ef544e812b31f4b5f497c20d449d45ec3 /drivers/infiniband/sw | |
parent | e5f6d9afa3415104e402cd69288bb03f7165eeba (diff) | |
parent | a60109dc9a954ef9eddba6577e2d2e9e7952e487 (diff) | |
download | linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.tar.xz |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a smaller cycle with many of the commits being smallish
code fixes and improvements across the drivers.
- Driver updates for bnxt_re, cxgb4, hfi1, hns, mlx5, nes, qedr, and
rxe
- Memory window support in hns
- mlx5 user API 'flow mutate/steering' allows accessing the full
packet mangling and matching machinery from user space
- Support inter-working with verbs API calls in the 'devx' mlx5 user
API, and provide options to use devx with less privilege
- Modernize the use of sysfs and the device interface to use attribute
groups and cdev properly for uverbs, and clean up some of the core
code's device list management
- More progress on net namespaces for RDMA devices
- Consolidate driver BAR mmapping support into core code helpers and
rework how RDMA holds pointers to mm_struct for get_user_pages
cases
- First pass to use 'dev_name' instead of ib_device->name
- Device renaming for RDMA devices"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (242 commits)
IB/mlx5: Add support for extended atomic operations
RDMA/core: Fix comment for hw stats init for port == 0
RDMA/core: Refactor ib_register_device() function
RDMA/core: Fix unwinding flow in case of error to register device
ib_srp: Remove WARN_ON in srp_terminate_io()
IB/mlx5: Allow scatter to CQE without global signaled WRs
IB/mlx5: Verify that driver supports user flags
IB/mlx5: Support scatter to CQE for DC transport type
RDMA/drivers: Use core provided API for registering device attributes
RDMA/core: Allow existing drivers to set one sysfs group per device
IB/rxe: Remove unnecessary enum values
RDMA/umad: Use kernel API to allocate umad indexes
RDMA/uverbs: Use kernel API to allocate uverbs indexes
RDMA/core: Increase total number of RDMA ports across all devices
IB/mlx4: Add port and TID to MAD debug print
IB/mlx4: Enable debug print of SMPs
RDMA/core: Rename ports_parent to ports_kobj
RDMA/core: Do not expose unsupported counters
IB/mlx4: Refer to the device kobject instead of ports_parent
RDMA/nldev: Allow IB device rename through RDMA netlink
...
Diffstat (limited to 'drivers/infiniband/sw')
22 files changed, 874 insertions, 168 deletions
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 98e798007f75..7df896a18d38 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" - depends on 64BIT && ARCH_DMA_ADDR_T_64BIT + depends on X86_64 && ARCH_DMA_ADDR_T_64BIT depends on PCI select DMA_VIRT_OPS ---help--- diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 5ce403c6cddb..1735deb1a9d4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -118,6 +118,187 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { }; EXPORT_SYMBOL(ib_rvt_state_ops); +/* platform specific: return the last level cache (llc) size, in KiB */ +static int rvt_wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. 
+ */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +void rvt_wss_exit(struct rvt_dev_info *rdi) +{ + struct rvt_wss *wss = rdi->wss; + + if (!wss) + return; + + /* coded to handle partially initialized and repeat callers */ + kfree(wss->entries); + wss->entries = NULL; + kfree(rdi->wss); + rdi->wss = NULL; +} + +/** + * rvt_wss_init - Init wss data structures + * + * Return: 0 on success + */ +int rvt_wss_init(struct rvt_dev_info *rdi) +{ + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + unsigned int wss_threshold = rdi->dparms.wss_threshold; + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; + long llc_size; + long llc_bits; + long table_size; + long table_bits; + struct rvt_wss *wss; + int node = rdi->dparms.node; + + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { + rdi->wss = NULL; + return 0; + } + + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); + if (!rdi->wss) + return -ENOMEM; + wss = rdi->wss; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. 
+ */ + llc_size = rvt_wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss->pages_mask = table_bits - 1; + wss->num_entries = table_bits / BITS_PER_LONG; + + wss->threshold = (llc_bits * wss_threshold) / 100; + if (wss->threshold == 0) + wss->threshold = 1; + + wss->clean_period = wss_clean_period; + atomic_set(&wss->clean_counter, wss_clean_period); + + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), + GFP_KERNEL, node); + if (!wss->entries) { + rvt_wss_exit(rdi); + return -ENOMEM; + } + + return 0; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(struct rvt_wss *wss) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss->clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. 
+ * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss->clean_counter, wss->clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss->clean_entry) - 1) + & (wss->num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss->entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss->total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(struct rvt_wss *wss, void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss->entries[entry])) + atomic_inc(&wss->total_count); + + wss_advance_clean_counter(wss); +} + +/* + * Is the working set larger than the threshold? 
+ */ +static inline bool wss_exceeds_threshold(struct rvt_wss *wss) +{ + return atomic_read(&wss->total_count) >= wss->threshold; +} + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { @@ -1164,11 +1345,8 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int lastwqe = 0; int mig = 0; int pmtu = 0; /* for gcc warning only */ - enum rdma_link_layer link; int opa_ah; - link = rdma_port_get_link_layer(ibqp->device, qp->port_num); - spin_lock_irq(&qp->r_lock); spin_lock(&qp->s_hlock); spin_lock(&qp->s_lock); @@ -1179,7 +1357,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, link)) + attr_mask)) goto inval; if (rdi->driver_f.check_modify_qp && @@ -1718,7 +1896,7 @@ static inline int rvt_qp_is_avail( */ static int rvt_post_one_wr(struct rvt_qp *qp, const struct ib_send_wr *wr, - int *call_send) + bool *call_send) { struct rvt_swqe *wqe; u32 next; @@ -1823,15 +2001,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.num_sge = j; } - /* general part of wqe valid - allow for driver checks */ - if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe); - if (ret < 0) - goto bail_inval_free; - if (ret) - *call_send = ret; - } - + /* + * Calculate and set SWQE PSN values prior to handing it off + * to the driver's check routine. This give the driver the + * opportunity to adjust PSN values based on internal checks. + */ log_pmtu = qp->log_pmtu; if (qp->ibqp.qp_type != IB_QPT_UC && qp->ibqp.qp_type != IB_QPT_RC) { @@ -1856,8 +2030,18 @@ static int rvt_post_one_wr(struct rvt_qp *qp, (wqe->length ? 
((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.setup_wqe) { + ret = rdi->driver_f.setup_wqe(qp, wqe, call_send); + if (ret < 0) + goto bail_inval_free_ref; + } + + if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) + qp->s_next_psn = wqe->lpsn + 1; + if (unlikely(reserved_op)) { wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; rvt_qp_wqe_reserve(qp, wqe); @@ -1871,6 +2055,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return 0; +bail_inval_free_ref: + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); bail_inval_free: /* release mr holds */ while (j) { @@ -1897,7 +2085,7 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); unsigned long flags = 0; - int call_send; + bool call_send; unsigned nreq = 0; int err = 0; @@ -1930,7 +2118,11 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, bail: spin_unlock_irqrestore(&qp->s_hlock, flags); if (nreq) { - if (call_send) + /* + * Only call do_send if there is exactly one packet, and the + * driver said it was ok. + */ + if (nreq == 1 && call_send) rdi->driver_f.do_send(qp); else rdi->driver_f.schedule_send_no_lock(qp); @@ -2465,3 +2657,454 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, rcu_read_unlock(); } EXPORT_SYMBOL(rvt_qp_iter); + +/* + * This should be called with s_lock held. 
+ */ +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + trace_rvt_qp_send_completion(qp, wqe, last); + if (++last >= qp->s_size) + last = 0; + trace_rvt_qp_send_completion(qp, wqe, last); + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + rdi->wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} +EXPORT_SYMBOL(rvt_send_complete); + +/** + * rvt_copy_sge - copy data to SGE memory + * @qp: associated QP + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_wss *wss = rdi->wss; + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. 
+ */ + wss_insert(wss, sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); + + cacheless_copy = wss_exceeds_threshold(wss); + } else { + wss_advance_clean_counter(wss); + } + } + + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} +EXPORT_SYMBOL(rvt_copy_sge); + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from rvt_do_send() to forward a WQE addressed to the same HFI + * Note that although we are single threaded due to the send engine, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +void rvt_ruc_loopback(struct rvt_qp *sqp) +{ + struct rvt_ibport *rvp = NULL; + struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + bool release; + int ret; + bool copy_last = false; + int local_ops = 0; + + rcu_read_lock(); + rvp = rdi->ports[sqp->port_num - 1]; + + /* + * Note that we check the responder QP state after + * checking the requester's state. 
+ */ + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. */ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + rvp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. 
+ */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = true; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: +send: + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = rvt_is_user_qp(qp); + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if 
(unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = false; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if 
(!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + rvp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + rvt_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + rvp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + rvt_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_ruc_loopback); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 264811fdc530..6d883972e0b8 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -66,4 +66,6 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int rvt_wss_init(struct rvt_dev_info *rdi); +void rvt_wss_exit(struct rvt_dev_info *rdi); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h 
b/drivers/infiniband/sw/rdmavt/trace_tx.h index 0ef25fc49f25..d5df352eadb1 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -153,6 +153,48 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + rvt_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); #endif /* __RVT_TRACE_TX_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 17e4abc067af..723d3daf2eba 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -774,6 +774,13 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) goto bail_no_mr; } + /* Memory Working Set Size */ + ret = rvt_wss_init(rdi); + if (ret) { + rvt_pr_err(rdi, "Error in WSS init.\n"); + goto bail_mr; + } + /* Completion queues */ spin_lock_init(&rdi->n_cqs_lock); @@ -828,10 +835,11 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ 
- ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), + rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_mr; + goto bail_wss; } rvt_create_mad_agents(rdi); @@ -839,6 +847,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; +bail_wss: + rvt_wss_exit(rdi); bail_mr: rvt_mr_exit(rdi); @@ -862,6 +872,7 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); + rvt_wss_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 10999fa69281..383e65c7bbc0 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -103,7 +103,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; - rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_ee = RXE_MAX_EE; rxe->attr.max_rdd = RXE_MAX_RDD; rxe->attr.max_mw = RXE_MAX_MW; @@ -128,9 +128,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe) /* initialize port attributes */ static int rxe_init_port_param(struct rxe_port *port) { - port->attr.state = RXE_PORT_STATE; - port->attr.max_mtu = RXE_PORT_MAX_MTU; - port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.state = IB_PORT_DOWN; + port->attr.max_mtu = IB_MTU_4096; + port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; @@ -147,8 +147,7 @@ static int rxe_init_port_param(struct rxe_port *port) port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; 
port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; - port->mtu_cap = - ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); return 0; @@ -300,7 +299,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ - mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 83311dd07019..ea089cb091ad 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -191,6 +191,7 @@ static inline void reset_retry_counters(struct rxe_qp *qp) { qp->comp.retry_cnt = qp->attr.retry_cnt; qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; } static inline enum comp_state check_psn(struct rxe_qp *qp, @@ -253,6 +254,17 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* read retries of partial data may restart from + * read response first or response only. 
+ */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + return COMPST_ERROR; } break; @@ -499,11 +511,11 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { - qp->comp.opcode = -1; - - if (pkt) { - if (psn_compare(pkt->psn, qp->comp.psn) >= 0) - qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + if (pkt && wqe->state == wqe_state_pending) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK; + qp->comp.opcode = -1; + } if (qp->req.wait_psn) { qp->req.wait_psn = 0; @@ -676,6 +688,20 @@ int rxe_completer(void *arg) goto exit; } + /* if we've started a retry, don't start another + * retry sequence, unless this is a timeout. + */ + if (qp->comp.started_retry && + !qp->comp.timeout_retry) { + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto done; + } + if (qp->comp.retry_cnt > 0) { if (qp->comp.retry_cnt != 7) qp->comp.retry_cnt--; @@ -692,6 +718,7 @@ int rxe_completer(void *arg) rxe_counter_inc(rxe, RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; + qp->comp.started_retry = 1; rxe_run_task(&qp->req.task, 1); } @@ -701,7 +728,7 @@ int rxe_completer(void *arg) skb = NULL; } - goto exit; + goto done; } else { rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 2ee4b08b00ea..a57276f2cb84 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -97,7 +97,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, context, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); if (err) { - kvfree(cq->queue->buf); + vfree(cq->queue->buf); kfree(cq->queue); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 87d14f7ef21b..afd53f57a62b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -144,8 +144,7 @@ void rxe_loopback(struct sk_buff *skb); int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc); +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc); enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num); const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); struct device *rxe_dma_device(struct rxe_dev *rxe); @@ -196,7 +195,7 @@ static inline int qp_mtu(struct rxe_qp *qp) if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) return qp->attr.path_mtu; else - return RXE_PORT_MAX_MTU; + return IB_MTU_4096; } static inline int rcv_wqe_size(int max_sge) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index dff605fdf60f..9d3916b93f23 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -573,33 +573,20 @@ struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, struct rxe_dev *rxe = to_rdev(pd->ibpd.device); int index = key >> 8; - if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { - mem = rxe_pool_get_index(&rxe->mr_pool, index); - if (!mem) - goto err1; - } else { - goto err1; + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + return NULL; + + if (unlikely((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key) || + mem->pd != pd || + 
(access && !(access & mem->access)) || + mem->state != RXE_MEM_STATE_VALID)) { + rxe_drop_ref(mem); + mem = NULL; } - if ((type == lookup_local && mem->lkey != key) || - (type == lookup_remote && mem->rkey != key)) - goto err2; - - if (mem->pd != pd) - goto err2; - - if (access && !(access & mem->access)) - goto err2; - - if (mem->state != RXE_MEM_STATE_VALID) - goto err2; - return mem; - -err2: - rxe_drop_ref(mem); -err1: - return NULL; } int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 8094cbaa54a9..40e82e0f6c2d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -72,7 +72,7 @@ struct rxe_dev *get_rxe_by_name(const char *name) spin_lock_bh(&dev_list_lock); list_for_each_entry(rxe, &rxe_dev_list, list) { - if (!strcmp(name, rxe->ib_dev.name)) { + if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) { found = rxe; break; } @@ -182,19 +182,11 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, #endif -static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, +static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { - const struct ib_gid_attr *attr; struct dst_entry *dst = NULL; - struct net_device *ndev; - - attr = rdma_get_gid_attr(&rxe->ib_dev, qp->attr.port_num, - av->grh.sgid_index); - if (IS_ERR(attr)) - return NULL; - ndev = attr->ndev; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -229,7 +221,6 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, sk_dst_set(qp->sk->sk, dst); } } - rdma_put_gid_attr(attr); return dst; } @@ -377,8 +368,8 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } -static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare4(struct rxe_pkt_info *pkt, 
struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; @@ -387,7 +378,7 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; @@ -396,8 +387,8 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); @@ -406,15 +397,15 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return 0; } -static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; @@ -423,8 +414,8 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, @@ -434,16 +425,15 @@ static int prepare6(struct rxe_dev *rxe, 
struct rxe_pkt_info *pkt, return 0; } -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc) +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) { int err = 0; struct rxe_av *av = rxe_get_av(pkt); if (av->network_type == RDMA_NETWORK_IPV4) - err = prepare4(rxe, pkt, skb, av); + err = prepare4(pkt, skb, av); else if (av->network_type == RDMA_NETWORK_IPV6) - err = prepare6(rxe, pkt, skb, av); + err = prepare6(pkt, skb, av); *crc = rxe_icrc_hdr(pkt, skb); @@ -501,11 +491,6 @@ void rxe_loopback(struct sk_buff *skb) rxe_rcv(skb); } -static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) -{ - return rxe->port.port_guid == av->grh.dgid.global.interface_id; -} - struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { @@ -625,7 +610,7 @@ void rxe_port_up(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_UP; rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); - pr_info("set %s active\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ @@ -638,7 +623,7 @@ void rxe_port_down(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; rxe_port_event(rxe, IB_EVENT_PORT_ERR); - pr_info("set %s down\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set down\n"); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 4555510d86c4..bdea899a58ac 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -90,7 +90,6 @@ enum rxe_device_param { RXE_MAX_RES_RD_ATOM = 0x3f000, RXE_MAX_QP_INIT_RD_ATOM = 128, RXE_MAX_EE_INIT_RD_ATOM = 0, - RXE_ATOMIC_CAP = 1, RXE_MAX_EE = 0, RXE_MAX_RDD = 0, RXE_MAX_MW = 0, @@ -139,9 +138,6 @@ enum rxe_device_param { /* default/initial rxe port parameters */ enum rxe_port_param { - RXE_PORT_STATE = IB_PORT_DOWN, - 
RXE_PORT_MAX_MTU = IB_MTU_4096, - RXE_PORT_ACTIVE_MTU = IB_MTU_256, RXE_PORT_GID_TBL_LEN = 1024, RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, RXE_PORT_MAX_MSG_SZ = 0x800000, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index b4a8acc7bb7d..36b53fb94a49 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -207,7 +207,7 @@ int rxe_pool_init( kref_init(&pool->ref_cnt); - spin_lock_init(&pool->pool_lock); + rwlock_init(&pool->pool_lock); if (rxe_type_info[type].flags & RXE_POOL_INDEX) { err = rxe_pool_init_index(pool, @@ -222,7 +222,7 @@ int rxe_pool_init( pool->key_size = rxe_type_info[type].key_size; } - pool->state = rxe_pool_valid; + pool->state = RXE_POOL_STATE_VALID; out: return err; @@ -232,7 +232,7 @@ static void rxe_pool_release(struct kref *kref) { struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); - pool->state = rxe_pool_invalid; + pool->state = RXE_POOL_STATE_INVALID; kfree(pool->table); } @@ -245,12 +245,12 @@ int rxe_pool_cleanup(struct rxe_pool *pool) { unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); - pool->state = rxe_pool_invalid; + write_lock_irqsave(&pool->pool_lock, flags); + pool->state = RXE_POOL_STATE_INVALID; if (atomic_read(&pool->num_elem) > 0) pr_warn("%s pool destroyed with unfree'd elem\n", pool_name(pool)); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); rxe_pool_put(pool); @@ -336,10 +336,10 @@ void rxe_add_key(void *arg, void *key) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); insert_key(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_key(void *arg) @@ -348,9 +348,9 @@ void rxe_drop_key(void *arg) 
struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_add_index(void *arg) @@ -359,10 +359,10 @@ void rxe_add_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); elem->index = alloc_index(pool); insert_index(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_index(void *arg) @@ -371,10 +371,10 @@ void rxe_drop_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); clear_bit(elem->index - pool->min_index, pool->table); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void *rxe_alloc(struct rxe_pool *pool) @@ -384,13 +384,13 @@ void *rxe_alloc(struct rxe_pool *pool) might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); - spin_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) { - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); return NULL; } kref_get(&pool->ref_cnt); - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); kref_get(&pool->rxe->ref_cnt); @@ -436,9 +436,9 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) struct rxe_pool_entry *elem = NULL; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if 
(pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; @@ -450,15 +450,14 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) node = node->rb_left; else if (elem->index < index) node = node->rb_right; - else + else { + kref_get(&elem->ref_cnt); break; + } } - if (node) - kref_get(&elem->ref_cnt); - out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? elem : NULL; } @@ -469,9 +468,9 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) int cmp; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if (pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; @@ -494,6 +493,6 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) kref_get(&elem->ref_cnt); out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? 
elem : NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 47df28e43acf..aa4ba307097b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -74,8 +74,8 @@ struct rxe_type_info { extern struct rxe_type_info rxe_type_info[]; enum rxe_pool_state { - rxe_pool_invalid, - rxe_pool_valid, + RXE_POOL_STATE_INVALID, + RXE_POOL_STATE_VALID, }; struct rxe_pool_entry { @@ -90,7 +90,7 @@ struct rxe_pool_entry { struct rxe_pool { struct rxe_dev *rxe; - spinlock_t pool_lock; /* pool spinlock */ + rwlock_t pool_lock; /* protects pool add/del/search */ size_t elem_size; struct kref ref_cnt; void (*cleanup)(struct rxe_pool_entry *obj); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c58452daffc7..b9710907dac2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -34,6 +34,7 @@ #include <linux/skbuff.h> #include <linux/delay.h> #include <linux/sched.h> +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" @@ -227,6 +228,16 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, return err; qp->sk->sk->sk_user_data = qp; + /* pick a source UDP port number for this QP based on + * the source QPN. this spreads traffic for different QPs + * across different NIC RX queues (while using a single + * flow for a given QP to maintain packet order). + * the port number must be in the Dynamic Ports range + * (0xc000 - 0xffff). 
+ */ + qp->src_port = RXE_ROCE_V2_SPORT + + (hash_32_generic(qp_num(qp), 14) & 0x3fff); + qp->sq.max_wr = init->cap.max_send_wr; qp->sq.max_sge = init->cap.max_send_sge; qp->sq.max_inline = init->cap.max_inline_data; @@ -247,7 +258,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, &qp->sq.queue->ip); if (err) { - kvfree(qp->sq.queue->buf); + vfree(qp->sq.queue->buf); kfree(qp->sq.queue); return err; } @@ -300,7 +311,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { - kvfree(qp->rq.queue->buf); + vfree(qp->rq.queue->buf); kfree(qp->rq.queue); return err; } @@ -408,8 +419,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_qp_state new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { pr_warn("invalid mask or state for qp\n"); goto err1; } diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index d30dbac24583..5c29a1bb575a 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -122,7 +122,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, set_bad_pkey_cntr(port); goto err1; } - } else if (qpn != 0) { + } else { if (unlikely(!pkey_match(pkey, port->pkey_tbl[qp->attr.pkey_index] ))) { @@ -134,7 +134,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && - qpn != 0 && pkt->mask) { + pkt->mask) { u32 qkey = (qpn == 1) ? 
GSI_QKEY : qp->attr.qkey; if (unlikely(deth_qkey(pkt) != qkey)) { diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8be27238a86e..6c361d70d7cd 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -73,9 +73,6 @@ static void req_retry(struct rxe_qp *qp) int npsn; int first = 1; - wqe = queue_head(qp->sq.queue); - npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; - qp->req.wqe_index = consumer_index(qp->sq.queue); qp->req.psn = qp->comp.psn; qp->req.opcode = -1; @@ -107,11 +104,17 @@ static void req_retry(struct rxe_qp *qp) if (first) { first = 0; - if (mask & WR_WRITE_OR_SEND_MASK) + if (mask & WR_WRITE_OR_SEND_MASK) { + npsn = (qp->comp.psn - wqe->first_psn) & + BTH_PSN_MASK; retry_first_write_send(qp, wqe, mask, npsn); + } - if (mask & WR_READ_MASK) + if (mask & WR_READ_MASK) { + npsn = (wqe->dma.length - wqe->dma.resid) / + qp->mtu; wqe->iova += npsn * qp->mtu; + } } wqe->state = wqe_state_posted; @@ -435,7 +438,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, if (pkt->mask & RXE_RETH_MASK) { reth_set_rkey(pkt, ibwr->wr.rdma.rkey); reth_set_va(pkt, wqe->iova); - reth_set_len(pkt, wqe->dma.length); + reth_set_len(pkt, wqe->dma.resid); } if (pkt->mask & RXE_IMMDT_MASK) @@ -476,7 +479,7 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, u32 *p; int err; - err = rxe_prepare(rxe, pkt, skb, &crc); + err = rxe_prepare(pkt, skb, &crc); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aa5833318372..c962160292f4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -637,7 +637,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, if (ack->mask & RXE_ATMACK_MASK) atmack_set_orig(ack, qp->resp.atomic_orig); - err = rxe_prepare(rxe, ack, skb, &crc); + err = rxe_prepare(ack, skb, &crc); if (err) { kfree_skb(skb); return NULL; @@ -682,6 
+682,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, rxe_advance_resp_resource(qp); res->type = RXE_READ_MASK; + res->replay = 0; res->read.va = qp->resp.va; res->read.va_org = qp->resp.va; @@ -752,7 +753,8 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_DONE; } else { qp->resp.res = NULL; - qp->resp.opcode = -1; + if (!res->replay) + qp->resp.opcode = -1; if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) qp->resp.psn = res->cur_psn; state = RESPST_CLEANUP; @@ -814,6 +816,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; @@ -1065,7 +1068,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states rc; - u32 prev_psn = (qp->resp.psn - 1) & BTH_PSN_MASK; + u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { @@ -1108,6 +1111,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, res->state = (pkt->psn == res->first_psn) ? rdatm_res_state_new : rdatm_res_state_replay; + res->replay = 1; /* Reset the resource, except length. */ res->read.va_org = iova; diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 0d6c04ba7fc3..c41a5fee81f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -129,13 +130,18 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, context, q->buf, q->buf_size, &q->ip); - if (err) + if (err) { + vfree(q->buf); + kfree(q); return err; + } if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, - sizeof(uresp->srq_num))) + sizeof(uresp->srq_num))) { + rxe_queue_cleanup(q); return -EFAULT; + } } return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c index d5ed7571128f..73a19f808e1b 100644 --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -105,7 +105,7 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp) } rxe_set_port_state(ndev); - pr_info("added %s to %s\n", rxe->ib_dev.name, intf); + dev_info(&rxe->ib_dev.dev, "added %s\n", intf); err: if (ndev) dev_put(ndev); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f5b1e0ad6142..9c19f2027511 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1148,18 +1148,21 @@ static ssize_t parent_show(struct device *device, static DEVICE_ATTR_RO(parent); -static struct device_attribute *rxe_dev_attributes[] = { - &dev_attr_parent, +static struct attribute *rxe_dev_attributes[] = { + &dev_attr_parent.attr, + NULL +}; + +static const struct attribute_group rxe_attr_group = { + .attrs = rxe_dev_attributes, }; int rxe_register_device(struct rxe_dev *rxe) { int err; - int i; struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; - strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->owner = THIS_MODULE; @@ -1260,26 +1263,16 @@ int rxe_register_device(struct rxe_dev *rxe) } rxe->tfm = tfm; + rdma_set_device_sysfs_group(dev, &rxe_attr_group); dev->driver_id = RDMA_DRIVER_RXE; - err = ib_register_device(dev, NULL); + err = ib_register_device(dev, "rxe%d", NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; } - for (i = 0; i < 
ARRAY_SIZE(rxe_dev_attributes); ++i) { - err = device_create_file(&dev->dev, rxe_dev_attributes[i]); - if (err) { - pr_warn("%s failed with error %d for attr number %d\n", - __func__, err, i); - goto err2; - } - } - return 0; -err2: - ib_unregister_device(dev); err1: crypto_free_shash(rxe->tfm); @@ -1288,12 +1281,8 @@ err1: int rxe_unregister_device(struct rxe_dev *rxe) { - int i; struct ib_device *dev = &rxe->ib_dev; - for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) - device_remove_file(&dev->dev, rxe_dev_attributes[i]); - ib_unregister_device(dev); return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index af1470d29391..82e670d6eeea 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -158,6 +158,7 @@ struct rxe_comp_info { int opcode; int timeout; int timeout_retry; + int started_retry; u32 retry_cnt; u32 rnr_retry; struct rxe_task task; @@ -171,6 +172,7 @@ enum rdatm_res_state { struct resp_res { int type; + int replay; u32 first_psn; u32 last_psn; u32 cur_psn; @@ -195,6 +197,7 @@ struct rxe_resp_info { enum rxe_qp_state state; u32 msn; u32 psn; + u32 ack_psn; int opcode; int drop_msg; int goto_error; @@ -248,6 +251,7 @@ struct rxe_qp { struct socket *sk; u32 dst_cookie; + u16 src_port; struct rxe_av pri_av; struct rxe_av alt_av; |