diff options
Diffstat (limited to 'drivers/infiniband/hw/hfi1')
34 files changed, 1362 insertions, 1474 deletions
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index f451ba912f47..ff790390c91a 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -8,12 +8,42 @@ # obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o -hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ - eprom.o exp_rcv.o file_ops.o firmware.o \ - init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ - uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ - verbs_txreq.o vnic_main.o vnic_sdma.o +hfi1-y := \ + affinity.o \ + chip.o \ + device.o \ + driver.o \ + efivar.o \ + eprom.o \ + exp_rcv.o \ + file_ops.o \ + firmware.o \ + init.o \ + intr.o \ + iowait.o \ + mad.o \ + mmu_rb.o \ + msix.o \ + pcie.o \ + pio.o \ + pio_copy.o \ + platform.o \ + qp.o \ + qsfp.o \ + rc.o \ + ruc.o \ + sdma.o \ + sysfs.o \ + trace.o \ + uc.o \ + ud.o \ + user_exp_rcv.o \ + user_pages.o \ + user_sdma.o \ + verbs.o \ + verbs_txreq.o \ + vnic_main.o \ + vnic_sdma.o ifdef CONFIG_DEBUG_FS hfi1-y += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index bedd5fba33b0..2baf38cc1e23 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -817,10 +817,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) set = &entry->def_intr; cpumask_set_cpu(cpu, &set->mask); cpumask_set_cpu(cpu, &set->used); - for (i = 0; i < dd->num_msix_entries; i++) { + for (i = 0; i < dd->msix_info.max_requested; i++) { struct hfi1_msix_entry *other_msix; - other_msix = &dd->msix_entries[i]; + other_msix = &dd->msix_info.msix_entries[i]; if (other_msix->type != IRQ_SDMA || other_msix == msix) continue; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e1668bcc2d13..9b20479dc710 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ 
-67,8 +67,6 @@ #include "debugfs.h" #include "fault.h" -#define NUM_IB_PORTS 1 - uint kdeth_qp; module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); @@ -1100,9 +1098,9 @@ struct err_reg_info { const char *desc; }; -#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) -#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) -#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START) /* * Helpers for building HFI and DC error interrupt table entries. Different @@ -8181,7 +8179,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) /** * is_rcv_urgent_int() - User receive context urgent IRQ handler * @dd: valid dd - * @source: logical IRQ source (ofse from IS_RCVURGENT_START) + * @source: logical IRQ source (offset from IS_RCVURGENT_START) * * RX block receive urgent interrupt. Source is < 160. * @@ -8231,7 +8229,7 @@ static const struct is_table is_table[] = { is_sdma_eng_err_name, is_sdma_eng_err_int }, { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, is_sendctxt_err_name, is_sendctxt_err_int }, -{ IS_SDMA_START, IS_SDMA_END, +{ IS_SDMA_START, IS_SDMA_IDLE_END, is_sdma_eng_name, is_sdma_eng_int }, { IS_VARIOUS_START, IS_VARIOUS_END, is_various_name, is_various_int }, @@ -8257,7 +8255,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) /* avoids a double compare by walking the table in-order */ for (entry = &is_table[0]; entry->is_name; entry++) { - if (source < entry->end) { + if (source <= entry->end) { trace_hfi1_interrupt(dd, entry, source); entry->is_int(dd, source - entry->start); return; @@ -8276,7 +8274,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) * context DATA IRQs are threaded and are not supported by this handler. 
* */ -static irqreturn_t general_interrupt(int irq, void *data) +irqreturn_t general_interrupt(int irq, void *data) { struct hfi1_devdata *dd = data; u64 regs[CCE_NUM_INT_CSRS]; @@ -8309,7 +8307,7 @@ static irqreturn_t general_interrupt(int irq, void *data) return handled; } -static irqreturn_t sdma_interrupt(int irq, void *data) +irqreturn_t sdma_interrupt(int irq, void *data) { struct sdma_engine *sde = data; struct hfi1_devdata *dd = sde->dd; @@ -8401,7 +8399,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) * invoked) is finished. The intent is to avoid extra interrupts while we * are processing packets anyway. */ -static irqreturn_t receive_context_interrupt(int irq, void *data) +irqreturn_t receive_context_interrupt(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; struct hfi1_devdata *dd = rcd->dd; @@ -8441,7 +8439,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data) * Receive packet thread handler. This expects to be invoked with the * receive interrupt still blocked. 
*/ -static irqreturn_t receive_context_thread(int irq, void *data) +irqreturn_t receive_context_thread(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; int present; @@ -9651,30 +9649,10 @@ void qsfp_event(struct work_struct *work) } } -static void init_qsfp_int(struct hfi1_devdata *dd) +void init_qsfp_int(struct hfi1_devdata *dd) { struct hfi1_pportdata *ppd = dd->pport; - u64 qsfp_mask, cce_int_mask; - const int qsfp1_int_smask = QSFP1_INT % 64; - const int qsfp2_int_smask = QSFP2_INT % 64; - - /* - * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 - * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, - * therefore just one of QSFP1_INT/QSFP2_INT can be used to find - * the index of the appropriate CSR in the CCEIntMask CSR array - */ - cce_int_mask = read_csr(dd, CCE_INT_MASK + - (8 * (QSFP1_INT / 64))); - if (dd->hfi1_id) { - cce_int_mask &= ~((u64)1 << qsfp1_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), - cce_int_mask); - } else { - cce_int_mask &= ~((u64)1 << qsfp2_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), - cce_int_mask); - } + u64 qsfp_mask; qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); /* Clear current status to avoid spurious interrupts */ @@ -9691,6 +9669,12 @@ static void init_qsfp_int(struct hfi1_devdata *dd) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, qsfp_mask); + + /* Enable the appropriate QSFP IRQ source */ + if (!dd->hfi1_id) + set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true); + else + set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true); } /* @@ -10577,12 +10561,29 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, } } -/* - * Verify if BCT for data VLs is non-zero. +/** + * data_vls_operational() - Verify if data VL BCT credits and MTU + * are both set. + * @ppd: pointer to hfi1_pportdata structure + * + * Return: true - Ok, false -otherwise. 
*/ static inline bool data_vls_operational(struct hfi1_pportdata *ppd) { - return !!ppd->actual_vls_operational; + int i; + u64 reg; + + if (!ppd->actual_vls_operational) + return false; + + for (i = 0; i < ppd->vls_supported; i++) { + reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i)); + if ((reg && !ppd->dd->vld[i].mtu) || + (!reg && ppd->dd->vld[i].mtu)) + return false; + } + + return true; } /* @@ -10695,7 +10696,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) if (!data_vls_operational(ppd)) { dd_dev_err(dd, - "%s: data VLs not operational\n", __func__); + "%s: Invalid data VL credits or mtu\n", + __func__); ret = -EINVAL; break; } @@ -11932,10 +11934,16 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; } - if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, true); rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; - if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + } + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, false); rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + } if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { @@ -11965,6 +11973,13 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_URGENT_ENB) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, true); + if (op & HFI1_RCVCTRL_URGENT_DIS) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, false); + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl); @@ -12963,63 +12978,71 @@ int 
hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) return ret; } +/* ========================================================================= */ + /** - * get_int_mask - get 64 bit int mask - * @dd - the devdata - * @i - the csr (relative to CCE_INT_MASK) + * read_mod_write() - Calculate the IRQ register index and set/clear the bits + * @dd: valid devdata + * @src: IRQ source to determine register index from + * @bits: the bits to set or clear + * @set: true == set the bits, false == clear the bits * - * Returns the mask with the urgent interrupt mask - * bit clear for kernel receive contexts. */ -static u64 get_int_mask(struct hfi1_devdata *dd, u32 i) +static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits, + bool set) { - u64 mask = U64_MAX; /* default to no change */ - - if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) { - int j = (i - (IS_RCVURGENT_START / 64)) * 64; - int k = !j ? IS_RCVURGENT_START % 64 : 0; + u64 reg; + u16 idx = src / BITS_PER_REGISTER; - if (j) - j -= IS_RCVURGENT_START % 64; - /* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */ - for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++) - /* convert to bit in mask and clear */ - mask &= ~BIT_ULL(k); - } - return mask; + spin_lock(&dd->irq_src_lock); + reg = read_csr(dd, CCE_INT_MASK + (8 * idx)); + if (set) + reg |= bits; + else + reg &= ~bits; + write_csr(dd, CCE_INT_MASK + (8 * idx), reg); + spin_unlock(&dd->irq_src_lock); } -/* ========================================================================= */ - -/* - * Enable/disable chip from delivering interrupts. +/** + * set_intr_bits() - Enable/disable a range (one or more) IRQ sources + * @dd: valid devdata + * @first: first IRQ source to set/clear + * @last: last IRQ source (inclusive) to set/clear + * @set: true == set the bits, false == clear the bits + * + * If first == last, set the exact source. 
*/ -void set_intr_state(struct hfi1_devdata *dd, u32 enable) +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) { - int i; + u64 bits = 0; + u64 bit; + u16 src; - /* - * In HFI, the mask needs to be 1 to allow interrupts. - */ - if (enable) { - /* enable all interrupts but urgent on kernel contexts */ - for (i = 0; i < CCE_NUM_INT_CSRS; i++) { - u64 mask = get_int_mask(dd, i); + if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES) + return -EINVAL; - write_csr(dd, CCE_INT_MASK + (8 * i), mask); - } + if (last < first) + return -ERANGE; - init_qsfp_int(dd); - } else { - for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + for (src = first; src <= last; src++) { + bit = src % BITS_PER_REGISTER; + /* wrapped to next register? */ + if (!bit && bits) { + read_mod_write(dd, src - 1, bits, set); + bits = 0; + } + bits |= BIT_ULL(bit); } + read_mod_write(dd, last, bits, set); + + return 0; } /* * Clear all interrupt sources on the chip. */ -static void clear_all_interrupts(struct hfi1_devdata *dd) +void clear_all_interrupts(struct hfi1_devdata *dd) { int i; @@ -13043,38 +13066,11 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); } -/** - * hfi1_clean_up_interrupts() - Free all IRQ resources - * @dd: valid device data data structure - * - * Free the MSIx and assoicated PCI resources, if they have been allocated. 
- */ -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) -{ - int i; - struct hfi1_msix_entry *me = dd->msix_entries; - - /* remove irqs - must happen before disabling/turning off */ - for (i = 0; i < dd->num_msix_entries; i++, me++) { - if (!me->arg) /* => no irq, no affinity */ - continue; - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, i, me->arg); - } - - /* clean structures */ - kfree(dd->msix_entries); - dd->msix_entries = NULL; - dd->num_msix_entries = 0; - - pci_free_irq_vectors(dd->pcidev); -} - /* * Remap the interrupt source from the general handler to the given MSI-X * interrupt. */ -static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) { u64 reg; int m, n; @@ -13098,8 +13094,7 @@ static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) write_csr(dd, CCE_INT_MAP + (8 * m), reg); } -static void remap_sdma_interrupts(struct hfi1_devdata *dd, - int engine, int msix_intr) +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr) { /* * SDMA engine interrupt sources grouped by type, rather than @@ -13108,204 +13103,16 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd, * SDMAProgress * SDMAIdle */ - remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); -} - -static int request_msix_irqs(struct hfi1_devdata *dd) -{ - int first_general, last_general; - int first_sdma, last_sdma; - int first_rx, last_rx; - int i, ret = 0; - - /* calculate the ranges we are going to use */ - first_general = 0; - last_general = first_general + 1; - first_sdma = last_general; - last_sdma = first_sdma + dd->num_sdma; - first_rx = last_sdma; - last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* VNIC MSIx interrupts get mapped when 
VNIC contexts are created */ - dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; - - /* - * Sanity check - the code expects all SDMA chip source - * interrupts to be in the same CSR, starting at bit 0. Verify - * that this is true by checking the bit location of the start. - */ - BUILD_BUG_ON(IS_SDMA_START % 64); - - for (i = 0; i < dd->num_msix_entries; i++) { - struct hfi1_msix_entry *me = &dd->msix_entries[i]; - const char *err_info; - irq_handler_t handler; - irq_handler_t thread = NULL; - void *arg = NULL; - int idx; - struct hfi1_ctxtdata *rcd = NULL; - struct sdma_engine *sde = NULL; - char name[MAX_NAME_SIZE]; - - /* obtain the arguments to pci_request_irq */ - if (first_general <= i && i < last_general) { - idx = i - first_general; - handler = general_interrupt; - arg = dd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d", dd->unit); - err_info = "general"; - me->type = IRQ_GENERAL; - } else if (first_sdma <= i && i < last_sdma) { - idx = i - first_sdma; - sde = &dd->per_sdma[idx]; - handler = sdma_interrupt; - arg = sde; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d sdma%d", dd->unit, idx); - err_info = "sdma"; - remap_sdma_interrupts(dd, idx, i); - me->type = IRQ_SDMA; - } else if (first_rx <= i && i < last_rx) { - idx = i - first_rx; - rcd = hfi1_rcd_get_by_index_safe(dd, idx); - if (rcd) { - /* - * Set the interrupt register and mask for this - * context's interrupt. 
- */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - handler = receive_context_interrupt; - thread = receive_context_thread; - arg = rcd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d kctxt%d", - dd->unit, idx); - err_info = "receive context"; - remap_intr(dd, IS_RCVAVAIL_START + idx, i); - me->type = IRQ_RCVCTXT; - rcd->msix_intr = i; - hfi1_rcd_put(rcd); - } - } else { - /* not in our expected range - complain, then - * ignore it - */ - dd_dev_err(dd, - "Unexpected extra MSI-X interrupt %d\n", i); - continue; - } - /* no argument, no interrupt */ - if (!arg) - continue; - /* make sure the name is terminated */ - name[sizeof(name) - 1] = 0; - me->irq = pci_irq_vector(dd->pcidev, i); - ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, - name); - if (ret) { - dd_dev_err(dd, - "unable to allocate %s interrupt, irq %d, index %d, err %d\n", - err_info, me->irq, idx, ret); - return ret; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) - dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - } - - return ret; -} - -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) -{ - int i; - - for (i = 0; i < dd->vnic.num_ctxt; i++) { - struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - synchronize_irq(me->irq); - } -} - -void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - if (!me->arg) /* => no irq, no affinity */ - return; - - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - - me->arg = NULL; -} - -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me; - int idx = rcd->ctxt; - void *arg = rcd; - int ret; - - 
rcd->msix_intr = dd->vnic.msix_idx++; - me = &dd->msix_entries[rcd->msix_intr]; - - /* - * Set the interrupt register and mask for this - * context's interrupt. - */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - me->type = IRQ_RCVCTXT; - me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); - remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); - - ret = pci_request_irq(dd->pcidev, rcd->msix_intr, - receive_context_interrupt, - receive_context_thread, arg, - DRIVER_NAME "_%d kctxt%d", dd->unit, idx); - if (ret) { - dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", - me->irq, idx, ret); - return; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) { - dd_dev_err(dd, - "unable to pin IRQ %d\n", ret); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - } + remap_intr(dd, IS_SDMA_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr); } /* * Set the general handler to accept all interrupts, remap all * chip interrupts back to MSI-X 0. 
*/ -static void reset_interrupts(struct hfi1_devdata *dd) +void reset_interrupts(struct hfi1_devdata *dd) { int i; @@ -13318,54 +13125,33 @@ static void reset_interrupts(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_MAP + (8 * i), 0); } +/** + * set_up_interrupts() - Initialize the IRQ resources and state + * @dd: valid devdata + * + */ static int set_up_interrupts(struct hfi1_devdata *dd) { - u32 total; - int ret, request; - - /* - * Interrupt count: - * 1 general, "slow path" interrupt (includes the SDMA engines - * slow source, SDMACleanupDone) - * N interrupts - one per used SDMA engine - * M interrupt - one per kernel receive context - * V interrupt - one for each VNIC context - */ - total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* ask for MSI-X interrupts */ - request = request_msix(dd, total); - if (request < 0) { - ret = request; - goto fail; - } else { - dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), - GFP_KERNEL); - if (!dd->msix_entries) { - ret = -ENOMEM; - goto fail; - } - /* using MSI-X */ - dd->num_msix_entries = total; - dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); - } + int ret; /* mask all interrupts */ - set_intr_state(dd, 0); + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + /* clear all pending interrupts */ clear_all_interrupts(dd); /* reset general handler mask, chip MSI-X mappings */ reset_interrupts(dd); - ret = request_msix_irqs(dd); + /* ask for MSI-X interrupts */ + ret = msix_initialize(dd); if (ret) - goto fail; + return ret; - return 0; + ret = msix_request_irqs(dd); + if (ret) + msix_clean_up_interrupts(dd); -fail: - hfi1_clean_up_interrupts(dd); return ret; } @@ -14918,20 +14704,16 @@ err_exit: } /** - * Allocate and initialize the device structure for the hfi. + * hfi1_init_dd() - Initialize most of the dd structure. 
* @dev: the pci_dev for hfi1_ib device * @ent: pci_device_id struct for this dev * - * Also allocates, initializes, and returns the devdata struct for this - * device instance - * * This is global, and is called directly at init to set up the * chip-specific function pointers for later use. */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent) +int hfi1_init_dd(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd; + struct pci_dev *pdev = dd->pcidev; struct hfi1_pportdata *ppd; u64 reg; int i, ret; @@ -14942,13 +14724,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, "Functional simulator" }; struct pci_dev *parent = pdev->bus->self; - u32 sdma_engines; + u32 sdma_engines = chip_sdma_engines(dd); - dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * - sizeof(struct hfi1_pportdata)); - if (IS_ERR(dd)) - goto bail; - sdma_engines = chip_sdma_engines(dd); ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) { int vl; @@ -15127,6 +14904,12 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + /* + * This should probably occur in hfi1_pcie_init(), but historically + * occurs after the do_pcie_gen3_transition() code. 
+ */ + tune_pcie_caps(dd); + /* start setting dd values and adjusting CSRs */ init_early_variables(dd); @@ -15239,14 +15022,13 @@ bail_free_cntrs: free_cntrs(dd); bail_clear_intr: hfi1_comp_vectors_clean_up(dd); - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); bail_free: hfi1_free_devdata(dd); - dd = ERR_PTR(ret); bail: - return dd; + return ret; } static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 36b04d6300e5..6b9c8f12dff8 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -52,9 +52,7 @@ */ /* sizes */ -#define CCE_NUM_MSIX_VECTORS 256 -#define CCE_NUM_INT_CSRS 12 -#define CCE_NUM_INT_MAP_CSRS 96 +#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64)) #define NUM_INTERRUPT_SOURCES 768 #define RXE_NUM_CONTEXTS 160 #define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ @@ -161,34 +159,49 @@ (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) -/* interrupt source numbers */ -#define IS_GENERAL_ERR_START 0 -#define IS_SDMAENG_ERR_START 16 -#define IS_SENDCTXT_ERR_START 32 -#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +/* Specific IRQ sources */ +#define CCE_ERR_INT 0 +#define RXE_ERR_INT 1 +#define MISC_ERR_INT 2 +#define PIO_ERR_INT 4 +#define SDMA_ERR_INT 5 +#define EGRESS_ERR_INT 6 +#define TXE_ERR_INT 7 +#define PBC_INT 240 +#define GPIO_ASSERT_INT 241 +#define QSFP1_INT 242 +#define QSFP2_INT 243 +#define TCRIT_INT 244 + +/* interrupt source ranges */ +#define IS_FIRST_SOURCE CCE_ERR_INT +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 +#define IS_SDMA_PROGRESS_START 208 +#define IS_SDMA_IDLE_START 224 #define IS_VARIOUS_START 240 #define IS_DC_START 248 #define IS_RCVAVAIL_START 256 #define IS_RCVURGENT_START 416 #define IS_SENDCREDIT_START 576 
#define IS_RESERVED_START 736 -#define IS_MAX_SOURCES 768 +#define IS_LAST_SOURCE 767 /* derived interrupt source values */ -#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START -#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START -#define IS_SENDCTXT_ERR_END IS_SDMA_START -#define IS_SDMA_END IS_VARIOUS_START -#define IS_VARIOUS_END IS_DC_START -#define IS_DC_END IS_RCVAVAIL_START -#define IS_RCVAVAIL_END IS_RCVURGENT_START -#define IS_RCVURGENT_END IS_SENDCREDIT_START -#define IS_SENDCREDIT_END IS_RESERVED_START -#define IS_RESERVED_END IS_MAX_SOURCES - -/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ -#define QSFP1_INT 242 -#define QSFP2_INT 243 +#define IS_GENERAL_ERR_END 7 +#define IS_SDMAENG_ERR_END 31 +#define IS_SENDCTXT_ERR_END 191 +#define IS_SDMA_END 207 +#define IS_SDMA_PROGRESS_END 223 +#define IS_SDMA_IDLE_END 239 +#define IS_VARIOUS_END 244 +#define IS_DC_END 255 +#define IS_RCVAVAIL_END 415 +#define IS_RCVURGENT_END 575 +#define IS_SENDCREDIT_END 735 +#define IS_RESERVED_END IS_LAST_SOURCE /* DCC_CFG_PORT_CONFIG logical link states */ #define LSTATE_DOWN 0x1 @@ -1416,6 +1429,18 @@ void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd); +irqreturn_t general_interrupt(int irq, void *data); +irqreturn_t sdma_interrupt(int irq, void *data); +irqreturn_t receive_context_interrupt(int irq, void *data); +irqreturn_t receive_context_thread(int irq, void *data); + +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); +void init_qsfp_int(struct hfi1_devdata *dd); +void clear_all_interrupts(struct hfi1_devdata *dd); +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); +void reset_interrupts(struct hfi1_devdata *dd); + /* * Interrupt source table. 
* diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index ee6dca5e2a2f..c6163a347e93 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -878,6 +878,10 @@ #define SEND_CTRL (TXE + 0x000000000000) #define SEND_CTRL_CM_RESET_SMASK 0x4ull #define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) #define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull #define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) #define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1fc75647e47b..c22ebc774a6a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -681,7 +681,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + HFI1_RCVCTRL_NO_EGR_DROP_DIS | + HFI1_RCVCTRL_URGENT_DIS, uctxt); /* Clear the context's J_KEY */ hfi1_clear_ctxt_jkey(dd, uctxt); /* @@ -1096,6 +1097,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt) hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB; if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; /* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index d9470317983f..1401b6ea4a28 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -80,6 +80,7 @@ #include "qsfp.h" #include "platform.h" #include "affinity.h" +#include "msix.h" /* bumped 1 from s/w major version of TrueScale */ #define HFI1_CHIP_VERS_MAJ 
3U @@ -620,6 +621,8 @@ struct rvt_sge_state; #define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 #define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 #define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 +#define HFI1_RCVCTRL_URGENT_ENB 0x40000 +#define HFI1_RCVCTRL_URGENT_DIS 0x80000 /* partition enforcement flags */ #define HFI1_PART_ENFORCE_IN 0x1 @@ -667,6 +670,14 @@ struct hfi1_msix_entry { struct irq_affinity_notify notify; }; +struct hfi1_msix_info { + /* lock to synchronize in_use_msix access */ + spinlock_t msix_lock; + DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS); + struct hfi1_msix_entry *msix_entries; + u16 max_requested; +}; + /* per-SL CCA information */ struct cca_timer { struct hrtimer hrtimer; @@ -992,7 +1003,6 @@ struct hfi1_vnic_data { struct idr vesw_idr; u8 rmt_start; u8 num_ctxt; - u32 msix_idx; }; struct hfi1_vnic_vport_info; @@ -1205,11 +1215,6 @@ struct hfi1_devdata { struct diag_client *diag_client; - /* MSI-X information */ - struct hfi1_msix_entry *msix_entries; - u32 num_msix_entries; - u32 first_dyn_msix_idx; - /* general interrupt: mask of handled interrupts */ u64 gi_mask[CCE_NUM_INT_CSRS]; @@ -1223,6 +1228,9 @@ struct hfi1_devdata { */ struct timer_list synth_stats_timer; + /* MSI-X information */ + struct hfi1_msix_info msix_info; + /* * device counters */ @@ -1349,6 +1357,8 @@ struct hfi1_devdata { /* vnic data */ struct hfi1_vnic_data vnic; + /* Lock to protect IRQ SRC register access */ + spinlock_t irq_src_lock; }; static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) @@ -1431,9 +1441,6 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); void set_all_slowpath(struct hfi1_devdata *dd); -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd); -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); -void hfi1_reset_vnic_msix_info(struct 
hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; void hfi1_make_ud_req_9B(struct rvt_qp *qp, @@ -1887,10 +1894,8 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) #define HFI1_CTXT_WAITING_URG 4 /* free up any allocated data at closes */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent); +int hfi1_init_dd(struct hfi1_devdata *dd); void hfi1_free_devdata(struct hfi1_devdata *dd); -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); /* LED beaconing functions */ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, @@ -1963,6 +1968,7 @@ static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) */ extern const char ib_hfi1_version[]; +extern const struct attribute_group ib_hfi1_attr_group; int hfi1_device_create(struct hfi1_devdata *dd); void hfi1_device_remove(struct hfi1_devdata *dd); @@ -1974,16 +1980,15 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); /* Hook for sysfs read of QSFP */ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent); -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); +int hfi1_pcie_init(struct hfi1_devdata *dd); void hfi1_pcie_cleanup(struct pci_dev *pdev); int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); void hfi1_pcie_ddcleanup(struct hfi1_devdata *); int pcie_speeds(struct hfi1_devdata *dd); -int request_msix(struct hfi1_devdata *dd, u32 msireq); int restore_pci_variables(struct hfi1_devdata *dd); int save_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); +void tune_pcie_caps(struct hfi1_devdata *dd); int parse_platform_config(struct hfi1_devdata *dd); int get_platform_config_field(struct hfi1_devdata *dd, enum platform_config_table_type_encoding @@ -2124,19 +2129,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata 
*dd) return base_sdma_integrity; } -/* - * hfi1_early_err is used (only!) to print early errors before devdata is - * allocated, or when dd->pcidev may not be valid, and at the tail end of - * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is - * the same as dd_dev_err, but is used when the message really needs - * the IB port# to be definitive as to what's happening.. - */ -#define hfi1_early_err(dev, fmt, ...) \ - dev_err(dev, fmt, ##__VA_ARGS__) - -#define hfi1_early_info(dev, fmt, ...) \ - dev_info(dev, fmt, ##__VA_ARGS__) - #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 758d273c32cf..09044905284f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -83,6 +83,8 @@ #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ +#define NUM_IB_PORTS 1 + /* * Number of user receive contexts we are configured to use (to allow for more * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 
@@ -654,9 +656,8 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->part_enforce |= HFI1_PART_ENFORCE_IN; if (loopback) { - hfi1_early_err(&pdev->dev, - "Faking data partition 0x8001 in idx %u\n", - !default_pkey_idx); + dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); ppd->pkeys[!default_pkey_idx] = 0x8001; } @@ -702,9 +703,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, return; bail: - - hfi1_early_err(&pdev->dev, - "Congestion Control Agent disabled for port %d\n", port); + dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); } /* @@ -833,6 +832,23 @@ wq_error: } /** + * enable_general_intr() - Enable the IRQs that will be handled by the + * general interrupt handler. + * @dd: valid devdata + * + */ +static void enable_general_intr(struct hfi1_devdata *dd) +{ + set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); + set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); + set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); + set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); + set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); + set_intr_bits(dd, IS_DC_START, IS_DC_END, true); + set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); +} + +/** * hfi1_init - do the actual initialization sequence on the chip * @dd: the hfi1_ib device * @reinit: re-initializing, so don't allocate new memory @@ -916,6 +932,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); ret = lastfail; } + /* enable IRQ */ hfi1_rcd_put(rcd); } @@ -954,7 +971,8 @@ done: HFI1_STATUS_INITTED; if (!ret) { /* enable all interrupts from the chip */ - set_intr_state(dd, 1); + enable_general_intr(dd); + init_qsfp_int(dd); /* chip is OK for user apps; mark it as initialized */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1051,9 +1069,9 @@ static void shutdown_device(struct hfi1_devdata *dd) } dd->flags 
&= ~HFI1_INITTED; - /* mask and clean up interrupts, but not errors */ - set_intr_state(dd, 0); - hfi1_clean_up_interrupts(dd); + /* mask and clean up interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + msix_clean_up_interrupts(dd); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1246,15 +1264,19 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) kobject_put(&dd->kobj); } -/* - * Allocate our primary per-unit data structure. Must be done via verbs - * allocator, because the verbs cleanup process both does cleanup and - * free of the data structure. +/** + * hfi1_alloc_devdata - Allocate our primary per-unit data structure. + * @pdev: Valid PCI device + * @extra: How many bytes to alloc past the default + * + * Must be done via verbs allocator, because the verbs cleanup process + * both does cleanup and free of the data structure. * "extra" is for chip-specific data. * * Use the idr mechanism to get a unit number for this unit. */ -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, + size_t extra) { unsigned long flags; struct hfi1_devdata *dd; @@ -1287,8 +1309,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) idr_preload_end(); if (ret < 0) { - hfi1_early_err(&pdev->dev, - "Could not allocate unit ID: error %d\n", -ret); + dev_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); goto bail; } rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); @@ -1309,6 +1331,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) spin_lock_init(&dd->pio_map_lock); mutex_init(&dd->dc8051_lock); init_waitqueue_head(&dd->event_queue); + spin_lock_init(&dd->irq_src_lock); dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { @@ -1481,9 +1504,6 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); - ret = 
hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1492,8 +1512,6 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1510,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); node_affinity_destroy_all(); - hfi1_wss_exit(); hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); @@ -1604,23 +1621,23 @@ static void postinit_cleanup(struct hfi1_devdata *dd) hfi1_free_devdata(dd); } -static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) +static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) { if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, "Receive header queue count too small\n"); + dd_dev_err(dd, "Receive header queue count too small\n"); return -EINVAL; } if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, - "Receive header queue count cannot be greater than %u\n", - HFI1_MAX_HDRQ_EGRBUF_CNT); + dd_dev_err(dd, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); return -EINVAL; } if (thecnt % HDRQ_INCREMENT) { - hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", - thecnt, HDRQ_INCREMENT); + dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); return -EINVAL; } @@ -1639,22 +1656,29 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Validate dev ids */ if (!(ent->device == PCI_DEVICE_ID_INTEL0 || ent->device == PCI_DEVICE_ID_INTEL1)) { - hfi1_early_err(&pdev->dev, - "Failing on unknown Intel deviceid 0x%x\n", - ent->device); + dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", + ent->device); ret = -ENODEV; goto bail; } + /* Allocate the dd so we can get to work */ + dd = 
hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto bail; + } + /* Validate some global module parameters */ - ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt); if (ret) goto bail; /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { - hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", - hfi1_hdrq_entsize); + dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); ret = -EINVAL; goto bail; } @@ -1676,10 +1700,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) clamp_val(eager_buffer_size, MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL); - hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", - eager_buffer_size); + dd_dev_info(dd, "Eager buffer size %u\n", + eager_buffer_size); } else { - hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); ret = -EINVAL; goto bail; } @@ -1687,7 +1711,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* restrict value of hfi1_rcvarr_split */ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); - ret = hfi1_pcie_init(pdev, ent); + ret = hfi1_pcie_init(dd); if (ret) goto bail; @@ -1695,12 +1719,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * Do device-specific initialization, function table setup, dd * allocation, etc. 
*/ - dd = hfi1_init_dd(pdev, ent); - - if (IS_ERR(dd)) { - ret = PTR_ERR(dd); + ret = hfi1_init_dd(dd); + if (ret) goto clean_bail; /* error already printed */ - } ret = create_workqueues(dd); if (ret) @@ -1731,7 +1752,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); if (initfail || ret) { - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); stop_timers(dd); flush_workqueue(ib_wq); for (pidx = 0; pidx < dd->num_pports; ++pidx) { diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c new file mode 100644 index 000000000000..582f1ba136ff --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "iowait.h" +#include "trace_iowait.h" + +void iowait_set_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_set(wait, flag); + set_bit(flag, &wait->flags); +} + +bool iowait_flag_set(struct iowait *wait, u32 flag) +{ + return test_bit(flag, &wait->flags); +} + +inline void iowait_clear_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_clear(wait, flag); + clear_bit(flag, &wait->flags); +} + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. 
+ * + */ +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + int i; + + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; + wait->flags = 0; + for (i = 0; i < IOWAIT_SES; i++) { + wait->wait[i].iow = wait; + INIT_LIST_HEAD(&wait->wait[i].tx_head); + if (i == IOWAIT_IB_SE) + INIT_WORK(&wait->wait[i].iowork, func); + else + INIT_WORK(&wait->wait[i].iowork, tidfunc); + } +} + +/** + * iowait_cancel_work - cancel all work in iowait + * @w: the iowait struct + */ +void iowait_cancel_work(struct iowait *w) +{ + cancel_work_sync(&iowait_get_ib_work(w)->iowork); + cancel_work_sync(&iowait_get_tid_work(w)->iowork); +} + +/** + * iowait_set_work_flag - set work flag based on leg + * @w - the iowait work struct + */ +int iowait_set_work_flag(struct iowait_work *w) +{ + if (w == &w->iow->wait[IOWAIT_IB_SE]) { + iowait_set_flag(w->iow, IOWAIT_PENDING_IB); + return IOWAIT_IB_SE; + } + iowait_set_flag(w->iow, IOWAIT_PENDING_TID); + return IOWAIT_TID_SE; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 3d9c32c7c340..23a58ac0d47c 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -1,7 +1,7 @@ #ifndef _HFI1_IOWAIT_H #define _HFI1_IOWAIT_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. 
When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include <linux/list.h> #include <linux/workqueue.h> +#include <linux/wait.h> #include <linux/sched.h> #include "sdma_txreq.h" @@ -59,16 +60,47 @@ */ typedef void (*restart_t)(struct work_struct *work); +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + struct sdma_txreq; struct sdma_engine; /** - * struct iowait - linkage for delayed progress/waiting + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. + */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** * @list: used to add/insert into QP/PQ wait lists - * @lock: uses to record the list head lock * @tx_head: overflow list of sdma_txreq's * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 * @wait_pio: wait for pio_busy == 0 @@ -76,6 +108,8 @@ struct sdma_engine; * @count: total number of descriptors in tx_head'ed list * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array * * This is to be embedded in user's state structure * (QP or PQ). @@ -98,13 +132,11 @@ struct sdma_engine; * Waiters explicity know that, but the destroy * code that unwaits QPs does not. 
*/ - struct iowait { struct list_head list; - struct list_head tx_head; int (*sleep)( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, uint seq, bool pkts_sent @@ -112,7 +144,6 @@ struct iowait { void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); seqlock_t *lock; - struct work_struct iowork; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; atomic_t sdma_busy; @@ -121,63 +152,37 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; }; #define SDMA_AVAIL_REASON 0 -/** - * iowait_init() - initialize wait structure - * @wait: wait struct to initialize - * @tx_limit: limit for overflow queuing - * @func: restart function for workqueue - * @sleep: sleep function for no space - * @resume: wakeup function for no space - * - * This function initializes the iowait - * structure embedded in the QP or PQ. - * - */ +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); -static inline void iowait_init( - struct iowait *wait, - u32 tx_limit, - void (*func)(struct work_struct *work), - int (*sleep)( - struct sdma_engine *sde, - struct iowait *wait, - struct sdma_txreq *tx, - uint seq, - bool pkts_sent), - void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) -{ - wait->count = 0; - wait->lock = NULL; - INIT_LIST_HEAD(&wait->list); - INIT_LIST_HEAD(&wait->tx_head); - INIT_WORK(&wait->iowork, func); - init_waitqueue_head(&wait->wait_dma); - init_waitqueue_head(&wait->wait_pio); - atomic_set(&wait->sdma_busy, 0); - atomic_set(&wait->pio_busy, 0); - wait->tx_limit = tx_limit; - wait->sleep = sleep; - wait->wakeup = wakeup; - wait->sdma_drained = sdma_drained; -} +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void 
(*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)); /** - * iowait_schedule() - initialize wait structure + * iowait_schedule() - schedule the default send engine work * @wait: wait struct to schedule * @wq: workqueue for schedule * @cpu: cpu */ -static inline void iowait_schedule( - struct iowait *wait, - struct workqueue_struct *wq, - int cpu) +static inline bool iowait_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) { - queue_work_on(cpu, wq, &wait->iowork); + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } /** @@ -228,6 +233,8 @@ static inline void iowait_sdma_add(struct iowait *wait, int count) */ static inline int iowait_sdma_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->sdma_busy); } @@ -267,11 +274,13 @@ static inline void iowait_pio_inc(struct iowait *wait) } /** - * iowait_sdma_dec - note pio complete + * iowait_pio_dec - note pio complete * @wait: iowait structure */ static inline int iowait_pio_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->pio_busy); } @@ -293,9 +302,9 @@ static inline void iowait_drain_wakeup(struct iowait *wait) /** * iowait_get_txhead() - get packet off of iowait list * - * @wait wait struture + * @wait iowait_work struture */ -static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) { struct sdma_txreq *tx = NULL; @@ -309,6 +318,28 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) return tx; } +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct 
sdma_txreq, + list); + num_desc = tx->num_desc; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -372,12 +403,57 @@ static inline void iowait_starve_find_max(struct iowait *w, u8 *max, } /** - * iowait_packet_queued() - determine if a packet is already built - * @wait: the wait structure + * iowait_packet_queued() - determine if a packet is queued + * @wait: the iowait_work structure */ -static inline bool iowait_packet_queued(struct iowait *wait) +static inline bool iowait_packet_queued(struct iowait_work *wait) { return !list_empty(&wait->tx_head); } +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + #endif diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 0307405491e0..88a0cf930136 100644 --- 
a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -4836,7 +4836,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, int ret; int pkey_idx; int local_mad = 0; - u32 resp_len = 0; + u32 resp_len = in_wc->byte_len - sizeof(*in_grh); struct hfi1_ibport *ibp = to_iport(ibdev, port); pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e1c7996c018e..475b769e120c 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,7 +77,6 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { - .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c new file mode 100644 index 000000000000..d920b165d696 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" + +/** + * msix_initialize() - Calculate, request and configure MSIx IRQs + * @dd: valid hfi1 devdata + * + */ +int msix_initialize(struct hfi1_devdata *dd) +{ + u32 total; + int ret; + struct hfi1_msix_entry *entries; + + /* + * MSIx interrupt count: + * one for the general, "slow path" interrupt + * one per used SDMA engine + * one per kernel receive context + * one for each VNIC context + * ...any new IRQs should be added here. + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; + + if (total >= CCE_NUM_MSIX_VECTORS) + return -EINVAL; + + ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX); + if (ret < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret); + return ret; + } + + entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries), + GFP_KERNEL); + if (!entries) { + pci_free_irq_vectors(dd->pcidev); + return -ENOMEM; + } + + dd->msix_info.msix_entries = entries; + spin_lock_init(&dd->msix_info.msix_lock); + bitmap_zero(dd->msix_info.in_use_msix, total); + dd->msix_info.max_requested = total; + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + + return 0; +} + +/** + * msix_request_irq() - Allocate a free MSIx IRQ + * @dd: valid devdata + * @arg: context information for the IRQ + * @handler: IRQ handler + * @thread: IRQ thread handler (could be NULL) + * @idx: zero base idx if multiple devices are needed + * @type: affinty IRQ type + * + * Allocated an MSIx vector if available, and then create the appropriate + * meta data needed to keep track of the pci IRQ request. 
+ * + * Return: + * < 0 Error + * >= 0 MSIx vector + * + */ +static int msix_request_irq(struct hfi1_devdata *dd, void *arg, + irq_handler_t handler, irq_handler_t thread, + u32 idx, enum irq_type type) +{ + unsigned long nr; + int irq; + int ret; + const char *err_info; + char name[MAX_NAME_SIZE]; + struct hfi1_msix_entry *me; + + /* Allocate an MSIx vector */ + spin_lock(&dd->msix_info.msix_lock); + nr = find_first_zero_bit(dd->msix_info.in_use_msix, + dd->msix_info.max_requested); + if (nr < dd->msix_info.max_requested) + __set_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + + if (nr == dd->msix_info.max_requested) + return -ENOSPC; + + /* Specific verification and determine the name */ + switch (type) { + case IRQ_GENERAL: + /* general interrupt must be MSIx vector 0 */ + if (nr) { + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n", + nr); + return -EINVAL; + } + snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + break; + case IRQ_SDMA: + snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d", + dd->unit, idx); + err_info = "sdma"; + break; + case IRQ_RCVCTXT: + snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d", + dd->unit, idx); + err_info = "receive context"; + break; + case IRQ_OTHER: + default: + return -EINVAL; + } + name[sizeof(name) - 1] = 0; + + irq = pci_irq_vector(dd->pcidev, nr); + ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name); + if (ret) { + dd_dev_err(dd, + "%s: request for IRQ %d failed, MSIx %d, err %d\n", + err_info, irq, idx, ret); + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + return ret; + } + + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me = &dd->msix_info.msix_entries[nr]; + me->irq = irq; + me->arg = arg; + 
me->type = type; + + /* This is a request, so a failure is not fatal */ + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); + + return nr; +} + +/** + * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs + * @rcd: valid rcd context + * + */ +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd) +{ + int nr; + + nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt, + receive_context_thread, rcd->ctxt, IRQ_RCVCTXT); + if (nr < 0) + return nr; + + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64; + rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64); + rcd->msix_intr = nr; + remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr); + + return 0; +} + +/** + * msix_request_sdma_irq() - Helper for getting SDMA IRQ resources + * @sde: valid sdma engine + * + */ +int msix_request_sdma_irq(struct sdma_engine *sde) +{ + int nr; + + nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL, + sde->this_idx, IRQ_SDMA); + if (nr < 0) + return nr; + sde->msix_intr = nr; + remap_sdma_interrupts(sde->dd, sde->this_idx, nr); + + return 0; +} + +/** + * enable_sdma_srcs() - Helper to enable SDMA IRQ srcs + * @dd: valid devdata structure + * @i: index of SDMA engine + */ +static void enable_sdma_srcs(struct hfi1_devdata *dd, int i) +{ + set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true); + set_intr_bits(dd, IS_SDMA_PROGRESS_START + i, + IS_SDMA_PROGRESS_START + i, true); + set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true); + set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i, + true); +} + +/** + * msix_request_irqs() - Allocate all MSIx IRQs + * @dd: valid devdata structure + * + * Helper function to request the used MSIx IRQs.
+ * + */ +int msix_request_irqs(struct hfi1_devdata *dd) +{ + int i; + int ret; + + ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL); + if (ret < 0) + return ret; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + ret = msix_request_sdma_irq(sde); + if (ret) + return ret; + enable_sdma_srcs(sde->dd, i); + } + + for (i = 0; i < dd->n_krcv_queues; i++) { + struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i); + + if (rcd) + ret = msix_request_rcd_irq(rcd); + hfi1_rcd_put(rcd); + if (ret) + return ret; + } + + return 0; +} + +/** + * msix_free_irq() - Free the specified MSIx resources and IRQ + * @dd: valid devdata + * @msix_intr: MSIx vector to free. + * + */ +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr) +{ + struct hfi1_msix_entry *me; + + if (msix_intr >= dd->msix_info.max_requested) + return; + + me = &dd->msix_info.msix_entries[msix_intr]; + + if (!me->arg) /* => no irq, no affinity */ + return; + + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, msix_intr, me->arg); + + me->arg = NULL; + + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(msix_intr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); +} + +/** + * msix_clean_up_interrupts() - Free all MSIx IRQ resources + * @dd: valid device data structure + * + * Free the MSIx and associated PCI resources, if they have been allocated.
+ */ +void msix_clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_msix_entry *me = dd->msix_info.msix_entries; + + /* remove irqs - must happen before disabling/turning off */ + for (i = 0; i < dd->msix_info.max_requested; i++, me++) + msix_free_irq(dd, i); + + /* clean structures */ + kfree(dd->msix_info.msix_entries); + dd->msix_info.msix_entries = NULL; + dd->msix_info.max_requested = 0; + + pci_free_irq_vectors(dd->pcidev); +} + +/** + * msix_vnic_syncrhonize_irq() - Vnic IRQ synchronize + * @dd: valid devdata + */ +void msix_vnic_synchronize_irq(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->vnic.num_ctxt; i++) { + struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; + struct hfi1_msix_entry *me; + + me = &dd->msix_info.msix_entries[rcd->msix_intr]; + + synchronize_irq(me->irq); + } +} diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h new file mode 100644 index 000000000000..a514881632a4 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef _HFI1_MSIX_H +#define _HFI1_MSIX_H + +#include "hfi.h" + +/* MSIx interface */ +int msix_initialize(struct hfi1_devdata *dd); +int msix_request_irqs(struct hfi1_devdata *dd); +void msix_clean_up_interrupts(struct hfi1_devdata *dd); +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd); +int msix_request_sdma_irq(struct sdma_engine *sde); +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr); + +/* VNIC interface */ +void msix_vnic_synchronize_irq(struct hfi1_devdata *dd); + +#endif diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 6c967dde58e7..c96d193bb236 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -61,19 +61,12 @@ */ /* - * Code to adjust PCIe capabilities. - */ -static void tune_pcie_caps(struct hfi1_devdata *); - -/* * Do all the common PCIe setup and initialization. - * devdata is not yet allocated, and is not allocated until after this - * routine returns success. Therefore dd_dev_err() can't be used for error - * printing. */ -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +int hfi1_pcie_init(struct hfi1_devdata *dd) { int ret; + struct pci_dev *pdev = dd->pcidev; ret = pci_enable_device(pdev); if (ret) { @@ -89,15 +82,13 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) * about that, it appears. If the original BAR was retained * in the kernel data structures, this may be OK. 
*/ - hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", - -ret); - goto done; + dd_dev_err(dd, "pci enable failed: error %d\n", -ret); + return ret; } ret = pci_request_regions(pdev, DRIVER_NAME); if (ret) { - hfi1_early_err(&pdev->dev, - "pci_request_regions fails: err %d\n", -ret); + dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret); goto bail; } @@ -110,8 +101,7 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) */ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret); goto bail; } ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); @@ -119,18 +109,16 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); } if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA consistent mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret); goto bail; } pci_set_master(pdev); (void)pci_enable_pcie_error_reporting(pdev); - goto done; + return 0; bail: hfi1_pcie_cleanup(pdev); -done: return ret; } @@ -206,7 +194,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) dd_dev_err(dd, "WC mapping of send buffers failed\n"); goto nomem; } - dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE); + dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE); dd->physaddr = addr; /* used for io_remap, etc. 
*/ @@ -344,26 +332,6 @@ int pcie_speeds(struct hfi1_devdata *dd) return 0; } -/* - * Returns: - * - actual number of interrupts allocated or - * - error - */ -int request_msix(struct hfi1_devdata *dd, u32 msireq) -{ - int nvec; - - nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); - if (nvec < 0) { - dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); - return nvec; - } - - tune_pcie_caps(dd); - - return nvec; -} - /* restore command and BARs after a reset has wiped them out */ int restore_pci_variables(struct hfi1_devdata *dd) { @@ -479,14 +447,19 @@ error: * Check and optionally adjust them to maximize our throughput. */ static int hfi1_pcie_caps; -module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); uint aspm_mode = ASPM_MODE_DISABLED; -module_param_named(aspm, aspm_mode, uint, S_IRUGO); +module_param_named(aspm, aspm_mode, uint, 0444); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static void tune_pcie_caps(struct hfi1_devdata *dd) +/** + * tune_pcie_caps() - Code to adjust PCIe capabilities. + * @dd: Valid device data structure + * + */ +void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -650,7 +623,6 @@ pci_resume(struct pci_dev *pdev) struct hfi1_devdata *dd = pci_get_drvdata(pdev); dd_dev_info(dd, "HFI1 resume function called\n"); - pci_cleanup_aer_uncorrect_error_status(pdev); /* * Running jobs will fail, since it's asynchronous * unlike sysfs-requested reset. 
Better than @@ -1029,6 +1001,7 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) const u8 (*ctle_tunings)[4]; uint static_ctle_mode; int return_error = 0; + u32 target_width; /* PCIe Gen3 is for the ASIC only */ if (dd->icode != ICODE_RTL_SILICON) @@ -1068,6 +1041,9 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) return 0; } + /* Previous Gen1/Gen2 bus width */ + target_width = dd->lbus_width; + /* * Do the Gen3 transition. Steps are those of the PCIe Gen3 * recipe. @@ -1436,11 +1412,12 @@ retry: dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, dd->lbus_info); - if (dd->lbus_speed != target_speed) { /* not target */ + if (dd->lbus_speed != target_speed || + dd->lbus_width < target_width) { /* not target */ /* maybe retry */ do_retry = retry_count < pcie_retry; - dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", - pcie_target, do_retry ? ", retrying" : ""); + dd_dev_err(dd, "PCIe link speed or width did not match target%s\n", + do_retry ? ", retrying" : ""); retry_count++; if (do_retry) { msleep(100); /* allow time to settle */ diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 752057647f09..9ab50d2308dc 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -71,14 +71,6 @@ void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) } } -/* defined in header release 48 and higher */ -#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT -#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 -#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull -#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ - << SEND_CTRL_UNSUPPORTED_VL_SHIFT) -#endif - /* global control of PIO send */ void pio_send_control(struct hfi1_devdata *dd, int op) { diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 9b1e84a6b1cc..6f3bc4dab858 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(qp_table_size, "QP table 
size"); static void flush_tx_list(struct rvt_qp *qp); static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, unsigned int seq, bool pkts_sent); @@ -134,15 +134,13 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { }; -static void flush_tx_list(struct rvt_qp *qp) +static void flush_list_head(struct list_head *l) { - struct hfi1_qp_priv *priv = qp->priv; - - while (!list_empty(&priv->s_iowait.tx_head)) { + while (!list_empty(l)) { struct sdma_txreq *tx; tx = list_first_entry( - &priv->s_iowait.tx_head, + l, struct sdma_txreq, list); list_del_init(&tx->list); @@ -151,6 +149,14 @@ static void flush_tx_list(struct rvt_qp *qp) } } +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head); + flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head); +} + static void flush_iowait(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; @@ -282,33 +288,46 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, } /** - * hfi1_check_send_wqe - validate wqe + * hfi1_setup_wqe - set up the wqe * @qp - The qp * @wqe - The built wqe + * @call_send - Determine if the send should be posted or scheduled. * - * validate wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allows the driver the opportunity to perform + * validation and additional setup of the wqe. 
* * Returns 0 on success, -EINVAL on failure * */ -int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; switch (qp->ibqp.qp_type) { case IB_QPT_RC: case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: - ah = ibah_to_rvtah(wqe->ud_wr.ah); - if (wqe->length > (1 << ah->log_pmtu)) + /* + * SM packets should exclusively use VL15 and their SL is + * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah + * is created, SL is 0 in most cases and as a result some + * fields (vl and pmtu) in ah may not be set correctly, + * depending on the SL2SC and SC2VL tables at the time. + */ + ppd = ppd_from_ibp(ibp); + dd = dd_from_ppd(ppd); + if (wqe->length > dd->vld[15].mtu) return -EINVAL; break; case IB_QPT_GSI: @@ -321,7 +340,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, default: break; } - return wqe->length <= piothreshold; + return 0; } /** @@ -333,7 +352,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, * It is only used in the post send, which doesn't hold * the s_lock. */ -void _hfi1_schedule_send(struct rvt_qp *qp) +bool _hfi1_schedule_send(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = @@ -341,10 +360,10 @@ void _hfi1_schedule_send(struct rvt_qp *qp) struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, - priv->s_sde ? - priv->s_sde->cpu : - cpumask_first(cpumask_of_node(dd->node))); + return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? 
+ priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); } static void qp_pio_drain(struct rvt_qp *qp) @@ -372,12 +391,32 @@ static void qp_pio_drain(struct rvt_qp *qp) * * This schedules qp progress and caller should hold * the s_lock. + * @return true if the first leg is scheduled; + * false if the first leg is not scheduled. */ -void hfi1_schedule_send(struct rvt_qp *qp) +bool hfi1_schedule_send(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); - if (hfi1_send_ok(qp)) + if (hfi1_send_ok(qp)) { _hfi1_schedule_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_IB); + return false; +} + +static void hfi1_qp_schedule(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + bool ret; + + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) { + ret = hfi1_schedule_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -388,16 +427,22 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) if (qp->s_flags & flag) { qp->s_flags &= ~flag; trace_hfi1_qpwakeup(qp, flag); - hfi1_schedule_send(qp); + hfi1_qp_schedule(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); /* Notify hfi1_destroy_qp() if it is waiting. 
*/ rvt_put_qp(qp); } +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) +{ + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + qp->s_flags &= ~RVT_S_BUSY; +} + static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, uint seq, bool pkts_sent) @@ -438,7 +483,7 @@ static int iowait_sleep( rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, wait); spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EBUSY; } else { @@ -637,6 +682,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, + NULL, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -686,7 +732,7 @@ void stop_send_queue(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - cancel_work_sync(&priv->s_iowait.iowork); + iowait_cancel_work(&priv->s_iowait); } void quiesce_qp(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 078cff7560b6..7adb6dff6813 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -58,18 +58,6 @@ extern unsigned int hfi1_qp_table_size; extern const struct rvt_operation_params hfi1_post_parms[]; /* - * Send if not busy or waiting for I/O and either - * a RC response is pending or we can process send work requests. 
- */ -static inline int hfi1_send_ok(struct rvt_qp *qp) -{ - return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && - (verbs_txreq_queued(qp) || - (qp->s_flags & RVT_S_RESP_PENDING) || - !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); -} - -/* * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK * * HFI1_S_AHG_VALID - ahg header valid on chip @@ -90,6 +78,20 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) /* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +/* * free_ahg - clear ahg from QP */ static inline void clear_ahg(struct rvt_qp *qp) @@ -129,8 +131,8 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); -void _hfi1_schedule_send(struct rvt_qp *qp); -void hfi1_schedule_send(struct rvt_qp *qp); +bool _hfi1_schedule_send(struct rvt_qp *qp); +bool hfi1_schedule_send(struct rvt_qp *qp); void hfi1_migrate_qp(struct rvt_qp *qp); @@ -150,4 +152,5 @@ void quiesce_qp(struct rvt_qp *qp); u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); int mtu_to_path_mtu(u32 mtu); void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait); #endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9bd63abb2dfe..188aa4f686a0 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -309,7 +309,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = 
rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; @@ -378,9 +378,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, - err ? IB_WC_LOC_PROT_ERR - : IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, + err ? IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); goto done_free_tx; @@ -1043,7 +1043,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else { /* need to handle delayed completion */ @@ -1468,7 +1468,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1644,7 +1644,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1684,7 +1685,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1704,7 +1706,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: 
if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: @@ -2144,7 +2146,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -2200,7 +2202,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 5f56f3c1b4c4..7fb317c711df 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -156,333 +156,6 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) } /** - * ruc_loopback - handle UC and RC loopback requests - * @sqp: the sending QP - * - * This is called from hfi1_do_send() to - * forward a WQE addressed to the same HFI. - * Note that although we are single threaded due to the send engine, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void ruc_loopback(struct rvt_qp *sqp) -{ - struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - bool release; - int ret; - bool copy_last = false; - int local_ops = 0; - - rcu_read_lock(); - - /* - * Note that we check the responder QP state after - * checking the requester's state. 
- */ - qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - sqp->remote_qpn); - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work request. */ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. 
- */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = true; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_REG_MR: - goto send_comp; - - case IB_WR_LOCAL_INV: - if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { - if (rvt_invalidate_rkey(sqp, - wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; - } - goto send_comp; - - case IB_WR_SEND_WITH_INV: - if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { - wc.wc_flags = IB_WC_WITH_INVALIDATE; - wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; - } - goto send; - - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: -send: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* skip copy_last set and qp_access_flags recheck */ - goto do_write; - case IB_WR_RDMA_WRITE: - copy_last = rvt_is_user_qp(qp); - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; -do_write: - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if 
(unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = false; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save result. */ - maddr = (atomic64_t *)qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *)sqp->s_sge.sge.vaddr = - (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? - (u64)atomic64_add_return(sdata, maddr) - sdata : - (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if 
(!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - hfi1_send_complete(sqp, wqe, send_status); - if (local_ops) { - atomic_dec(&sqp->local_ops_pending); - local_ops = 0; - } - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - hfi1_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - -/** * hfi1_make_grh - construct a GRH header * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed @@ -825,8 +498,8 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp) void _hfi1_do_send(struct work_struct *work) { - struct iowait *wait = container_of(work, struct iowait, iowork); - struct rvt_qp *qp = iowait_to_qp(wait); + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); hfi1_do_send(qp, true); } @@ -850,6 +523,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.ibp = to_iport(qp->ibqp.device, qp->port_num); ps.ppd = ppd_from_ibp(ps.ibp); ps.in_thread = 
in_thread; + ps.wait = iowait_get_ib_work(&priv->s_iowait); trace_hfi1_rc_do_send(qp, in_thread); @@ -858,7 +532,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_rc_req; @@ -868,7 +542,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_uc_req; @@ -883,6 +557,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -896,7 +572,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.pkts_sent = false; /* insure a pre-built packet is handled */ - ps.s_txreq = get_waiting_verbs_txreq(qp); + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); do { /* Check for a constructed packet to be sent. */ if (ps.s_txreq) { @@ -907,6 +583,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) */ if (hfi1_verbs_send(qp, &ps)) return; + /* allow other tasks to run */ if (schedule_send_yield(qp, &ps)) return; @@ -917,44 +594,3 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); spin_unlock_irqrestore(&qp->s_lock, ps.flags); } - -/* - * This should be called with s_lock held. 
- */ -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - trace_hfi1_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_hfi1_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_hfi1_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 88e326d6cc49..891d2386d1ca 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -378,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde, __sdma_txclean(sde->dd, tx); if (complete) (*complete)(tx, res); - if (wait && iowait_sdma_dec(wait)) + if (iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); } @@ -1758,7 +1758,6 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) struct iowait *wait, *nw; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; uint i, n = 0, seq, max_idx = 0; - struct sdma_txreq *stx; struct hfi1_ibdev *dev = &sde->dd->verbs_dev; u8 max_starved_cnt = 0; @@ -1779,19 +1778,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) nw, &sde->dmawait, list) { - u16 num_desc = 0; + u32 num_desc; if (!wait->wakeup) continue; if (n == ARRAY_SIZE(waits)) break; - if (!list_empty(&wait->tx_head)) { - stx = list_first_entry( - &wait->tx_head, - struct sdma_txreq, - list); - 
num_desc = stx->num_desc; - } + num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; @@ -2346,7 +2339,7 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) */ static int sdma_check_progress( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2356,12 +2349,12 @@ static int sdma_check_progress( if (tx->num_desc <= sde->desc_avail) return -EAGAIN; /* pulse the head_lock */ - if (wait && wait->sleep) { + if (wait && iowait_ioww_to_iow(wait)->sleep) { unsigned seq; seq = raw_seqcount_begin( (const seqcount_t *)&sde->head_lock.seqcount); - ret = wait->sleep(sde, wait, tx, seq, pkts_sent); + ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); } else { @@ -2373,7 +2366,7 @@ static int sdma_check_progress( /** * sdma_send_txreq() - submit a tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx: sdma_txreq to submit * @pkts_sent: has any packet been sent yet? 
* @@ -2386,7 +2379,7 @@ static int sdma_check_progress( * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2397,7 +2390,7 @@ int sdma_send_txreq(struct sdma_engine *sde, /* user should have supplied entire packet */ if (unlikely(tx->tlen)) return -EINVAL; - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); spin_lock_irqsave(&sde->tail_lock, flags); retry: if (unlikely(!__sdma_running(sde))) @@ -2406,14 +2399,14 @@ retry: goto nodesc; tail = submit_tx(sde, tx); if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); return ret; unlock_noconn: if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2422,10 +2415,7 @@ unlock_noconn: spin_lock(&sde->flushlist_lock); list_add_tail(&tx->list, &sde->flushlist); spin_unlock(&sde->flushlist_lock); - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); schedule_work(&sde->flush_worker); ret = -ECOMM; goto unlock; @@ -2442,9 +2432,9 @@ nodesc: /** * sdma_send_txlist() - submit a list of tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx_list: list of sdma_txreqs to submit - * @count: pointer to a u32 which, after return will contain the total number of + * @count: pointer to a u16 which, after return will contain the total number of * sdma_txreqs removed from the tx_list. 
This will include sdma_txreqs * whose SDMA descriptors are submitted to the ring and the sdma_txreqs * which are added to SDMA engine flush list if the SDMA engine state is @@ -2467,8 +2457,8 @@ nodesc: * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ -int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, - struct list_head *tx_list, u32 *count_out) +int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, + struct list_head *tx_list, u16 *count_out) { struct sdma_txreq *tx, *tx_next; int ret = 0; @@ -2479,7 +2469,7 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, spin_lock_irqsave(&sde->tail_lock, flags); retry: list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); if (unlikely(!__sdma_running(sde))) goto unlock_noconn; if (unlikely(tx->num_desc > sde->desc_avail)) @@ -2500,8 +2490,9 @@ retry: update_tail: total_count = submit_count + flush_count; if (wait) { - iowait_sdma_add(wait, total_count); - iowait_starve_clear(submit_count > 0, wait); + iowait_sdma_add(iowait_ioww_to_iow(wait), total_count); + iowait_starve_clear(submit_count > 0, + iowait_ioww_to_iow(wait)); } if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); @@ -2511,7 +2502,7 @@ update_tail: unlock_noconn: spin_lock(&sde->flushlist_lock); list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); list_del_init(&tx->list); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER @@ -2520,10 +2511,7 @@ unlock_noconn: #endif list_add_tail(&tx->list, &sde->flushlist); flush_count++; - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); } spin_unlock(&sde->flushlist_lock); schedule_work(&sde->flush_worker); diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 
46c775f255d1..6dc63d7c5685 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_SDMA_H #define _HFI1_SDMA_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -62,16 +62,6 @@ /* Hardware limit for SDMA packet size */ #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) -#define SDMA_TXREQ_S_OK 0 -#define SDMA_TXREQ_S_SENDERROR 1 -#define SDMA_TXREQ_S_ABORTED 2 -#define SDMA_TXREQ_S_SHUTDOWN 3 - -/* flags bits */ -#define SDMA_TXREQ_F_URGENT 0x0001 -#define SDMA_TXREQ_F_AHG_COPY 0x0002 -#define SDMA_TXREQ_F_USE_AHG 0x0004 - #define SDMA_MAP_NONE 0 #define SDMA_MAP_SINGLE 1 #define SDMA_MAP_PAGE 2 @@ -415,6 +405,7 @@ struct sdma_engine { struct list_head flushlist; struct cpumask cpu_mask; struct kobject kobj; + u32 msix_intr; }; int sdma_init(struct hfi1_devdata *dd, u8 port); @@ -849,16 +840,16 @@ static inline int sdma_txadd_kvaddr( dd, SDMA_MAP_SINGLE, tx, addr, len); } -struct iowait; +struct iowait_work; int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent); int sdma_send_txlist(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct list_head *tx_list, - u32 *count); + u16 *count_out); int sdma_ahg_alloc(struct sdma_engine *sde); void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 25e867393463..2be513d4c9da 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -494,17 +494,18 @@ static struct kobj_type hfi1_vl2mtu_ktype = { * Start of per-unit (or driver, in some cases, but replicated * per unit) functions (these get a device *) */ -static ssize_t show_rev(struct device *device, 
struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hfi(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); @@ -517,8 +518,9 @@ static ssize_t show_hfi(struct device *device, struct device_attribute *attr, ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); return ret; } +static DEVICE_ATTR_RO(board_id); -static ssize_t show_boardversion(struct device *device, +static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -528,8 +530,9 @@ static ssize_t show_boardversion(struct device *device, /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } +static DEVICE_ATTR_RO(boardversion); -static ssize_t show_nctxts(struct device *device, +static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -546,8 +549,9 @@ static ssize_t show_nctxts(struct device *device, min(dd->num_user_contexts, (u32)dd->sc_sizes[SC_USER].count)); } +static DEVICE_ATTR_RO(nctxts); -static ssize_t show_nfreectxts(struct device *device, +static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -557,8 +561,9 @@ static ssize_t show_nfreectxts(struct device *device, /* Return the number of free user ports (contexts) available. 
*/ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } +static DEVICE_ATTR_RO(nfreectxts); -static ssize_t show_serial(struct device *device, +static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -567,8 +572,9 @@ static ssize_t show_serial(struct device *device, return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); } +static DEVICE_ATTR_RO(serial); -static ssize_t store_chip_reset(struct device *device, +static ssize_t chip_reset_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { @@ -586,6 +592,7 @@ static ssize_t store_chip_reset(struct device *device, bail: return ret < 0 ? ret : count; } +static DEVICE_ATTR_WO(chip_reset); /* * Convert the reported temperature from an integer (reported in @@ -598,7 +605,7 @@ bail: /* * Dump tempsense values, in decimal, to ease shell-scripts. */ -static ssize_t show_tempsense(struct device *device, +static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -622,6 +629,7 @@ static ssize_t show_tempsense(struct device *device, } return ret; } +static DEVICE_ATTR_RO(tempsense); /* * end of per-unit (or driver, in some cases, but replicated @@ -629,24 +637,20 @@ static ssize_t show_tempsense(struct device *device, */ /* start of per-unit file structures and support code */ -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); -static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); -static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); -static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); -static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); -static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); - -static struct device_attribute *hfi1_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_board_id, - 
&dev_attr_nctxts, - &dev_attr_nfreectxts, - &dev_attr_serial, - &dev_attr_boardversion, - &dev_attr_tempsense, - &dev_attr_chip_reset, +static struct attribute *hfi1_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_board_id.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_chip_reset.attr, + NULL, +}; + +const struct attribute_group ib_hfi1_attr_group = { + .attrs = hfi1_attributes, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, @@ -832,12 +836,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) struct device *class_dev = &dev->dev; int i, j, ret; - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { - ret = device_create_file(&dev->dev, hfi1_attributes[i]); - if (ret) - goto bail; - } - for (i = 0; i < dd->num_sdma; i++) { ret = kobject_init_and_add(&dd->per_sdma[i].kobj, &sde_ktype, &class_dev->kobj, @@ -855,9 +853,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) return 0; bail: - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) - device_remove_file(&dev->dev, hfi1_attributes[i]); - for (i = 0; i < dd->num_sdma; i++) kobject_del(&dd->per_sdma[i].kobj); diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 8540463ef3f7..84458f1325e1 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -62,3 +62,4 @@ __print_symbolic(etype, \ #include "trace_rx.h" #include "trace_tx.h" #include "trace_mmu.h" +#include "trace_iowait.h" diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h new file mode 100644 index 000000000000..27f4334ece2b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_iowait.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IOWAIT_H + +#include <linux/tracepoint.h> +#include "iowait.h" +#include "verbs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_iowait + +DECLARE_EVENT_CLASS(hfi1_iowait_template, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag), + TP_STRUCT__entry(/* entry */ + __field(unsigned long, addr) + __field(unsigned long, flags) + __field(u32, flag) + __field(u32, qpn) + ), + TP_fast_assign(/* assign */ + __entry->addr = (unsigned long)wait; + __entry->flags = wait->flags; + __entry->flag = (1 << flag); + __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num; + ), + TP_printk(/* print */ + "iowait 0x%lx qp %u flags 0x%lx flag 0x%x", + __entry->addr, + __entry->qpn, + __entry->flags, + __entry->flag + ) + ); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +#endif /* __HFI1_TRACE_IOWAIT_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_iowait +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index e254dcec6f64..6aca0c5a7f97 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -88,7 +88,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -140,7 +140,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp, wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); @@ -426,7 +426,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -449,7 +449,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -523,7 +523,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -550,7 +550,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -564,7 +564,7 @@ rdma_last: tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + 
qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 70d39fc450a1..4baa8f4d49de 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } hfi1_make_grh(ibp, &grh, &grd, 0, 0); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -518,7 +518,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -560,7 +560,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); ps->flags = tflags; - hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } } @@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, packet->grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { struct ib_grh grh; @@ -1030,14 +1030,14 @@ void 
hfi1_ud_rcv(struct hfi1_packet *packet) * out when creating 16B, add back the GRH here. */ hfi1_make_ext_grh(packet, &grh, slid, dlid); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); } - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), - true, false); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 5c88706121c1..3f0aadccd9f6 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -76,8 +76,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. 
Default: 12 static unsigned initial_pkt_count = 8; -static int user_sdma_send_pkts(struct user_sdma_request *req, - unsigned maxpkts); +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); @@ -101,7 +100,7 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent); @@ -124,13 +123,13 @@ static struct mmu_rb_ops sdma_rb_ops = { static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_user_sdma_pkt_q *pq = - container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); struct hfi1_ibdev *dev = &pq->dd->verbs_dev; struct user_sdma_txreq *tx = container_of(txreq, struct user_sdma_txreq, txreq); @@ -187,13 +186,12 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->ctxt = uctxt->ctxt; pq->subctxt = fd->subctxt; pq->n_max_reqs = hfi1_sdma_comp_ring_size; - pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); atomic_set(&pq->n_locked, 0); pq->mm = fd->mm; - iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, activate_packet_queue, NULL); pq->reqidx = 0; @@ -276,7 +274,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, /* Wait until all requests have been freed. 
*/ wait_event_interruptible( pq->wait, - (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + !atomic_read(&pq->n_reqs)); kfree(pq->reqs); kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); @@ -312,6 +310,13 @@ static u8 dlid_to_selector(u16 dlid) return mapping[hash]; } +/** + * hfi1_user_sdma_process_request() - Process and start a user sdma request + * @fd: valid file descriptor + * @iovec: array of io vectors to process + * @dim: overall iovec array size + * @count: number of io vector array entries processed + */ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, struct iovec *iovec, unsigned long dim, unsigned long *count) @@ -328,7 +333,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, u8 opcode, sc, vl; u16 pkey; u32 slid; - int req_queued = 0; u16 dlid; u32 selector; @@ -392,7 +396,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->data_len = 0; req->pq = pq; req->cq = cq; - req->status = -1; req->ahg_idx = -1; req->iov_idx = 0; req->sent = 0; @@ -400,12 +403,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->seqcomp = 0; req->seqsubmitted = 0; req->tids = NULL; - req->done = 0; req->has_error = 0; INIT_LIST_HEAD(&req->txps); memcpy(&req->info, &info, sizeof(info)); + /* The request is initialized, count it */ + atomic_inc(&pq->n_reqs); + if (req_opcode(info.ctrl) == EXPECTED) { /* expected must have a TID info and at least one data vector */ if (req->data_iovs < 2) { @@ -500,7 +505,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, ret = pin_vector_pages(req, &req->iovs[i]); if (ret) { req->data_iovs = i; - req->status = ret; goto free_req; } req->data_len += req->iovs[i].iov.iov_len; @@ -561,23 +565,11 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->ahg_idx = sdma_ahg_alloc(req->sde); set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); - atomic_inc(&pq->n_reqs); - req_queued = 1; + pq->state = SDMA_PKT_Q_ACTIVE; /* Send the first N packets in the 
request to buy us some time */ ret = user_sdma_send_pkts(req, pcount); - if (unlikely(ret < 0 && ret != -EBUSY)) { - req->status = ret; + if (unlikely(ret < 0 && ret != -EBUSY)) goto free_req; - } - - /* - * It is possible that the SDMA engine would have processed all the - * submitted packets by the time we get here. Therefore, only set - * packet queue state to ACTIVE if there are still uncompleted - * requests. - */ - if (atomic_read(&pq->n_reqs)) - xchg(&pq->state, SDMA_PKT_Q_ACTIVE); /* * This is a somewhat blocking send implementation. @@ -588,14 +580,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, while (req->seqsubmitted != req->info.npkts) { ret = user_sdma_send_pkts(req, pcount); if (ret < 0) { - if (ret != -EBUSY) { - req->status = ret; - WRITE_ONCE(req->has_error, 1); - if (READ_ONCE(req->seqcomp) == - req->seqsubmitted - 1) - goto free_req; - return ret; - } + if (ret != -EBUSY) + goto free_req; wait_event_interruptible_timeout( pq->busy.wait_dma, (pq->state == SDMA_PKT_Q_ACTIVE), @@ -606,10 +592,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, *count += idx; return 0; free_req: - user_sdma_free_request(req, true); - if (req_queued) + /* + * If the submitted seqsubmitted == npkts, the completion routine + * controls the final state. If sequbmitted < npkts, wait for any + * outstanding packets to finish before cleaning up. 
+ */ + if (req->seqsubmitted < req->info.npkts) { + if (req->seqsubmitted) + wait_event(pq->busy.wait_dma, + (req->seqcomp == req->seqsubmitted - 1)); + user_sdma_free_request(req, true); pq_update(pq); - set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); + set_comp_state(pq, cq, info.comp_idx, ERROR, ret); + } return ret; } @@ -760,9 +755,10 @@ static int user_sdma_txadd(struct user_sdma_request *req, return ret; } -static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) { - int ret = 0, count; + int ret = 0; + u16 count; unsigned npkts = 0; struct user_sdma_txreq *tx = NULL; struct hfi1_user_sdma_pkt_q *pq = NULL; @@ -864,8 +860,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) changes = set_txreq_header_ahg(req, tx, datalen); - if (changes < 0) + if (changes < 0) { + ret = changes; goto free_tx; + } } } else { ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + @@ -914,10 +912,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) npkts++; } dosend: - ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); + ret = sdma_send_txlist(req->sde, + iowait_get_ib_work(&pq->busy), + &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { - WRITE_ONCE(req->done, 1); /* * The txreq has already been submitted to the HW queue * so we can free the AHG entry now. Corruption will not @@ -1365,11 +1364,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, return idx; } -/* - * SDMA tx request completion callback. Called when the SDMA progress - * state machine gets notification that the SDMA descriptors for this - * tx request have been processed by the DMA engine. Called in - * interrupt context. +/** + * user_sdma_txreq_cb() - SDMA tx request completion callback. 
+ * @txreq: valid sdma tx request + * @status: success/failure of request + * + * Called when the SDMA progress state machine gets notification that + * the SDMA descriptors for this tx request have been processed by the + * DMA engine. Called in interrupt context. + * Only do work on completed sequences. */ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) { @@ -1378,7 +1381,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) struct user_sdma_request *req; struct hfi1_user_sdma_pkt_q *pq; struct hfi1_user_sdma_comp_q *cq; - u16 idx; + enum hfi1_sdma_comp_state state = COMPLETE; if (!tx->req) return; @@ -1391,39 +1394,25 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) SDMA_DBG(req, "SDMA completion with error %d", status); WRITE_ONCE(req->has_error, 1); + state = ERROR; } req->seqcomp = tx->seqnum; kmem_cache_free(pq->txreq_cache, tx); - tx = NULL; - - idx = req->info.comp_idx; - if (req->status == -1 && status == SDMA_TXREQ_S_OK) { - if (req->seqcomp == req->info.npkts - 1) { - req->status = 0; - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, COMPLETE, 0); - } - } else { - if (status != SDMA_TXREQ_S_OK) - req->status = status; - if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && - (READ_ONCE(req->done) || - READ_ONCE(req->has_error))) { - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, ERROR, req->status); - } - } + + /* sequence isn't complete? 
We are done */ + if (req->seqcomp != req->info.npkts - 1) + return; + + user_sdma_free_request(req, false); + set_comp_state(pq, cq, req->info.comp_idx, state, status); + pq_update(pq); } static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) { - if (atomic_dec_and_test(&pq->n_reqs)) { - xchg(&pq->state, SDMA_PKT_Q_INACTIVE); + if (atomic_dec_and_test(&pq->n_reqs)) wake_up(&pq->wait); - } } static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) @@ -1448,6 +1437,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) if (!node) continue; + req->iovs[i].node = NULL; + if (unpin) hfi1_mmu_rb_remove(req->pq->handler, &node->rb); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index d2bc77f75253..14dfd757dafd 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, #define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ -#define SDMA_PKT_Q_INACTIVE BIT(0) -#define SDMA_PKT_Q_ACTIVE BIT(1) -#define SDMA_PKT_Q_DEFERRED BIT(2) +enum pkt_q_sdma_state { + SDMA_PKT_Q_ACTIVE, + SDMA_PKT_Q_DEFERRED, +}; /* * Maximum retry attempts to submit a TX request @@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q { struct user_sdma_request *reqs; unsigned long *req_in_use; struct iowait busy; - unsigned state; + enum pkt_q_sdma_state state; wait_queue_head_t wait; unsigned long unpinned; struct mmu_rb_handler *handler; @@ -203,14 +204,12 @@ struct user_sdma_request { s8 ahg_idx; /* Writeable fields shared with interrupt */ - u64 seqcomp ____cacheline_aligned_in_smp; - u64 seqsubmitted; - /* status of the last txreq completed */ - int status; + u16 seqcomp ____cacheline_aligned_in_smp; + u16 seqsubmitted; /* Send side fields */ struct list_head txps 
____cacheline_aligned_in_smp; - u64 seqnum; + u16 seqnum; /* * KDETH.OFFSET (TID) field * The offset can cover multiple packets, depending on the @@ -228,7 +227,6 @@ struct user_sdma_request { u16 tididx; /* progress index moving along the iovs array */ u8 iov_idx; - u8 done; u8 has_error; struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; @@ -248,7 +246,7 @@ struct user_sdma_txreq { struct user_sdma_request *req; u16 flags; unsigned int busycount; - u64 seqnum; + u16 seqnum; }; int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a7c586a5589d..48e11e510358 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -129,8 +129,6 @@ unsigned short piothreshold = 256; module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp, /* 16B trailing buffer */ static const u8 trail_buf[MAX_16B_PADDING]; -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent 
range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. - */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. 
- */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. 
*/ @@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = { */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. - */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. 
*/ @@ -713,7 +492,7 @@ static void verbs_sdma_complete( spin_lock(&qp->s_lock); if (tx->wqe) { - hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_opa_header *hdr; @@ -737,7 +516,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); @@ -748,7 +527,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -950,8 +729,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(ret)) goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, - ps->pkts_sent); + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; @@ -1001,7 +779,7 @@ static int pio_wait(struct rvt_qp *qp, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; @@ -1020,7 +798,7 @@ static int pio_wait(struct rvt_qp *qp, hfi1_sc_wantpiobuf_intr(sc, 1); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1160,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pio_bail: if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, 
wc_status); + rvt_send_complete(qp, qp->s_wqe, wc_status); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1367,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock_irqrestore(&qp->s_lock, flags); } return -EINVAL; @@ -1943,7 +1721,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; @@ -1956,10 +1734,16 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, @@ -1967,6 +1751,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) i, ppd->pkeys); + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, + &ib_hfi1_attr_group); + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); if (ret) goto err_verbs_txreq; diff --git 
a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a4d06502f06d..64c9054db5f3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -166,11 +166,13 @@ struct hfi1_qp_priv { * This structure is used to hold commonly lookedup and computed values during * the send engine progress. */ +struct iowait_work; struct hfi1_pkt_state { struct hfi1_ibdev *dev; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + struct iowait_work *wait; unsigned long flags; unsigned long timeout; unsigned long timeout_int; @@ -247,7 +249,7 @@ static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) return container_of(rdi, struct hfi1_ibdev, rdi); } -static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) { struct hfi1_qp_priv *priv; @@ -313,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx); int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - bool release, bool copy_last); - void hfi1_cnp_rcv(struct hfi1_packet *packet); void hfi1_uc_rcv(struct hfi1_packet *packet); @@ -343,7 +342,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; @@ -363,9 +363,6 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp); void hfi1_do_send(struct rvt_qp *qp, bool in_thread); -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); int 
hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); @@ -390,28 +387,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_wss_init(void); -void hfi1_wss_exit(void); - -/* platform specific: return the lowest level cache (llc) size, in KiB */ -static inline int wss_llc_size(void) -{ - /* assume that the boot CPU value is universal for all CPUs */ - return boot_cpu_data.x86_cache_size; -} - -/* platform specific: cacheless copy */ -static inline void cacheless_memcpy(void *dst, void *src, size_t n) -{ - /* - * Use the only available X64 cacheless copy. Add a __user cast - * to quiet sparse. The src agument is already in the kernel so - * there are no security issues. The extra fault recovery machinery - * is not invoked. - */ - __copy_user_nocache(dst, (void __user *)src, n, 0); -} - static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) { return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 1c19bbc764b2..2a77af26a231 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -102,22 +102,19 @@ static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) return &tx->txreq; } -static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w) { struct sdma_txreq *stx; - struct hfi1_qp_priv *priv = qp->priv; - stx = iowait_get_txhead(&priv->s_iowait); + stx = iowait_get_txhead(w); if (stx) return container_of(stx, struct verbs_txreq, txreq); return NULL; } -static inline bool verbs_txreq_queued(struct rvt_qp *qp) +static inline bool verbs_txreq_queued(struct iowait_work *w) { - struct hfi1_qp_priv *priv = qp->priv; - - return iowait_packet_queued(&priv->s_iowait); + return 
iowait_packet_queued(w); } void hfi1_put_txreq(struct verbs_txreq *tx); diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index c643d80c5a53..c9876d9e3cb9 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -120,7 +120,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd, uctxt->seq_cnt = 1; uctxt->is_vnic = true; - hfi1_set_vnic_msix_info(uctxt); + msix_request_rcd_irq(uctxt); hfi1_stats.sps_ctxts++; dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt); @@ -135,8 +135,6 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); flush_wc(); - hfi1_reset_vnic_msix_info(uctxt); - /* * Disable receive context and interrupt available, reset all * RcvCtxtCtrl bits to default values. @@ -148,6 +146,10 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, HFI1_RCVCTRL_NO_RHQ_DROP_DIS | HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + /* msix_intr will always be > 0, only clean up if this is true */ + if (uctxt->msix_intr) + msix_free_irq(dd, uctxt->msix_intr); + uctxt->event_flags = 0; hfi1_clear_tids(uctxt); @@ -626,7 +628,7 @@ static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id); /* ensure irqs see the change */ - hfi1_vnic_synchronize_irq(dd); + msix_vnic_synchronize_irq(dd); /* remove unread skbs */ for (i = 0; i < vinfo->num_rx_q; i++) { @@ -690,8 +692,6 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) rc = hfi1_vnic_txreq_init(dd); if (rc) goto txreq_fail; - - dd->vnic.msix_idx = dd->first_dyn_msix_idx; } for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) { diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index c3c96c5869ed..97bd940a056a 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 Intel Corporation. 
+ * Copyright(c) 2017 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -198,8 +198,8 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, goto free_desc; tx->retry_count = 0; - ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, - vnic_sdma->pkts_sent); + ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), + &tx->txreq, vnic_sdma->pkts_sent); /* When -ECOMM, sdma callback will be called with ABORT status */ if (unlikely(ret && unlikely(ret != -ECOMM))) goto free_desc; @@ -230,13 +230,13 @@ tx_err: * become available. */ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait, struct hfi1_vnic_sdma, wait); + container_of(wait->iow, struct hfi1_vnic_sdma, wait); struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev; struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); @@ -247,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; write_seqlock(&dev->iowait_lock); if (list_empty(&vnic_sdma->wait.list)) - iowait_queue(pkts_sent, wait, &sde->dmawait); + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; } @@ -285,7 +285,8 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) for (i = 0; i < vinfo->num_tx_q; i++) { struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; - iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep, + iowait_init(&vnic_sdma->wait, 0, NULL, NULL, + hfi1_vnic_sdma_sleep, hfi1_vnic_sdma_wakeup, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; @@ -295,10 +296,12 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) /* Add a free descriptor watermark for wakeups */ if 
(vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + struct iowait_work *work; + INIT_LIST_HEAD(&vnic_sdma->stx.list); vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; - list_add_tail(&vnic_sdma->stx.list, - &vnic_sdma->wait.tx_head); + work = iowait_get_ib_work(&vnic_sdma->wait); + list_add_tail(&vnic_sdma->stx.list, &work->tx_head); } } } |