From 82a979265638c505e12fbe7ba40980dc0901436d Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl"
Date: Thu, 1 Feb 2018 10:43:42 -0800
Subject: IB/hfi1: Re-order IRQ cleanup to address driver cleanup race

The pci_request_irq() interface always adds the IRQF_SHARED bit to all
IRQ requests. When the kernel is built with the CONFIG_DEBUG_SHIRQ config
flag and the IRQF_SHARED bit is set, __free_irq() calls the IRQ handler
to exercise the race between IRQ cleanup and an interrupt arriving during
that cleanup. The HFI driver should be able to handle this race, but does
not.

This race can cause traces that start with this footprint:

BUG: unable to handle kernel NULL pointer dereference at (null)
Call Trace:
 ...
 __free_irq+0x1b3/0x2d0
 free_irq+0x35/0x70
 pci_free_irq+0x1c/0x30
 clean_up_interrupts+0x53/0xf0 [hfi1]
 hfi1_start_cleanup+0x122/0x190 [hfi1]
 postinit_cleanup+0x1d/0x280 [hfi1]
 remove_one+0x233/0x250 [hfi1]
 pci_device_remove+0x39/0xc0

Export the IRQ cleanup function so it can be called from other modules.

Using the exported cleanup function:

Re-order the driver cleanup code to clean up IRQ resources before other
resources, eliminating the race.

Re-order the init error path so that the race does not occur.

Reduce the severity of the spurious error message for SDMA IRQs to info.

Reviewed-by: Alex Estrin
Reviewed-by: Patel Jay P
Reviewed-by: Mike Marciniszyn
Signed-off-by: Michael J. Ruhl
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/init.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/infiniband/hw/hfi1/init.c')

diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 9b128268fb28..c676d1cb8755 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -1058,8 +1058,9 @@ static void shutdown_device(struct hfi1_devdata *dd)
 	}
 	dd->flags &= ~HFI1_INITTED;

-	/* mask interrupts, but not errors */
+	/* mask and clean up interrupts, but not errors */
 	set_intr_state(dd, 0);
+	hfi1_clean_up_interrupts(dd);

 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
@@ -1704,6 +1705,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

 	if (initfail || ret) {
+		hfi1_clean_up_interrupts(dd);
 		stop_timers(dd);
 		flush_workqueue(ib_wq);
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
-- cgit v1.2.3

From 473291b3ea0e1df81f7abf13b8ab4b98a346df5e Mon Sep 17 00:00:00 2001
From: Alex Estrin
Date: Thu, 1 Feb 2018 10:43:50 -0800
Subject: IB/hfi1: Fix for early release of sdma context

With the IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled, module
removal may result in a panic in the sdma_interrupt() routine if the
associated sdma context was released before pci_free_irq():

[ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1]
[ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0
[ 9198.941783] Oops: 0000 [#1] SMP .....
[ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J Ruhl Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 1 + drivers/infiniband/hw/hfi1/sdma.c | 13 +++++++------ drivers/infiniband/hw/hfi1/sdma.h | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/init.c') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index c676d1cb8755..4c51a75b238a 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1219,6 +1219,7 @@ static void __hfi1_free_devdata(struct kobject *kobj) free_percpu(dd->rcv_limit); free_percpu(dd->send_schedule); free_percpu(dd->tx_opstats); + sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 31c8f89b5fc8..37424a81a7c9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1276,13 +1276,15 @@ bail: return -ENOMEM; } -/* - * Clean up allocated memory. 
- * - * This routine is can be called regardless of the success of sdma_init() +/** + * sdma_clean() Clean up allocated memory + * @dd: struct hfi1_devdata + * @num_engines: num sdma engines * + * This routine can be called regardless of the success of + * sdma_init() */ -static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) { size_t i; struct sdma_engine *sde; @@ -1618,7 +1620,6 @@ void sdma_exit(struct hfi1_devdata *dd) */ sdma_finalput(&sde->state); } - sdma_clean(dd, dd->num_sdma); } /* diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 374c59784950..46c775f255d1 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -420,6 +420,7 @@ struct sdma_engine { int sdma_init(struct hfi1_devdata *dd, u8 port); void sdma_start(struct hfi1_devdata *dd); void sdma_exit(struct hfi1_devdata *dd); +void sdma_clean(struct hfi1_devdata *dd, size_t num_engines); void sdma_all_running(struct hfi1_devdata *dd); void sdma_all_idle(struct hfi1_devdata *dd); void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); -- cgit v1.2.3 From 0719007663ce2d5da653ec1dc3bcfe2ab681b964 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Thu, 1 Feb 2018 10:52:28 -0800 Subject: IB/hfi1: Convert PortXmitWait/PortVLXmitWait counters to flit times HFI's counters SendWaitCnt and SendWaitVlCnt are in units of TXE cycle time (at 805MHz). OPA counters PortXmitWait and PortVLXmtWait are in units of flit times. Convert the counter values to flit units using following conversion formula: PortXmitWait = SendWaitCnt * 2 * (4 /link_width) * (25 Gbps /link_speed) PortVLXmitWait = SendWaitVLCnt * 2 * (4 /link_width) * (25 Gbps /link_speed) At link up or downgrade events, the link width can change. To ensure accurate counter calculations, sample the counters after the events, during counter requests, and then aggregate the OPA counters. Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 64 +++++++++++++++++-- drivers/infiniband/hw/hfi1/chip.h | 4 +- drivers/infiniband/hw/hfi1/hfi.h | 7 +++ drivers/infiniband/hw/hfi1/init.c | 9 +++ drivers/infiniband/hw/hfi1/mad.c | 127 +++++++++++++++++++++++++++++++++++--- drivers/infiniband/hw/hfi1/mad.h | 47 +++++++++++++- 6 files changed, 239 insertions(+), 19 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/init.c') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 57f0df2f84f2..e6a60fa59f2b 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1083,6 +1083,7 @@ static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms); static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index); +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width); /* * Error interrupt table entry. This is used as input to the interrupt @@ -6905,6 +6906,32 @@ void handle_freeze(struct work_struct *work) /* no longer frozen */ } +/** + * update_xmit_counters - update PortXmitWait/PortVlXmitWait + * counters. 
+ * @ppd: info of physical Hfi port + * @link_width: new link width after link up or downgrade + * + * Update the PortXmitWait and PortVlXmitWait counters after + * a link up or downgrade event to reflect a link width change. + */ +static void update_xmit_counters(struct hfi1_pportdata *ppd, u16 link_width) +{ + int i; + u16 tx_width; + u16 link_speed; + + tx_width = tx_link_width(link_width); + link_speed = get_link_speed(ppd->link_speed_active); + + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) + get_xmit_wait_counters(ppd, tx_width, link_speed, i); +} + /* * Handle a link up interrupt from the 8051. * @@ -7526,18 +7553,29 @@ void handle_verify_cap(struct work_struct *work) set_link_state(ppd, HLS_GOING_UP); } -/* - * Apply the link width downgrade enabled policy against the current active - * link widths. +/** + * apply_link_downgrade_policy - Apply the link width downgrade enabled + * policy against the current active link widths. + * @ppd: info of physical Hfi port + * @refresh_widths: True indicates link downgrade event + * @return: True indicates a successful link downgrade. False indicates + * link downgrade event failed and the link will bounce back to + * default link width. * - * Called when the enabled policy changes or the active link widths change. + * Called when the enabled policy changes or the active link widths + * change. + * Refresh_widths indicates that a link downgrade occurred. The + * link_downgraded variable is set by refresh_widths and + * determines the success/failure of the policy application. */ -void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths) +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths) { int do_bounce = 0; int tries; u16 lwde; u16 tx, rx; + bool link_downgraded = refresh_widths; /* use the hls lock to avoid a race with actual link up */ tries = 0; @@ -7571,6 +7609,7 @@ retry: ppd->link_width_downgrade_rx_active == 0) { /* the 8051 reported a dead link as a downgrade */ dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + link_downgraded = false; } else if (lwde == 0) { /* downgrade is disabled */ @@ -7587,6 +7626,7 @@ retry: ppd->link_width_downgrade_tx_active, ppd->link_width_downgrade_rx_active); do_bounce = 1; + link_downgraded = false; } } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 || (lwde & ppd->link_width_downgrade_rx_active) == 0) { @@ -7598,6 +7638,7 @@ retry: lwde, ppd->link_width_downgrade_tx_active, ppd->link_width_downgrade_rx_active); do_bounce = 1; + link_downgraded = false; } done: @@ -7609,6 +7650,8 @@ done: set_link_state(ppd, HLS_DN_OFFLINE); start_link(ppd); } + + return link_downgraded; } /* @@ -7622,7 +7665,8 @@ void handle_link_downgrade(struct work_struct *work) link_downgrade_work); dd_dev_info(ppd->dd, "8051: Link width downgrade\n"); - apply_link_downgrade_policy(ppd, 1); + if (apply_link_downgrade_policy(ppd, true)) + update_xmit_counters(ppd, ppd->link_width_downgrade_tx_active); } static char *dcc_err_string(char *buf, int buf_len, u64 flags) @@ -10597,6 +10641,14 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); handle_linkup_change(dd, 1); + + /* + * After link up, a new link width will have been set. + * Update the xmit counters with regards to the new + * link width. 
+ */ + update_xmit_counters(ppd, ppd->link_width_active); + ppd->host_link_state = HLS_UP_INIT; update_statusp(ppd, IB_PORT_INIT); break; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 21fca8ec5076..c0d70f255050 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -736,8 +736,8 @@ int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *); int start_link(struct hfi1_pportdata *ppd); int bringup_serdes(struct hfi1_pportdata *ppd); void set_intr_state(struct hfi1_devdata *dd, u32 enable); -void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, - int refresh_widths); +bool apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + bool refresh_widths); void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, u32 intr_adjust, u32 npkts); int stop_drain_data_vls(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 105d11dcc554..90bc8c76d2ca 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -858,6 +858,13 @@ struct hfi1_pportdata { struct work_struct linkstate_active_work; /* Does this port need to prescan for FECNs */ bool cc_prescan; + /* + * Sample sendWaitCnt & sendWaitVlCnt during link transition + * and counter request. + */ + u64 port_vl_xmit_wait_last[C_VL_COUNT + 1]; + u16 prev_link_width; + u64 vl_xmit_flit_cnt[C_VL_COUNT + 1]; }; typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4c51a75b238a..8c4f04032b28 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -637,6 +637,15 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->dd = dd; ppd->hw_pidx = hw_pidx; ppd->port = port; /* IB port number, not index */ + ppd->prev_link_width = LINK_WIDTH_DEFAULT; + /* + * There are C_VL_COUNT number of PortVLXmitWait counters. + * Adding 1 to C_VL_COUNT to include the PortXmitWait counter. + */ + for (i = 0; i < C_VL_COUNT + 1; i++) { + ppd->port_vl_xmit_wait_last[i] = 0; + ppd->vl_xmit_flit_cnt[i] = 0; + } default_pkey_idx = 1; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 34547a48a445..e9962c65c68f 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -2649,6 +2649,79 @@ static void a0_portstatus(struct hfi1_pportdata *ppd, } } +/** + * tx_link_width - convert link width bitmask to integer + * value representing actual link width. + * @link_width: width of active link + * @return: return index of the bit set in link_width var + * + * The function convert and return the index of bit set + * that indicate the current link width. + */ +u16 tx_link_width(u16 link_width) +{ + int n = LINK_WIDTH_DEFAULT; + u16 tx_width = n; + + while (link_width && n) { + if (link_width & (1 << (n - 1))) { + tx_width = n; + break; + } + n--; + } + + return tx_width; +} + +/** + * get_xmit_wait_counters - Convert HFI 's SendWaitCnt/SendWaitVlCnt + * counter in unit of TXE cycle times to flit times. + * @ppd: info of physical Hfi port + * @link_width: width of active link + * @link_speed: speed of active link + * @vl: represent VL0-VL7, VL15 for PortVLXmitWait counters request + * and if vl value is C_VL_COUNT, it represent SendWaitCnt + * counter request + * @return: return SendWaitCnt/SendWaitVlCnt counter value per vl. 
+ * + * Convert SendWaitCnt/SendWaitVlCnt counter from TXE cycle times to + * flit times. Call this function to samples these counters. This + * function will calculate for previous state transition and update + * current state at end of function using ppd->prev_link_width and + * ppd->port_vl_xmit_wait_last to port_vl_xmit_wait_curr and link_width. + */ +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, + u16 link_width, u16 link_speed, int vl) +{ + u64 port_vl_xmit_wait_curr; + u64 delta_vl_xmit_wait; + u64 xmit_wait_val; + + if (vl > C_VL_COUNT) + return 0; + if (vl < C_VL_COUNT) + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT_VL, vl); + else + port_vl_xmit_wait_curr = + read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL); + + xmit_wait_val = + port_vl_xmit_wait_curr - + ppd->port_vl_xmit_wait_last[vl]; + delta_vl_xmit_wait = + convert_xmit_counter(xmit_wait_val, + ppd->prev_link_width, + link_speed); + + ppd->vl_xmit_flit_cnt[vl] += delta_vl_xmit_wait; + ppd->port_vl_xmit_wait_last[vl] = port_vl_xmit_wait_curr; + ppd->prev_link_width = link_width; + + return ppd->vl_xmit_flit_cnt[vl]; +} + static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, struct ib_device *ibdev, u8 port, u32 *resp_len) @@ -2668,6 +2741,8 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); int vfi; u64 tmp, tmp2; + u16 link_width; + u16 link_speed; response_data_size = sizeof(struct opa_port_status_rsp) + num_vls * sizeof(struct _vls_pctrs); @@ -2711,8 +2786,16 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, rsp->port_multicast_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL)); + /* + * Convert PortXmitWait counter from TXE cycle times + * to flit times. + */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); rsp->port_xmit_wait = - cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); rsp->port_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); rsp->port_rcv_becn = @@ -2777,10 +2860,14 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, rsp->vls[vfi].port_vl_xmit_pkts = cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl))); - + /* + * Convert PortVlXmitWait counter from TXE cycle + * times to flit times. + */ rsp->vls[vfi].port_vl_xmit_wait = - cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, - idx_from_vl(vl))); + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, @@ -2910,6 +2997,8 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, unsigned long vl; u32 vl_select_mask; int vfi; + u16 link_width; + u16 link_speed; num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); @@ -2959,8 +3048,16 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, rsp->link_quality_indicator = cpu_to_be32((u32)lq); pma_get_opa_port_dctrs(ibdev, rsp); + /* + * Convert PortXmitWait counter from TXE + * cycle times to flit times. 
+ */ + link_width = + tx_link_width(ppd->link_width_downgrade_tx_active); + link_speed = get_link_speed(ppd->link_speed_active); rsp->port_xmit_wait = - cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, C_VL_COUNT)); rsp->port_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); rsp->port_rcv_becn = @@ -2996,9 +3093,14 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl))); + /* + * Convert PortVlXmitWait counter from TXE + * cycle times to flit times. + */ rsp->vls[vfi].port_vl_xmit_wait = - cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, - idx_from_vl(vl))); + cpu_to_be64(get_xmit_wait_counters(ppd, link_width, + link_speed, + idx_from_vl(vl))); rsp->vls[vfi].port_vl_rcv_fecn = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, @@ -3416,9 +3518,11 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_PORT_MCAST_RCV_PKTS) write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0); - if (counter_select & CS_PORT_XMIT_WAIT) + if (counter_select & CS_PORT_XMIT_WAIT) { write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0); - + ppd->port_vl_xmit_wait_last[C_VL_COUNT] = 0; + ppd->vl_xmit_flit_cnt[C_VL_COUNT] = 0; + } /* ignore cs_sw_portCongestion for HFIs */ if (counter_select & CS_PORT_RCV_FECN) @@ -3491,8 +3595,11 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, if (counter_select & CS_PORT_RCV_PKTS) write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0); - if (counter_select & CS_PORT_XMIT_WAIT) + if (counter_select & CS_PORT_XMIT_WAIT) { write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0); + ppd->port_vl_xmit_wait_last[idx_from_vl(vl)] = 0; + ppd->vl_xmit_flit_cnt[idx_from_vl(vl)] = 0; + } /* sw_port_vl_congestion is 0 for HFIs */ if (counter_select & CS_PORT_RCV_FECN) diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index c4938f3d97c8..2f48e6953629 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -180,6 +180,15 @@ struct opa_mad_notice_attr { #define OPA_VLARB_PREEMPT_MATRIX 3 #define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) +#define LINK_SPEED_25G 1 +#define LINK_SPEED_12_5G 2 +#define LINK_WIDTH_DEFAULT 4 +#define DECIMAL_FACTORING 1000 +/* + * The default link width is multiplied by 1000 + * to get accurate value after division. + */ +#define FACTOR_LINK_WIDTH (LINK_WIDTH_DEFAULT * DECIMAL_FACTORING) struct ib_pma_portcounters_cong { u8 reserved; @@ -429,5 +438,41 @@ struct sc2vlnt { void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); void hfi1_handle_trap_timer(struct timer_list *t); - +u16 tx_link_width(u16 link_width); +u64 get_xmit_wait_counters(struct hfi1_pportdata *ppd, u16 link_width, + u16 link_speed, int vl); +/** + * get_link_speed - determine whether 12.5G or 25G speed + * @link_speed: the speed of active link + * @return: Return 2 if link speed identified as 12.5G + * or return 1 if link speed is 25G. + * + * The function indirectly calculate required link speed + * value for convert_xmit_counter function. If the link + * speed is 25G, the function return as 1 as it is required + * by xmit counter conversion formula :-( 25G / link_speed). + * This conversion will provide value 1 if current + * link speed is 25G or 2 if 12.5G.This is done to avoid + * 12.5 float number conversion. + */ +static inline u16 get_link_speed(u16 link_speed) +{ + return (link_speed == 1) ? 
+ LINK_SPEED_12_5G : LINK_SPEED_25G; +} + +/** + * convert_xmit_counter - calculate flit times for given xmit counter + * value + * @xmit_wait_val: current xmit counter value + * @link_width: width of active link + * @link_speed: speed of active link + * @return: return xmit counter value in flit times. + */ +static inline u64 convert_xmit_counter(u64 xmit_wait_val, u16 link_width, + u16 link_speed) +{ + return (xmit_wait_val * 2 * (FACTOR_LINK_WIDTH / link_width) + * link_speed) / DECIMAL_FACTORING; +} #endif /* _HFI1_MAD_H */ -- cgit v1.2.3 From 953a9cebeab43f33baed79d1a9ef643bfb249c4b Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Thu, 1 Feb 2018 12:37:30 -0800 Subject: IB/hfi1: Convert kzalloc_node and kcalloc to use kcalloc_node Kzalloc_node API doesn't check for overflows in size multiplication. While kcalloc API check for overflows in size multiplication but these implementations are not NUMA-aware. This conversion allowed for correcting an allocation used in the hot path to be on the local NUMA and ensure us overflow free multiplication for the size of a memory allocation. Reviewed-by: Mike Marciniszyn Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 17 +++++++++-------- drivers/infiniband/hw/hfi1/pio.c | 15 ++++++++------- drivers/infiniband/hw/hfi1/sdma.c | 3 ++- 3 files changed, 19 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/hw/hfi1/init.c') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 8c4f04032b28..33eba2356742 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -172,7 +172,7 @@ int hfi1_create_kctxts(struct hfi1_devdata *dd) u16 i; int ret; - dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd), + dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL, dd->node); if (!dd->rcd) return -ENOMEM; @@ -439,15 +439,16 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, * The resulting value will be rounded down to the closest * multiple of dd->rcv_entries.group_size. */ - rcd->egrbufs.buffers = kzalloc_node( - rcd->egrbufs.count * sizeof(*rcd->egrbufs.buffers), - GFP_KERNEL, numa); + rcd->egrbufs.buffers = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.buffers), + GFP_KERNEL, numa); if (!rcd->egrbufs.buffers) goto bail; - rcd->egrbufs.rcvtids = kzalloc_node( - rcd->egrbufs.count * - sizeof(*rcd->egrbufs.rcvtids), - GFP_KERNEL, numa); + rcd->egrbufs.rcvtids = + kcalloc_node(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.rcvtids), + GFP_KERNEL, numa); if (!rcd->egrbufs.rcvtids) goto bail; rcd->egrbufs.size = eager_buffer_size; diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 4c1198bc5e70..40dac4d16eb8 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -455,8 +455,8 @@ int init_send_contexts(struct hfi1_devdata *dd) dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8), GFP_KERNEL); dd->send_contexts = kcalloc(dd->num_send_contexts, - sizeof(struct send_context_info), - GFP_KERNEL); + sizeof(struct send_context_info), + GFP_KERNEL); if (!dd->send_contexts || !dd->hw_to_sw) { kfree(dd->hw_to_sw); kfree(dd->send_contexts); @@ -856,8 +856,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, * so head == tail can mean empty. 
*/ sc->sr_size = sci->credits + 1; - sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) * - sc->sr_size, GFP_KERNEL, numa); + sc->sr = kcalloc_node(sc->sr_size, + sizeof(union pio_shadow_ring), + GFP_KERNEL, numa); if (!sc->sr) { sc_free(sc); return NULL; @@ -1958,9 +1959,9 @@ int init_pervl_scs(struct hfi1_devdata *dd) hfi1_init_ctxt(dd->vld[15].sc); dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); - dd->kernel_send_context = kzalloc_node(dd->num_send_contexts * - sizeof(struct send_context *), - GFP_KERNEL, dd->node); + dd->kernel_send_context = kcalloc_node(dd->num_send_contexts, + sizeof(struct send_context *), + GFP_KERNEL, dd->node); if (!dd->kernel_send_context) goto freesc15; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 37424a81a7c9..d8ddbfdf3a4d 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1389,7 +1389,8 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) num_engines, descq_cnt); /* alloc memory for array of send engines */ - dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL); + dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma), + GFP_KERNEL, dd->node); if (!dd->per_sdma) return ret; -- cgit v1.2.3
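
For readers following the PortXmitWait/PortVLXmitWait conversion introduced in the flit-times patch above, here is a minimal standalone C sketch of the same fixed-point math. It reuses the constants and the convert_xmit_counter() logic that the patch adds to mad.h; the main() harness and the sample counter values are illustrative assumptions, not part of the driver.

/*
 * Standalone sketch (not driver code) of the conversion:
 *
 *   PortXmitWait = SendWaitCnt * 2 * (4 / link_width) * (25 Gbps / link_speed)
 *
 * using the constants the patch adds to mad.h.  The sample counter values
 * below are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define LINK_SPEED_25G		1	/* encodes 25 Gb/s / 25 Gb/s   */
#define LINK_SPEED_12_5G	2	/* encodes 25 Gb/s / 12.5 Gb/s */
#define LINK_WIDTH_DEFAULT	4
#define DECIMAL_FACTORING	1000
#define FACTOR_LINK_WIDTH	(LINK_WIDTH_DEFAULT * DECIMAL_FACTORING)

/* Same fixed-point math as the convert_xmit_counter() inline in mad.h. */
static uint64_t convert_xmit_counter(uint64_t xmit_wait_val, uint16_t link_width,
				     uint16_t link_speed)
{
	return (xmit_wait_val * 2 * (FACTOR_LINK_WIDTH / link_width) *
		link_speed) / DECIMAL_FACTORING;
}

int main(void)
{
	/* Hypothetical TXE-cycle counts sampled at two counter requests. */
	uint64_t last = 1000;
	uint64_t curr = 5000;
	uint64_t delta = curr - last;

	/* Full 4x width at 25 Gb/s: one TXE cycle counts as 2 flit times. */
	printf("4x/25G  : %llu flit times\n",
	       (unsigned long long)convert_xmit_counter(delta, 4, LINK_SPEED_25G));

	/* After a downgrade to 2x at 12.5 Gb/s the same delta is worth 4x more. */
	printf("2x/12.5G: %llu flit times\n",
	       (unsigned long long)convert_xmit_counter(delta, 2, LINK_SPEED_12_5G));

	return 0;
}

Encoding 25 Gb/s as 1 and 12.5 Gb/s as 2 keeps the 25/link_speed ratio an integer, and scaling the 4/link_width ratio by DECIMAL_FACTORING avoids losing precision in the integer division. The driver converts each sampled delta with the previous link width and accumulates the result in ppd->vl_xmit_flit_cnt[], so the OPA counters stay in flit units even when the link width changes between samples.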