From dbe678f6192f27879ac9ff6bc7a1036aad85aae9 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 18 May 2023 11:49:45 -0400 Subject: usb: cdns3: fix NCM gadget RX speed 20x slow than expection at iMX8QM At iMX8QM platform, enable NCM gadget and run 'iperf3 -s'. At host, run 'iperf3 -V -c fe80::6863:98ff:feef:3e0%enxc6e147509498' [ 5] 0.00-1.00 sec 1.55 MBytes 13.0 Mbits/sec 90 4.18 KBytes [ 5] 1.00-2.00 sec 1.44 MBytes 12.0 Mbits/sec 75 4.18 KBytes [ 5] 2.00-3.00 sec 1.48 MBytes 12.4 Mbits/sec 75 4.18 KBytes Expected speed should be bigger than 300Mbits/sec. The root cause of this performance drop was found to be data corruption happening at 4K borders in some Ethernet packets, leading to TCP checksum errors. This corruption occurs from the position (4K - (address & 0x7F)) to 4K. The u_ether function's allocation of skb_buff reserves 64B, meaning all RX addresses resemble 0xXXXX0040. Force trb_burst_size to 16 can fix this problem. Cc: stable@vger.kernel.org Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20230518154946.3666662-1-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/usb/cdns3/cdns3-gadget.c') diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index ccfaebca6faa..1dcadef933e3 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -2097,6 +2097,19 @@ int cdns3_ep_config(struct cdns3_endpoint *priv_ep, bool enable) else priv_ep->trb_burst_size = 16; + /* + * In versions preceding DEV_VER_V2, for example, iMX8QM, there exit the bugs + * in the DMA. These bugs occur when the trb_burst_size exceeds 16 and the + * address is not aligned to 128 Bytes (which is a product of the 64-bit AXI + * and AXI maximum burst length of 16 or 0xF+1, dma_axi_ctrl0[3:0]). This + * results in data corruption when it crosses the 4K border. The corruption + * specifically occurs from the position (4K - (address & 0x7F)) to 4K. + * + * So force trb_burst_size to 16 at such platform. + */ + if (priv_dev->dev_ver < DEV_VER_V2) + priv_ep->trb_burst_size = 16; + mult = min_t(u8, mult, EP_CFG_MULT_MAX); buffering = min_t(u8, buffering, EP_CFG_BUFFERING_MAX); maxburst = min_t(u8, maxburst, EP_CFG_MAXBURST_MAX); -- cgit v1.2.3 From 2a1c4639d6d6bcee27f74e38f83ffb43579c4733 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 18 May 2023 16:49:45 -0400 Subject: usb: cdns3: improve handling of unaligned address case When the address of a request was not aligned with an 8-byte boundary, the USB DMA was unable to process it, necessitating the use of an internal bounce buffer. In these cases, the request->buf had to be copied to/from this bounce buffer. However, if this unaligned address scenario arises, it is unnecessary to perform heavy cache maintenance operations like usb_gadget_map(unmap)_request_by_dev() on the request->buf, as the DMA does not utilize it at all. it can be skipped at this case. iperf3 tests on the rndis case: Transmit speed (TX): Improved from 299Mbps to 440Mbps Receive speed (RX): Improved from 290Mbps to 500Mbps Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20230518204947.3770236-1-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/usb/cdns3/cdns3-gadget.c') diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index ccfaebca6faa..d2cedd6db88f 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -800,7 +800,8 @@ void cdns3_gadget_giveback(struct cdns3_endpoint *priv_ep, if (request->status == -EINPROGRESS) request->status = status; - usb_gadget_unmap_request_by_dev(priv_dev->sysdev, request, + if (likely(!(priv_req->flags & REQUEST_UNALIGNED))) + usb_gadget_unmap_request_by_dev(priv_dev->sysdev, request, priv_ep->dir); if ((priv_req->flags & REQUEST_UNALIGNED) && @@ -2530,10 +2531,12 @@ static int __cdns3_gadget_ep_queue(struct usb_ep *ep, if (ret < 0) return ret; - ret = usb_gadget_map_request_by_dev(priv_dev->sysdev, request, + if (likely(!(priv_req->flags & REQUEST_UNALIGNED))) { + ret = usb_gadget_map_request_by_dev(priv_dev->sysdev, request, usb_endpoint_dir_in(ep->desc)); - if (ret) - return ret; + if (ret) + return ret; + } list_add_tail(&request->list, &priv_ep->deferred_req_list); -- cgit v1.2.3 From 3124387537bc94251e65c2841062d14736380ec4 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 18 May 2023 16:49:46 -0400 Subject: usb: cdns3: optimize OUT transfer by copying only actual received data Previously, the entire length of the request, which is equal to or greater than the actual data, was dma synced and memcpy when using the bounce buffer. Actually only the actual data indicated by request->actual need be synced and copied. Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20230518204947.3770236-2-Frank.Li@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/usb/cdns3/cdns3-gadget.c') diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index d2cedd6db88f..27199198cc26 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -809,10 +809,10 @@ void cdns3_gadget_giveback(struct cdns3_endpoint *priv_ep, /* Make DMA buffer CPU accessible */ dma_sync_single_for_cpu(priv_dev->sysdev, priv_req->aligned_buf->dma, - priv_req->aligned_buf->size, + request->actual, priv_req->aligned_buf->dir); memcpy(request->buf, priv_req->aligned_buf->buf, - request->length); + request->actual); } priv_req->flags &= ~(REQUEST_PENDING | REQUEST_UNALIGNED); -- cgit v1.2.3