diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-03 00:38:27 +0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-03 00:38:27 +0400 |
commit | aecdc33e111b2c447b622e287c6003726daa1426 (patch) | |
tree | 3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /drivers/net/ethernet/sfc | |
parent | a20acf99f75e49271381d65db097c9763060a1e8 (diff) | |
parent | a3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff) | |
download | linux-aecdc33e111b2c447b622e287c6003726daa1426.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
1) GRE now works over ipv6, from Dmitry Kozlov.
2) Make SCTP more network namespace aware, from Eric Biederman.
3) TEAM driver now works with non-ethernet devices, from Jiri Pirko.
4) Make openvswitch network namespace aware, from Pravin B Shelar.
5) IPV6 NAT implementation, from Patrick McHardy.
6) Server side support for TCP Fast Open, from Jerry Chu and others.
7) Packet BPF filter supports MOD and XOR, from Eric Dumazet and Daniel
Borkmann.
8) Increate the loopback default MTU to 64K, from Eric Dumazet.
9) Use a per-task rather than per-socket page fragment allocator for
outgoing networking traffic. This benefits processes that have very
many mostly idle sockets, which is quite common.
From Eric Dumazet.
10) Use up to 32K for page fragment allocations, with fallbacks to
smaller sizes when higher order page allocations fail. Benefits are
a) less segments for driver to process b) less calls to page
allocator c) less waste of space.
From Eric Dumazet.
11) Allow GRO to be used on GRE tunnels, from Eric Dumazet.
12) VXLAN device driver, one way to handle VLAN issues such as the
limitation of 4096 VLAN IDs yet still have some level of isolation.
From Stephen Hemminger.
13) As usual there is a large boatload of driver changes, with the scale
perhaps tilted towards the wireless side this time around.
Fix up various fairly trivial conflicts, mostly caused by the user
namespace changes.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1012 commits)
hyperv: Add buffer for extended info after the RNDIS response message.
hyperv: Report actual status in receive completion packet
hyperv: Remove extra allocated space for recv_pkt_list elements
hyperv: Fix page buffer handling in rndis_filter_send_request()
hyperv: Fix the missing return value in rndis_filter_set_packet_filter()
hyperv: Fix the max_xfer_size in RNDIS initialization
vxlan: put UDP socket in correct namespace
vxlan: Depend on CONFIG_INET
sfc: Fix the reported priorities of different filter types
sfc: Remove EFX_FILTER_FLAG_RX_OVERRIDE_IP
sfc: Fix loopback self-test with separate_tx_channels=1
sfc: Fix MCDI structure field lookup
sfc: Add parentheses around use of bitfield macro arguments
sfc: Fix null function pointer in efx_sriov_channel_type
vxlan: virtual extensible lan
igmp: export symbol ip_mc_leave_group
netlink: add attributes to fdb interface
tg3: unconditionally select HWMON support when tg3 is enabled.
Revert "net: ti cpsw ethernet: allow reading phy interface mode from DT"
gre: fix sparse warning
...
Diffstat (limited to 'drivers/net/ethernet/sfc')
-rw-r--r-- | drivers/net/ethernet/sfc/Kconfig | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/Makefile | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/bitfield.h | 22 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/efx.c | 250 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/efx.h | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/ethtool.c | 16 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/falcon_boards.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/filter.c | 108 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/filter.h | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/mcdi.c | 49 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/mcdi.h | 12 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/mcdi_pcol.h | 29 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/mtd.c | 7 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/net_driver.h | 78 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/nic.c | 6 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/nic.h | 36 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/ptp.c | 1484 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/rx.c | 20 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/selftest.c | 3 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/siena.c | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/siena_sriov.c | 8 | ||||
-rw-r--r-- | drivers/net/ethernet/sfc/tx.c | 627 |
22 files changed, 2152 insertions, 622 deletions
diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig index fb3cbc27063c..25906c1d1b15 100644 --- a/drivers/net/ethernet/sfc/Kconfig +++ b/drivers/net/ethernet/sfc/Kconfig @@ -34,3 +34,10 @@ config SFC_SRIOV This enables support for the SFC9000 I/O Virtualization features, allowing accelerated network performance in virtualized environments. +config SFC_PTP + bool "Solarflare SFC9000-family PTP support" + depends on SFC && PTP_1588_CLOCK && !(SFC=y && PTP_1588_CLOCK=m) + default y + ---help--- + This enables support for the Precision Time Protocol (PTP) + on SFC9000-family NICs diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile index ea1f8db57318..e11f2ecf69d9 100644 --- a/drivers/net/ethernet/sfc/Makefile +++ b/drivers/net/ethernet/sfc/Makefile @@ -5,5 +5,6 @@ sfc-y += efx.o nic.o falcon.o siena.o tx.o rx.o filter.o \ mcdi.o mcdi_phy.o mcdi_mon.o sfc-$(CONFIG_SFC_MTD) += mtd.o sfc-$(CONFIG_SFC_SRIOV) += siena_sriov.o +sfc-$(CONFIG_SFC_PTP) += ptp.o obj-$(CONFIG_SFC) += sfc.o diff --git a/drivers/net/ethernet/sfc/bitfield.h b/drivers/net/ethernet/sfc/bitfield.h index b26a954c27fc..5400a33f254f 100644 --- a/drivers/net/ethernet/sfc/bitfield.h +++ b/drivers/net/ethernet/sfc/bitfield.h @@ -120,10 +120,10 @@ typedef union efx_oword { * [0,high-low), with garbage in bits [high-low+1,...). */ #define EFX_EXTRACT_NATIVE(native_element, min, max, low, high) \ - (((low > max) || (high < min)) ? 0 : \ - ((low > min) ? \ - ((native_element) >> (low - min)) : \ - ((native_element) << (min - low)))) + ((low) > (max) || (high) < (min) ? 0 : \ + (low) > (min) ? \ + (native_element) >> ((low) - (min)) : \ + (native_element) << ((min) - (low))) /* * Extract bit field portion [low,high) from the 64-bit little-endian @@ -142,27 +142,27 @@ typedef union efx_oword { #define EFX_EXTRACT_OWORD64(oword, low, high) \ ((EFX_EXTRACT64((oword).u64[0], 0, 63, low, high) | \ EFX_EXTRACT64((oword).u64[1], 64, 127, low, high)) & \ - EFX_MASK64(high + 1 - low)) + EFX_MASK64((high) + 1 - (low))) #define EFX_EXTRACT_QWORD64(qword, low, high) \ (EFX_EXTRACT64((qword).u64[0], 0, 63, low, high) & \ - EFX_MASK64(high + 1 - low)) + EFX_MASK64((high) + 1 - (low))) #define EFX_EXTRACT_OWORD32(oword, low, high) \ ((EFX_EXTRACT32((oword).u32[0], 0, 31, low, high) | \ EFX_EXTRACT32((oword).u32[1], 32, 63, low, high) | \ EFX_EXTRACT32((oword).u32[2], 64, 95, low, high) | \ EFX_EXTRACT32((oword).u32[3], 96, 127, low, high)) & \ - EFX_MASK32(high + 1 - low)) + EFX_MASK32((high) + 1 - (low))) #define EFX_EXTRACT_QWORD32(qword, low, high) \ ((EFX_EXTRACT32((qword).u32[0], 0, 31, low, high) | \ EFX_EXTRACT32((qword).u32[1], 32, 63, low, high)) & \ - EFX_MASK32(high + 1 - low)) + EFX_MASK32((high) + 1 - (low))) #define EFX_EXTRACT_DWORD(dword, low, high) \ (EFX_EXTRACT32((dword).u32[0], 0, 31, low, high) & \ - EFX_MASK32(high + 1 - low)) + EFX_MASK32((high) + 1 - (low))) #define EFX_OWORD_FIELD64(oword, field) \ EFX_EXTRACT_OWORD64(oword, EFX_LOW_BIT(field), \ @@ -442,10 +442,10 @@ typedef union efx_oword { cpu_to_le32(EFX_INSERT_NATIVE(min, max, low, high, value)) #define EFX_INPLACE_MASK64(min, max, low, high) \ - EFX_INSERT64(min, max, low, high, EFX_MASK64(high + 1 - low)) + EFX_INSERT64(min, max, low, high, EFX_MASK64((high) + 1 - (low))) #define EFX_INPLACE_MASK32(min, max, low, high) \ - EFX_INSERT32(min, max, low, high, EFX_MASK32(high + 1 - low)) + EFX_INSERT32(min, max, low, high, EFX_MASK32((high) + 1 - (low))) #define EFX_SET_OWORD64(oword, low, high, value) do { \ (oword).u64[0] = (((oword).u64[0] \ diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 65a8d49106a4..96bd980e828d 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -202,11 +202,21 @@ static void efx_stop_all(struct efx_nic *efx); #define EFX_ASSERT_RESET_SERIALISED(efx) \ do { \ - if ((efx->state == STATE_RUNNING) || \ + if ((efx->state == STATE_READY) || \ (efx->state == STATE_DISABLED)) \ ASSERT_RTNL(); \ } while (0) +static int efx_check_disabled(struct efx_nic *efx) +{ + if (efx->state == STATE_DISABLED) { + netif_err(efx, drv, efx->net_dev, + "device is disabled due to earlier errors\n"); + return -EIO; + } + return 0; +} + /************************************************************************** * * Event queue processing @@ -630,6 +640,16 @@ static void efx_start_datapath(struct efx_nic *efx) efx->rx_buffer_order = get_order(efx->rx_buffer_len + sizeof(struct efx_rx_page_state)); + /* We must keep at least one descriptor in a TX ring empty. + * We could avoid this when the queue size does not exactly + * match the hardware ring size, but it's not that important. + * Therefore we stop the queue when one more skb might fill + * the ring completely. We wake it when half way back to + * empty. + */ + efx->txq_stop_thresh = efx->txq_entries - efx_tx_max_skb_descs(efx); + efx->txq_wake_thresh = efx->txq_stop_thresh / 2; + /* Initialise the channels */ efx_for_each_channel(channel, efx) { efx_for_each_channel_tx_queue(tx_queue, channel) @@ -714,6 +734,7 @@ static void efx_remove_channel(struct efx_channel *channel) efx_for_each_possible_channel_tx_queue(tx_queue, channel) efx_remove_tx_queue(tx_queue); efx_remove_eventq(channel); + channel->type->post_remove(channel); } static void efx_remove_channels(struct efx_nic *efx) @@ -730,7 +751,11 @@ efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel; u32 old_rxq_entries, old_txq_entries; unsigned i, next_buffer_table = 0; - int rc = 0; + int rc; + + rc = efx_check_disabled(efx); + if (rc) + return rc; /* Not all channels should be reallocated. We must avoid * reallocating their buffer table entries. @@ -828,6 +853,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue) static const struct efx_channel_type efx_default_channel_type = { .pre_probe = efx_channel_dummy_op_int, + .post_remove = efx_channel_dummy_op_void, .get_name = efx_get_channel_name, .copy = efx_copy_channel, .keep_eventq = false, @@ -838,6 +864,10 @@ int efx_channel_dummy_op_int(struct efx_channel *channel) return 0; } +void efx_channel_dummy_op_void(struct efx_channel *channel) +{ +} + /************************************************************************** * * Port handling @@ -1365,6 +1395,8 @@ static void efx_start_interrupts(struct efx_nic *efx, bool may_keep_eventq) { struct efx_channel *channel; + BUG_ON(efx->state == STATE_DISABLED); + if (efx->legacy_irq) efx->legacy_irq_enabled = true; efx_nic_enable_interrupts(efx); @@ -1382,6 +1414,9 @@ static void efx_stop_interrupts(struct efx_nic *efx, bool may_keep_eventq) { struct efx_channel *channel; + if (efx->state == STATE_DISABLED) + return; + efx_mcdi_mode_poll(efx); efx_nic_disable_interrupts(efx); @@ -1422,10 +1457,16 @@ static void efx_set_channels(struct efx_nic *efx) efx->tx_channel_offset = separate_tx_channels ? efx->n_channels - efx->n_tx_channels : 0; - /* We need to adjust the TX queue numbers if we have separate + /* We need to mark which channels really have RX and TX + * queues, and adjust the TX queue numbers if we have separate * RX-only and TX-only channels. */ efx_for_each_channel(channel, efx) { + if (channel->channel < efx->n_rx_channels) + channel->rx_queue.core_index = channel->channel; + else + channel->rx_queue.core_index = -1; + efx_for_each_channel_tx_queue(tx_queue, channel) tx_queue->queue -= (efx->tx_channel_offset * EFX_TXQ_TYPES); @@ -1533,22 +1574,21 @@ static int efx_probe_all(struct efx_nic *efx) return rc; } -/* Called after previous invocation(s) of efx_stop_all, restarts the port, - * kernel transmit queues and NAPI processing, and ensures that the port is - * scheduled to be reconfigured. This function is safe to call multiple - * times when the NIC is in any state. +/* If the interface is supposed to be running but is not, start + * the hardware and software data path, regular activity for the port + * (MAC statistics, link polling, etc.) and schedule the port to be + * reconfigured. Interrupts must already be enabled. This function + * is safe to call multiple times, so long as the NIC is not disabled. + * Requires the RTNL lock. */ static void efx_start_all(struct efx_nic *efx) { EFX_ASSERT_RESET_SERIALISED(efx); + BUG_ON(efx->state == STATE_DISABLED); /* Check that it is appropriate to restart the interface. All * of these flags are safe to read under just the rtnl lock */ - if (efx->port_enabled) - return; - if ((efx->state != STATE_RUNNING) && (efx->state != STATE_INIT)) - return; - if (!netif_running(efx->net_dev)) + if (efx->port_enabled || !netif_running(efx->net_dev)) return; efx_start_port(efx); @@ -1582,11 +1622,11 @@ static void efx_flush_all(struct efx_nic *efx) cancel_work_sync(&efx->mac_work); } -/* Quiesce hardware and software without bringing the link down. - * Safe to call multiple times, when the nic and interface is in any - * state. The caller is guaranteed to subsequently be in a position - * to modify any hardware and software state they see fit without - * taking locks. */ +/* Quiesce the hardware and software data path, and regular activity + * for the port without bringing the link down. Safe to call multiple + * times with the NIC in almost any state, but interrupts should be + * enabled. Requires the RTNL lock. + */ static void efx_stop_all(struct efx_nic *efx) { EFX_ASSERT_RESET_SERIALISED(efx); @@ -1739,7 +1779,8 @@ static int efx_ioctl(struct net_device *net_dev, struct ifreq *ifr, int cmd) struct efx_nic *efx = netdev_priv(net_dev); struct mii_ioctl_data *data = if_mii(ifr); - EFX_ASSERT_RESET_SERIALISED(efx); + if (cmd == SIOCSHWTSTAMP) + return efx_ptp_ioctl(efx, ifr, cmd); /* Convert phy_id from older PRTAD/DEVAD format */ if ((cmd == SIOCGMIIREG || cmd == SIOCSMIIREG) && @@ -1820,13 +1861,14 @@ static void efx_netpoll(struct net_device *net_dev) static int efx_net_open(struct net_device *net_dev) { struct efx_nic *efx = netdev_priv(net_dev); - EFX_ASSERT_RESET_SERIALISED(efx); + int rc; netif_dbg(efx, ifup, efx->net_dev, "opening device on CPU %d\n", raw_smp_processor_id()); - if (efx->state == STATE_DISABLED) - return -EIO; + rc = efx_check_disabled(efx); + if (rc) + return rc; if (efx->phy_mode & PHY_MODE_SPECIAL) return -EBUSY; if (efx_mcdi_poll_reboot(efx) && efx_reset(efx, RESET_TYPE_ALL)) @@ -1852,10 +1894,8 @@ static int efx_net_stop(struct net_device *net_dev) netif_dbg(efx, ifdown, efx->net_dev, "closing on CPU %d\n", raw_smp_processor_id()); - if (efx->state != STATE_DISABLED) { - /* Stop the device and flush all the channels */ - efx_stop_all(efx); - } + /* Stop the device and flush all the channels */ + efx_stop_all(efx); return 0; } @@ -1915,9 +1955,11 @@ static void efx_watchdog(struct net_device *net_dev) static int efx_change_mtu(struct net_device *net_dev, int new_mtu) { struct efx_nic *efx = netdev_priv(net_dev); + int rc; - EFX_ASSERT_RESET_SERIALISED(efx); - + rc = efx_check_disabled(efx); + if (rc) + return rc; if (new_mtu > EFX_MAX_MTU) return -EINVAL; @@ -1926,8 +1968,6 @@ static int efx_change_mtu(struct net_device *net_dev, int new_mtu) netif_dbg(efx, drv, efx->net_dev, "changing MTU to %d\n", new_mtu); mutex_lock(&efx->mac_lock); - /* Reconfigure the MAC before enabling the dma queues so that - * the RX buffers don't overflow */ net_dev->mtu = new_mtu; efx->type->reconfigure_mac(efx); mutex_unlock(&efx->mac_lock); @@ -1942,8 +1982,6 @@ static int efx_set_mac_address(struct net_device *net_dev, void *data) struct sockaddr *addr = data; char *new_addr = addr->sa_data; - EFX_ASSERT_RESET_SERIALISED(efx); - if (!is_valid_ether_addr(new_addr)) { netif_err(efx, drv, efx->net_dev, "invalid ethernet MAC address requested: %pM\n", @@ -2079,11 +2117,27 @@ static int efx_register_netdev(struct efx_nic *efx) rtnl_lock(); + /* Enable resets to be scheduled and check whether any were + * already requested. If so, the NIC is probably hosed so we + * abort. + */ + efx->state = STATE_READY; + smp_mb(); /* ensure we change state before checking reset_pending */ + if (efx->reset_pending) { + netif_err(efx, probe, efx->net_dev, + "aborting probe due to scheduled reset\n"); + rc = -EIO; + goto fail_locked; + } + rc = dev_alloc_name(net_dev, net_dev->name); if (rc < 0) goto fail_locked; efx_update_name(efx); + /* Always start with carrier off; PHY events will detect the link */ + netif_carrier_off(net_dev); + rc = register_netdevice(net_dev); if (rc) goto fail_locked; @@ -2094,9 +2148,6 @@ static int efx_register_netdev(struct efx_nic *efx) efx_init_tx_queue_core_txq(tx_queue); } - /* Always start with carrier off; PHY events will detect the link */ - netif_carrier_off(net_dev); - rtnl_unlock(); rc = device_create_file(&efx->pci_dev->dev, &dev_attr_phy_type); @@ -2108,14 +2159,14 @@ static int efx_register_netdev(struct efx_nic *efx) return 0; +fail_registered: + rtnl_lock(); + unregister_netdevice(net_dev); fail_locked: + efx->state = STATE_UNINIT; rtnl_unlock(); netif_err(efx, drv, efx->net_dev, "could not register net dev\n"); return rc; - -fail_registered: - unregister_netdev(net_dev); - return rc; } static void efx_unregister_netdev(struct efx_nic *efx) @@ -2138,7 +2189,11 @@ static void efx_unregister_netdev(struct efx_nic *efx) strlcpy(efx->name, pci_name(efx->pci_dev), sizeof(efx->name)); device_remove_file(&efx->pci_dev->dev, &dev_attr_phy_type); - unregister_netdev(efx->net_dev); + + rtnl_lock(); + unregister_netdevice(efx->net_dev); + efx->state = STATE_UNINIT; + rtnl_unlock(); } /************************************************************************** @@ -2154,9 +2209,9 @@ void efx_reset_down(struct efx_nic *efx, enum reset_type method) EFX_ASSERT_RESET_SERIALISED(efx); efx_stop_all(efx); - mutex_lock(&efx->mac_lock); - efx_stop_interrupts(efx, false); + + mutex_lock(&efx->mac_lock); if (efx->port_initialized && method != RESET_TYPE_INVISIBLE) efx->phy_op->fini(efx); efx->type->fini(efx); @@ -2276,16 +2331,15 @@ static void efx_reset_work(struct work_struct *data) if (!pending) return; - /* If we're not RUNNING then don't reset. Leave the reset_pending - * flags set so that efx_pci_probe_main will be retried */ - if (efx->state != STATE_RUNNING) { - netif_info(efx, drv, efx->net_dev, - "scheduled reset quenched. NIC not RUNNING\n"); - return; - } - rtnl_lock(); - (void)efx_reset(efx, fls(pending) - 1); + + /* We checked the state in efx_schedule_reset() but it may + * have changed by now. Now that we have the RTNL lock, + * it cannot change again. + */ + if (efx->state == STATE_READY) + (void)efx_reset(efx, fls(pending) - 1); + rtnl_unlock(); } @@ -2311,6 +2365,13 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type) } set_bit(method, &efx->reset_pending); + smp_mb(); /* ensure we change reset_pending before checking state */ + + /* If we're not READY then just leave the flags set as the cue + * to abort probing or reschedule the reset later. + */ + if (ACCESS_ONCE(efx->state) != STATE_READY) + return; /* efx_process_channel() will no longer read events once a * reset is scheduled. So switch back to poll'd MCDI completions. */ @@ -2376,13 +2437,12 @@ static const struct efx_phy_operations efx_dummy_phy_operations = { /* This zeroes out and then fills in the invariants in a struct * efx_nic (including all sub-structures). */ -static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type, +static int efx_init_struct(struct efx_nic *efx, struct pci_dev *pci_dev, struct net_device *net_dev) { int i; /* Initialise common structures */ - memset(efx, 0, sizeof(*efx)); spin_lock_init(&efx->biu_lock); #ifdef CONFIG_SFC_MTD INIT_LIST_HEAD(&efx->mtd_list); @@ -2392,7 +2452,7 @@ static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type, INIT_DELAYED_WORK(&efx->selftest_work, efx_selftest_async_work); efx->pci_dev = pci_dev; efx->msg_enable = debug; - efx->state = STATE_INIT; + efx->state = STATE_UNINIT; strlcpy(efx->name, pci_name(pci_dev), sizeof(efx->name)); efx->net_dev = net_dev; @@ -2409,8 +2469,6 @@ static int efx_init_struct(struct efx_nic *efx, const struct efx_nic_type *type, goto fail; } - efx->type = type; - EFX_BUG_ON_PARANOID(efx->type->phys_addr_channels > EFX_MAX_CHANNELS); /* Higher numbered interrupt modes are less capable! */ @@ -2455,6 +2513,12 @@ static void efx_fini_struct(struct efx_nic *efx) */ static void efx_pci_remove_main(struct efx_nic *efx) { + /* Flush reset_work. It can no longer be scheduled since we + * are not READY. + */ + BUG_ON(efx->state == STATE_READY); + cancel_work_sync(&efx->reset_work); + #ifdef CONFIG_RFS_ACCEL free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap); efx->net_dev->rx_cpu_rmap = NULL; @@ -2480,24 +2544,15 @@ static void efx_pci_remove(struct pci_dev *pci_dev) /* Mark the NIC as fini, then stop the interface */ rtnl_lock(); - efx->state = STATE_FINI; dev_close(efx->net_dev); - - /* Allow any queued efx_resets() to complete */ + efx_stop_interrupts(efx, false); rtnl_unlock(); - efx_stop_interrupts(efx, false); efx_sriov_fini(efx); efx_unregister_netdev(efx); efx_mtd_remove(efx); - /* Wait for any scheduled resets to complete. No more will be - * scheduled from this point because efx_stop_all() has been - * called, we are no longer registered with driverlink, and - * the net_device's have been removed. */ - cancel_work_sync(&efx->reset_work); - efx_pci_remove_main(efx); efx_fini_io(efx); @@ -2617,7 +2672,6 @@ static int efx_pci_probe_main(struct efx_nic *efx) static int __devinit efx_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *entry) { - const struct efx_nic_type *type = (const struct efx_nic_type *) entry->driver_data; struct net_device *net_dev; struct efx_nic *efx; int rc; @@ -2627,10 +2681,12 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev, EFX_MAX_RX_QUEUES); if (!net_dev) return -ENOMEM; - net_dev->features |= (type->offload_features | NETIF_F_SG | + efx = netdev_priv(net_dev); + efx->type = (const struct efx_nic_type *) entry->driver_data; + net_dev->features |= (efx->type->offload_features | NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_RXCSUM); - if (type->offload_features & NETIF_F_V6_CSUM) + if (efx->type->offload_features & NETIF_F_V6_CSUM) net_dev->features |= NETIF_F_TSO6; /* Mask for features that also apply to VLAN devices */ net_dev->vlan_features |= (NETIF_F_ALL_CSUM | NETIF_F_SG | @@ -2638,10 +2694,9 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev, NETIF_F_RXCSUM); /* All offloads can be toggled */ net_dev->hw_features = net_dev->features & ~NETIF_F_HIGHDMA; - efx = netdev_priv(net_dev); pci_set_drvdata(pci_dev, efx); SET_NETDEV_DEV(net_dev, &pci_dev->dev); - rc = efx_init_struct(efx, type, pci_dev, net_dev); + rc = efx_init_struct(efx, pci_dev, net_dev); if (rc) goto fail1; @@ -2656,28 +2711,9 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev, goto fail2; rc = efx_pci_probe_main(efx); - - /* Serialise against efx_reset(). No more resets will be - * scheduled since efx_stop_all() has been called, and we have - * not and never have been registered. - */ - cancel_work_sync(&efx->reset_work); - if (rc) goto fail3; - /* If there was a scheduled reset during probe, the NIC is - * probably hosed anyway. - */ - if (efx->reset_pending) { - rc = -EIO; - goto fail4; - } - - /* Switch to the running state before we expose the device to the OS, - * so that dev_open()|efx_start_all() will actually start the device */ - efx->state = STATE_RUNNING; - rc = efx_register_netdev(efx); if (rc) goto fail4; @@ -2717,12 +2753,18 @@ static int efx_pm_freeze(struct device *dev) { struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev)); - efx->state = STATE_FINI; + rtnl_lock(); - netif_device_detach(efx->net_dev); + if (efx->state != STATE_DISABLED) { + efx->state = STATE_UNINIT; - efx_stop_all(efx); - efx_stop_interrupts(efx, false); + netif_device_detach(efx->net_dev); + + efx_stop_all(efx); + efx_stop_interrupts(efx, false); + } + + rtnl_unlock(); return 0; } @@ -2731,21 +2773,25 @@ static int efx_pm_thaw(struct device *dev) { struct efx_nic *efx = pci_get_drvdata(to_pci_dev(dev)); - efx->state = STATE_INIT; + rtnl_lock(); - efx_start_interrupts(efx, false); + if (efx->state != STATE_DISABLED) { + efx_start_interrupts(efx, false); - mutex_lock(&efx->mac_lock); - efx->phy_op->reconfigure(efx); - mutex_unlock(&efx->mac_lock); + mutex_lock(&efx->mac_lock); + efx->phy_op->reconfigure(efx); + mutex_unlock(&efx->mac_lock); - efx_start_all(efx); + efx_start_all(efx); - netif_device_attach(efx->net_dev); + netif_device_attach(efx->net_dev); - efx->state = STATE_RUNNING; + efx->state = STATE_READY; - efx->type->resume_wol(efx); + efx->type->resume_wol(efx); + } + + rtnl_unlock(); /* Reschedule any quenched resets scheduled during efx_pm_freeze() */ queue_work(reset_workqueue, &efx->reset_work); diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 70755c97251a..f11170bc48bf 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -102,6 +102,7 @@ static inline void efx_filter_rfs_expire(struct efx_channel *channel) {} /* Channels */ extern int efx_channel_dummy_op_int(struct efx_channel *channel); +extern void efx_channel_dummy_op_void(struct efx_channel *channel); extern void efx_process_channel_now(struct efx_channel *channel); extern int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries); diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 5faedd855b77..90f078eff8e6 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -337,7 +337,8 @@ static int efx_fill_loopback_test(struct efx_nic *efx, unsigned int test_index, struct ethtool_string *strings, u64 *data) { - struct efx_channel *channel = efx_get_channel(efx, 0); + struct efx_channel *channel = + efx_get_channel(efx, efx->tx_channel_offset); struct efx_tx_queue *tx_queue; efx_for_each_channel_tx_queue(tx_queue, channel) { @@ -529,9 +530,7 @@ static void efx_ethtool_self_test(struct net_device *net_dev, if (!efx_tests) goto fail; - - ASSERT_RTNL(); - if (efx->state != STATE_RUNNING) { + if (efx->state != STATE_READY) { rc = -EIO; goto fail1; } @@ -962,9 +961,7 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx, int rc; /* Check that user wants us to choose the location */ - if (rule->location != RX_CLS_LOC_ANY && - rule->location != RX_CLS_LOC_FIRST && - rule->location != RX_CLS_LOC_LAST) + if (rule->location != RX_CLS_LOC_ANY) return -EINVAL; /* Range-check ring_cookie */ @@ -978,9 +975,7 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx, rule->m_ext.data[1])) return -EINVAL; - efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, - (rule->location == RX_CLS_LOC_FIRST) ? - EFX_FILTER_FLAG_RX_OVERRIDE_IP : 0, + efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, 0, (rule->ring_cookie == RX_CLS_FLOW_DISC) ? 0xfff : rule->ring_cookie); @@ -1176,6 +1171,7 @@ const struct ethtool_ops efx_ethtool_ops = { .get_rxfh_indir_size = efx_ethtool_get_rxfh_indir_size, .get_rxfh_indir = efx_ethtool_get_rxfh_indir, .set_rxfh_indir = efx_ethtool_set_rxfh_indir, + .get_ts_info = efx_ptp_get_ts_info, .get_module_info = efx_ethtool_get_module_info, .get_module_eeprom = efx_ethtool_get_module_eeprom, }; diff --git a/drivers/net/ethernet/sfc/falcon_boards.c b/drivers/net/ethernet/sfc/falcon_boards.c index 8687a6c3db0d..ec1e99d0dcad 100644 --- a/drivers/net/ethernet/sfc/falcon_boards.c +++ b/drivers/net/ethernet/sfc/falcon_boards.c @@ -380,7 +380,7 @@ static ssize_t set_phy_flash_cfg(struct device *dev, new_mode = PHY_MODE_SPECIAL; if (!((old_mode ^ new_mode) & PHY_MODE_SPECIAL)) { err = 0; - } else if (efx->state != STATE_RUNNING || netif_running(efx->net_dev)) { + } else if (efx->state != STATE_READY || netif_running(efx->net_dev)) { err = -EBUSY; } else { /* Reset the PHY, reconfigure the MAC and enable/disable diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c index c3fd61f0a95c..8af42cd1feda 100644 --- a/drivers/net/ethernet/sfc/filter.c +++ b/drivers/net/ethernet/sfc/filter.c @@ -162,20 +162,12 @@ static void efx_filter_push_rx_config(struct efx_nic *efx) !!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags & EFX_FILTER_FLAG_RX_RSS)); EFX_SET_OWORD_FIELD( - filter_ctl, FRF_CZ_UNICAST_NOMATCH_IP_OVERRIDE, - !!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags & - EFX_FILTER_FLAG_RX_OVERRIDE_IP)); - EFX_SET_OWORD_FIELD( filter_ctl, FRF_CZ_MULTICAST_NOMATCH_Q_ID, table->spec[EFX_FILTER_INDEX_MC_DEF].dmaq_id); EFX_SET_OWORD_FIELD( filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED, !!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags & EFX_FILTER_FLAG_RX_RSS)); - EFX_SET_OWORD_FIELD( - filter_ctl, FRF_CZ_MULTICAST_NOMATCH_IP_OVERRIDE, - !!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags & - EFX_FILTER_FLAG_RX_OVERRIDE_IP)); } efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL); @@ -480,14 +472,12 @@ static u32 efx_filter_build(efx_oword_t *filter, struct efx_filter_spec *spec) case EFX_FILTER_TABLE_RX_MAC: { bool is_wild = spec->type == EFX_FILTER_MAC_WILD; - EFX_POPULATE_OWORD_8( + EFX_POPULATE_OWORD_7( *filter, FRF_CZ_RMFT_RSS_EN, !!(spec->flags & EFX_FILTER_FLAG_RX_RSS), FRF_CZ_RMFT_SCATTER_EN, !!(spec->flags & EFX_FILTER_FLAG_RX_SCATTER), - FRF_CZ_RMFT_IP_OVERRIDE, - !!(spec->flags & EFX_FILTER_FLAG_RX_OVERRIDE_IP), FRF_CZ_RMFT_RXQ_ID, spec->dmaq_id, FRF_CZ_RMFT_WILDCARD_MATCH, is_wild, FRF_CZ_RMFT_DEST_MAC_HI, spec->data[2], @@ -567,49 +557,62 @@ static int efx_filter_search(struct efx_filter_table *table, } /* - * Construct/deconstruct external filter IDs. These must be ordered - * by matching priority, for RX NFC semantics. + * Construct/deconstruct external filter IDs. At least the RX filter + * IDs must be ordered by matching priority, for RX NFC semantics. * - * Each RX MAC filter entry has a flag for whether it can override an - * RX IP filter that also matches. So we assign locations for MAC - * filters with overriding behaviour, then for IP filters, then for - * MAC filters without overriding behaviour. + * Deconstruction needs to be robust against invalid IDs so that + * efx_filter_remove_id_safe() and efx_filter_get_filter_safe() can + * accept user-provided IDs. */ -#define EFX_FILTER_MATCH_PRI_RX_MAC_OVERRIDE_IP 0 -#define EFX_FILTER_MATCH_PRI_RX_DEF_OVERRIDE_IP 1 -#define EFX_FILTER_MATCH_PRI_NORMAL_BASE 2 +#define EFX_FILTER_MATCH_PRI_COUNT 5 + +static const u8 efx_filter_type_match_pri[EFX_FILTER_TYPE_COUNT] = { + [EFX_FILTER_TCP_FULL] = 0, + [EFX_FILTER_UDP_FULL] = 0, + [EFX_FILTER_TCP_WILD] = 1, + [EFX_FILTER_UDP_WILD] = 1, + [EFX_FILTER_MAC_FULL] = 2, + [EFX_FILTER_MAC_WILD] = 3, + [EFX_FILTER_UC_DEF] = 4, + [EFX_FILTER_MC_DEF] = 4, +}; + +static const enum efx_filter_table_id efx_filter_range_table[] = { + EFX_FILTER_TABLE_RX_IP, /* RX match pri 0 */ + EFX_FILTER_TABLE_RX_IP, + EFX_FILTER_TABLE_RX_MAC, + EFX_FILTER_TABLE_RX_MAC, + EFX_FILTER_TABLE_RX_DEF, /* RX match pri 4 */ + EFX_FILTER_TABLE_COUNT, /* TX match pri 0; invalid */ + EFX_FILTER_TABLE_COUNT, /* invalid */ + EFX_FILTER_TABLE_TX_MAC, + EFX_FILTER_TABLE_TX_MAC, /* TX match pri 3 */ +}; #define EFX_FILTER_INDEX_WIDTH 13 #define EFX_FILTER_INDEX_MASK ((1 << EFX_FILTER_INDEX_WIDTH) - 1) -static inline u32 efx_filter_make_id(enum efx_filter_table_id table_id, - unsigned int index, u8 flags) +static inline u32 +efx_filter_make_id(const struct efx_filter_spec *spec, unsigned int index) { - unsigned int match_pri = EFX_FILTER_MATCH_PRI_NORMAL_BASE + table_id; + unsigned int range; - if (flags & EFX_FILTER_FLAG_RX_OVERRIDE_IP) { - if (table_id == EFX_FILTER_TABLE_RX_MAC) - match_pri = EFX_FILTER_MATCH_PRI_RX_MAC_OVERRIDE_IP; - else if (table_id == EFX_FILTER_TABLE_RX_DEF) - match_pri = EFX_FILTER_MATCH_PRI_RX_DEF_OVERRIDE_IP; - } + range = efx_filter_type_match_pri[spec->type]; + if (!(spec->flags & EFX_FILTER_FLAG_RX)) + range += EFX_FILTER_MATCH_PRI_COUNT; - return match_pri << EFX_FILTER_INDEX_WIDTH | index; + return range << EFX_FILTER_INDEX_WIDTH | index; } static inline enum efx_filter_table_id efx_filter_id_table_id(u32 id) { - unsigned int match_pri = id >> EFX_FILTER_INDEX_WIDTH; + unsigned int range = id >> EFX_FILTER_INDEX_WIDTH; - switch (match_pri) { - case EFX_FILTER_MATCH_PRI_RX_MAC_OVERRIDE_IP: - return EFX_FILTER_TABLE_RX_MAC; - case EFX_FILTER_MATCH_PRI_RX_DEF_OVERRIDE_IP: - return EFX_FILTER_TABLE_RX_DEF; - default: - return match_pri - EFX_FILTER_MATCH_PRI_NORMAL_BASE; - } + if (range < ARRAY_SIZE(efx_filter_range_table)) + return efx_filter_range_table[range]; + else + return EFX_FILTER_TABLE_COUNT; /* invalid */ } static inline unsigned int efx_filter_id_index(u32 id) @@ -619,12 +622,9 @@ static inline unsigned int efx_filter_id_index(u32 id) static inline u8 efx_filter_id_flags(u32 id) { - unsigned int match_pri = id >> EFX_FILTER_INDEX_WIDTH; + unsigned int range = id >> EFX_FILTER_INDEX_WIDTH; - if (match_pri < EFX_FILTER_MATCH_PRI_NORMAL_BASE) - return EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_RX_OVERRIDE_IP; - else if (match_pri <= - EFX_FILTER_MATCH_PRI_NORMAL_BASE + EFX_FILTER_TABLE_RX_DEF) + if (range < EFX_FILTER_MATCH_PRI_COUNT) return EFX_FILTER_FLAG_RX; else return EFX_FILTER_FLAG_TX; @@ -633,14 +633,15 @@ static inline u8 efx_filter_id_flags(u32 id) u32 efx_filter_get_rx_id_limit(struct efx_nic *efx) { struct efx_filter_state *state = efx->filter_state; - unsigned int table_id = EFX_FILTER_TABLE_RX_DEF; + unsigned int range = EFX_FILTER_MATCH_PRI_COUNT - 1; + enum efx_filter_table_id table_id; do { + table_id = efx_filter_range_table[range]; if (state->table[table_id].size != 0) - return ((EFX_FILTER_MATCH_PRI_NORMAL_BASE + table_id) - << EFX_FILTER_INDEX_WIDTH) + + return range << EFX_FILTER_INDEX_WIDTH | state->table[table_id].size; - } while (table_id--); + } while (range--); return 0; } @@ -718,7 +719,7 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec, netif_vdbg(efx, hw, efx->net_dev, "%s: filter type %d index %d rxq %u set", __func__, spec->type, filter_idx, spec->dmaq_id); - rc = efx_filter_make_id(table->id, filter_idx, spec->flags); + rc = efx_filter_make_id(spec, filter_idx); out: spin_unlock_bh(&state->lock); @@ -781,8 +782,7 @@ int efx_filter_remove_id_safe(struct efx_nic *efx, spin_lock_bh(&state->lock); if (test_bit(filter_idx, table->used_bitmap) && - spec->priority == priority && - !((spec->flags ^ filter_flags) & EFX_FILTER_FLAG_RX_OVERRIDE_IP)) { + spec->priority == priority) { efx_filter_table_clear_entry(efx, table, filter_idx); if (table->used == 0) efx_filter_table_reset_search_depth(table); @@ -833,8 +833,7 @@ int efx_filter_get_filter_safe(struct efx_nic *efx, spin_lock_bh(&state->lock); if (test_bit(filter_idx, table->used_bitmap) && - spec->priority == priority && - !((spec->flags ^ filter_flags) & EFX_FILTER_FLAG_RX_OVERRIDE_IP)) { + spec->priority == priority) { *spec_buf = *spec; rc = 0; } else { @@ -927,8 +926,7 @@ s32 efx_filter_get_rx_ids(struct efx_nic *efx, goto out; } buf[count++] = efx_filter_make_id( - table_id, filter_idx, - table->spec[filter_idx].flags); + &table->spec[filter_idx], filter_idx); } } } diff --git a/drivers/net/ethernet/sfc/filter.h b/drivers/net/ethernet/sfc/filter.h index 3c77802aed6c..5cb54723b824 100644 --- a/drivers/net/ethernet/sfc/filter.h +++ b/drivers/net/ethernet/sfc/filter.h @@ -61,16 +61,12 @@ enum efx_filter_priority { * according to the indirection table. * @EFX_FILTER_FLAG_RX_SCATTER: Enable DMA scatter on the receiving * queue. - * @EFX_FILTER_FLAG_RX_OVERRIDE_IP: Enables a MAC filter to override - * any IP filter that matches the same packet. By default, IP - * filters take precedence. * @EFX_FILTER_FLAG_RX: Filter is for RX * @EFX_FILTER_FLAG_TX: Filter is for TX */ enum efx_filter_flags { EFX_FILTER_FLAG_RX_RSS = 0x01, EFX_FILTER_FLAG_RX_SCATTER = 0x02, - EFX_FILTER_FLAG_RX_OVERRIDE_IP = 0x04, EFX_FILTER_FLAG_RX = 0x08, EFX_FILTER_FLAG_TX = 0x10, }; @@ -88,8 +84,7 @@ enum efx_filter_flags { * * The @priority field is used by software to determine whether a new * filter may replace an old one. The hardware priority of a filter - * depends on the filter type and %EFX_FILTER_FLAG_RX_OVERRIDE_IP - * flag. + * depends on the filter type. */ struct efx_filter_spec { u8 type:4; diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c index fc5e7bbcbc9e..aea43cbd0520 100644 --- a/drivers/net/ethernet/sfc/mcdi.c +++ b/drivers/net/ethernet/sfc/mcdi.c @@ -320,14 +320,20 @@ static void efx_mcdi_ev_cpl(struct efx_nic *efx, unsigned int seqno, efx_mcdi_complete(mcdi); } -/* Issue the given command by writing the data into the shared memory PDU, - * ring the doorbell and wait for completion. Copyout the result. */ int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, const u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen, size_t *outlen_actual) { + efx_mcdi_rpc_start(efx, cmd, inbuf, inlen); + return efx_mcdi_rpc_finish(efx, cmd, inlen, + outbuf, outlen, outlen_actual); +} + +void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd, const u8 *inbuf, + size_t inlen) +{ struct efx_mcdi_iface *mcdi = efx_mcdi(efx); - int rc; + BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0); efx_mcdi_acquire(mcdi); @@ -338,6 +344,15 @@ int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, spin_unlock_bh(&mcdi->iface_lock); efx_mcdi_copyin(efx, cmd, inbuf, inlen); +} + +int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen, + u8 *outbuf, size_t outlen, size_t *outlen_actual) +{ + struct efx_mcdi_iface *mcdi = efx_mcdi(efx); + int rc; + + BUG_ON(efx_nic_rev(efx) < EFX_REV_SIENA_A0); if (mcdi->mode == MCDI_MODE_POLL) rc = efx_mcdi_poll(efx); @@ -563,6 +578,11 @@ void efx_mcdi_process_event(struct efx_channel *channel, case MCDI_EVENT_CODE_FLR: efx_sriov_flr(efx, MCDI_EVENT_FIELD(*event, FLR_VF)); break; + case MCDI_EVENT_CODE_PTP_RX: + case MCDI_EVENT_CODE_PTP_FAULT: + case MCDI_EVENT_CODE_PTP_PPS: + efx_ptp_event(efx, event); + break; default: netif_err(efx, hw, efx->net_dev, "Unknown MCDI event 0x%x\n", @@ -641,9 +661,8 @@ int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address, u16 *fw_subtype_list, u32 *capabilities) { uint8_t outbuf[MC_CMD_GET_BOARD_CFG_OUT_LENMIN]; - size_t outlen; + size_t outlen, offset, i; int port_num = efx_port_num(efx); - int offset; int rc; BUILD_BUG_ON(MC_CMD_GET_BOARD_CFG_IN_LEN != 0); @@ -663,11 +682,18 @@ int efx_mcdi_get_board_cfg(struct efx_nic *efx, u8 *mac_address, : MC_CMD_GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0_OFST; if (mac_address) memcpy(mac_address, outbuf + offset, ETH_ALEN); - if (fw_subtype_list) - memcpy(fw_subtype_list, - outbuf + MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST, - MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM * - sizeof(fw_subtype_list[0])); + if (fw_subtype_list) { + /* Byte-swap and truncate or zero-pad as necessary */ + offset = MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_OFST; + for (i = 0; + i < MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM; + i++) { + fw_subtype_list[i] = + (offset + 2 <= outlen) ? + le16_to_cpup((__le16 *)(outbuf + offset)) : 0; + offset += 2; + } + } if (capabilities) { if (port_num) *capabilities = MCDI_DWORD(outbuf, @@ -1169,6 +1195,9 @@ int efx_mcdi_flush_rxqs(struct efx_nic *efx) __le32 *qid; int rc, count; + BUILD_BUG_ON(EFX_MAX_CHANNELS > + MC_CMD_FLUSH_RX_QUEUES_IN_QID_OFST_MAXNUM); + qid = kmalloc(EFX_MAX_CHANNELS * sizeof(*qid), GFP_KERNEL); if (qid == NULL) return -ENOMEM; diff --git a/drivers/net/ethernet/sfc/mcdi.h b/drivers/net/ethernet/sfc/mcdi.h index 0bdf3e331832..3ba2e5b5a9cc 100644 --- a/drivers/net/ethernet/sfc/mcdi.h +++ b/drivers/net/ethernet/sfc/mcdi.h @@ -71,6 +71,12 @@ extern int efx_mcdi_rpc(struct efx_nic *efx, unsigned cmd, const u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen, size_t *outlen_actual); +extern void efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd, + const u8 *inbuf, size_t inlen); +extern int efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen, + u8 *outbuf, size_t outlen, + size_t *outlen_actual); + extern int efx_mcdi_poll_reboot(struct efx_nic *efx); extern void efx_mcdi_mode_poll(struct efx_nic *efx); extern void efx_mcdi_mode_event(struct efx_nic *efx); @@ -107,11 +113,13 @@ extern void efx_mcdi_sensor_event(struct efx_nic *efx, efx_qword_t *ev); #define MCDI_EVENT_FIELD(_ev, _field) \ EFX_QWORD_FIELD(_ev, MCDI_EVENT_ ## _field) #define MCDI_ARRAY_FIELD(_buf, _field1, _type, _index, _field2) \ - EFX_DWORD_FIELD( \ + EFX_EXTRACT_DWORD( \ *((efx_dword_t *) \ (MCDI_ARRAY_PTR(_buf, _field1, _type, _index) + \ (MC_CMD_ ## _type ## _TYPEDEF_ ## _field2 ## _OFST & ~3))), \ - MC_CMD_ ## _type ## _TYPEDEF_ ## _field2) + MC_CMD_ ## _type ## _TYPEDEF_ ## _field2 ## _LBN & 0x1f, \ + (MC_CMD_ ## _type ## _TYPEDEF_ ## _field2 ## _LBN & 0x1f) + \ + MC_CMD_ ## _type ## _TYPEDEF_ ## _field2 ## _WIDTH - 1) extern void efx_mcdi_print_fwver(struct efx_nic *efx, char *buf, size_t len); extern int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating, diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h index db4beed97669..9d426d0457bd 100644 --- a/drivers/net/ethernet/sfc/mcdi_pcol.h +++ b/drivers/net/ethernet/sfc/mcdi_pcol.h @@ -289,6 +289,7 @@ #define MCDI_EVENT_CODE_TX_FLUSH 0xc /* enum */ #define MCDI_EVENT_CODE_PTP_RX 0xd /* enum */ #define MCDI_EVENT_CODE_PTP_FAULT 0xe /* enum */ +#define MCDI_EVENT_CODE_PTP_PPS 0xf /* enum */ #define MCDI_EVENT_CMDDONE_DATA_OFST 0 #define MCDI_EVENT_CMDDONE_DATA_LBN 0 #define MCDI_EVENT_CMDDONE_DATA_WIDTH 32 @@ -491,12 +492,12 @@ /* MC_CMD_GET_FPGAREG_OUT msgresponse */ #define MC_CMD_GET_FPGAREG_OUT_LENMIN 1 -#define MC_CMD_GET_FPGAREG_OUT_LENMAX 255 +#define MC_CMD_GET_FPGAREG_OUT_LENMAX 252 #define MC_CMD_GET_FPGAREG_OUT_LEN(num) (0+1*(num)) #define MC_CMD_GET_FPGAREG_OUT_BUFFER_OFST 0 #define MC_CMD_GET_FPGAREG_OUT_BUFFER_LEN 1 #define MC_CMD_GET_FPGAREG_OUT_BUFFER_MINNUM 1 -#define MC_CMD_GET_FPGAREG_OUT_BUFFER_MAXNUM 255 +#define MC_CMD_GET_FPGAREG_OUT_BUFFER_MAXNUM 252 /***********************************/ @@ -507,13 +508,13 @@ /* MC_CMD_PUT_FPGAREG_IN msgrequest */ #define MC_CMD_PUT_FPGAREG_IN_LENMIN 5 -#define MC_CMD_PUT_FPGAREG_IN_LENMAX 255 +#define MC_CMD_PUT_FPGAREG_IN_LENMAX 252 #define MC_CMD_PUT_FPGAREG_IN_LEN(num) (4+1*(num)) #define MC_CMD_PUT_FPGAREG_IN_ADDR_OFST 0 #define MC_CMD_PUT_FPGAREG_IN_BUFFER_OFST 4 #define MC_CMD_PUT_FPGAREG_IN_BUFFER_LEN 1 #define MC_CMD_PUT_FPGAREG_IN_BUFFER_MINNUM 1 -#define MC_CMD_PUT_FPGAREG_IN_BUFFER_MAXNUM 251 +#define MC_CMD_PUT_FPGAREG_IN_BUFFER_MAXNUM 248 /* MC_CMD_PUT_FPGAREG_OUT msgresponse */ #define MC_CMD_PUT_FPGAREG_OUT_LEN 0 @@ -560,7 +561,7 @@ /* MC_CMD_PTP_IN_TRANSMIT msgrequest */ #define MC_CMD_PTP_IN_TRANSMIT_LENMIN 13 -#define MC_CMD_PTP_IN_TRANSMIT_LENMAX 255 +#define MC_CMD_PTP_IN_TRANSMIT_LENMAX 252 #define MC_CMD_PTP_IN_TRANSMIT_LEN(num) (12+1*(num)) /* MC_CMD_PTP_IN_CMD_OFST 0 */ /* MC_CMD_PTP_IN_PERIPH_ID_OFST 4 */ @@ -568,7 +569,7 @@ #define MC_CMD_PTP_IN_TRANSMIT_PACKET_OFST 12 #define MC_CMD_PTP_IN_TRANSMIT_PACKET_LEN 1 #define MC_CMD_PTP_IN_TRANSMIT_PACKET_MINNUM 1 -#define MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM 243 +#define MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM 240 /* MC_CMD_PTP_IN_READ_NIC_TIME msgrequest */ #define MC_CMD_PTP_IN_READ_NIC_TIME_LEN 8 @@ -1145,7 +1146,7 @@ /* MC_CMD_PUTS_IN msgrequest */ #define MC_CMD_PUTS_IN_LENMIN 13 -#define MC_CMD_PUTS_IN_LENMAX 255 +#define MC_CMD_PUTS_IN_LENMAX 252 #define MC_CMD_PUTS_IN_LEN(num) (12+1*(num)) #define MC_CMD_PUTS_IN_DEST_OFST 0 #define MC_CMD_PUTS_IN_UART_LBN 0 @@ -1157,7 +1158,7 @@ #define MC_CMD_PUTS_IN_STRING_OFST 12 #define MC_CMD_PUTS_IN_STRING_LEN 1 #define MC_CMD_PUTS_IN_STRING_MINNUM 1 -#define MC_CMD_PUTS_IN_STRING_MAXNUM 243 +#define MC_CMD_PUTS_IN_STRING_MAXNUM 240 /* MC_CMD_PUTS_OUT msgresponse */ #define MC_CMD_PUTS_OUT_LEN 0 @@ -1947,12 +1948,12 @@ /* MC_CMD_NVRAM_READ_OUT msgresponse */ #define MC_CMD_NVRAM_READ_OUT_LENMIN 1 -#define MC_CMD_NVRAM_READ_OUT_LENMAX 255 +#define MC_CMD_NVRAM_READ_OUT_LENMAX 252 #define MC_CMD_NVRAM_READ_OUT_LEN(num) (0+1*(num)) #define MC_CMD_NVRAM_READ_OUT_READ_BUFFER_OFST 0 #define MC_CMD_NVRAM_READ_OUT_READ_BUFFER_LEN 1 #define MC_CMD_NVRAM_READ_OUT_READ_BUFFER_MINNUM 1 -#define MC_CMD_NVRAM_READ_OUT_READ_BUFFER_MAXNUM 255 +#define MC_CMD_NVRAM_READ_OUT_READ_BUFFER_MAXNUM 252 /***********************************/ @@ -1963,7 +1964,7 @@ /* MC_CMD_NVRAM_WRITE_IN msgrequest */ #define MC_CMD_NVRAM_WRITE_IN_LENMIN 13 -#define MC_CMD_NVRAM_WRITE_IN_LENMAX 255 +#define MC_CMD_NVRAM_WRITE_IN_LENMAX 252 #define MC_CMD_NVRAM_WRITE_IN_LEN(num) (12+1*(num)) #define MC_CMD_NVRAM_WRITE_IN_TYPE_OFST 0 /* Enum values, see field(s): */ @@ -1973,7 +1974,7 @@ #define MC_CMD_NVRAM_WRITE_IN_WRITE_BUFFER_OFST 12 #define MC_CMD_NVRAM_WRITE_IN_WRITE_BUFFER_LEN 1 #define MC_CMD_NVRAM_WRITE_IN_WRITE_BUFFER_MINNUM 1 -#define MC_CMD_NVRAM_WRITE_IN_WRITE_BUFFER_MAXNUM 243 +#define MC_CMD_NVRAM_WRITE_IN_WRITE_BUFFER_MAXNUM 240 /* MC_CMD_NVRAM_WRITE_OUT msgresponse */ #define MC_CMD_NVRAM_WRITE_OUT_LEN 0 @@ -2305,13 +2306,13 @@ /* MC_CMD_GET_PHY_MEDIA_INFO_OUT msgresponse */ #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_LENMIN 5 -#define MC_CMD_GET_PHY_MEDIA_INFO_OUT_LENMAX 255 +#define MC_CMD_GET_PHY_MEDIA_INFO_OUT_LENMAX 252 #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_LEN(num) (4+1*(num)) #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATALEN_OFST 0 #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_OFST 4 #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_LEN 1 #define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_MINNUM 1 -#define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_MAXNUM 251 +#define MC_CMD_GET_PHY_MEDIA_INFO_OUT_DATA_MAXNUM 248 /***********************************/ diff --git a/drivers/net/ethernet/sfc/mtd.c b/drivers/net/ethernet/sfc/mtd.c index 758148379b0e..08f825b71ac8 100644 --- a/drivers/net/ethernet/sfc/mtd.c +++ b/drivers/net/ethernet/sfc/mtd.c @@ -585,6 +585,7 @@ static const struct siena_nvram_type_info siena_nvram_types[] = { [MC_CMD_NVRAM_TYPE_EXP_ROM_CFG_PORT1] = { 1, "sfc_exp_rom_cfg" }, [MC_CMD_NVRAM_TYPE_PHY_PORT0] = { 0, "sfc_phy_fw" }, [MC_CMD_NVRAM_TYPE_PHY_PORT1] = { 1, "sfc_phy_fw" }, + [MC_CMD_NVRAM_TYPE_FPGA] = { 0, "sfc_fpga" }, }; static int siena_mtd_probe_partition(struct efx_nic *efx, @@ -598,7 +599,8 @@ static int siena_mtd_probe_partition(struct efx_nic *efx, bool protected; int rc; - if (type >= ARRAY_SIZE(siena_nvram_types)) + if (type >= ARRAY_SIZE(siena_nvram_types) || + siena_nvram_types[type].name == NULL) return -ENODEV; info = &siena_nvram_types[type]; @@ -627,7 +629,8 @@ static int siena_mtd_get_fw_subtypes(struct efx_nic *efx, struct efx_mtd *efx_mtd) { struct efx_mtd_partition *part; - uint16_t fw_subtype_list[MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MINNUM]; + uint16_t fw_subtype_list[ + MC_CMD_GET_BOARD_CFG_OUT_FW_SUBTYPE_LIST_MAXNUM]; int rc; rc = efx_mcdi_get_board_cfg(efx, NULL, fw_subtype_list, NULL); diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index cd9c0a989692..c1a010cda89b 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -37,7 +37,7 @@ * **************************************************************************/ -#define EFX_DRIVER_VERSION "3.1" +#define EFX_DRIVER_VERSION "3.2" #ifdef DEBUG #define EFX_BUG_ON_PARANOID(x) BUG_ON(x) @@ -56,7 +56,8 @@ #define EFX_MAX_CHANNELS 32U #define EFX_MAX_RX_QUEUES EFX_MAX_CHANNELS #define EFX_EXTRA_CHANNEL_IOV 0 -#define EFX_MAX_EXTRA_CHANNELS 1U +#define EFX_EXTRA_CHANNEL_PTP 1 +#define EFX_MAX_EXTRA_CHANNELS 2U /* Checksum generation is a per-queue option in hardware, so each * queue visible to the networking core is backed by two hardware TX @@ -68,6 +69,9 @@ #define EFX_TXQ_TYPES 4 #define EFX_MAX_TX_QUEUES (EFX_TXQ_TYPES * EFX_MAX_CHANNELS) +/* Forward declare Precision Time Protocol (PTP) support structure. */ +struct efx_ptp_data; + struct efx_self_tests; /** @@ -91,29 +95,31 @@ struct efx_special_buffer { }; /** - * struct efx_tx_buffer - An Efx TX buffer - * @skb: The associated socket buffer. - * Set only on the final fragment of a packet; %NULL for all other - * fragments. When this fragment completes, then we can free this - * skb. - * @tsoh: The associated TSO header structure, or %NULL if this - * buffer is not a TSO header. + * struct efx_tx_buffer - buffer state for a TX descriptor + * @skb: When @flags & %EFX_TX_BUF_SKB, the associated socket buffer to be + * freed when descriptor completes + * @heap_buf: When @flags & %EFX_TX_BUF_HEAP, the associated heap buffer to be + * freed when descriptor completes. * @dma_addr: DMA address of the fragment. + * @flags: Flags for allocation and DMA mapping type * @len: Length of this fragment. * This field is zero when the queue slot is empty. - * @continuation: True if this fragment is not the end of a packet. - * @unmap_single: True if dma_unmap_single should be used. * @unmap_len: Length of this fragment to unmap */ struct efx_tx_buffer { - const struct sk_buff *skb; - struct efx_tso_header *tsoh; + union { + const struct sk_buff *skb; + void *heap_buf; + }; dma_addr_t dma_addr; + unsigned short flags; unsigned short len; - bool continuation; - bool unmap_single; unsigned short unmap_len; }; +#define EFX_TX_BUF_CONT 1 /* not last descriptor of packet */ +#define EFX_TX_BUF_SKB 2 /* buffer is last part of skb */ +#define EFX_TX_BUF_HEAP 4 /* buffer was allocated with kmalloc() */ +#define EFX_TX_BUF_MAP_SINGLE 8 /* buffer was mapped with dma_map_single() */ /** * struct efx_tx_queue - An Efx TX queue @@ -133,6 +139,7 @@ struct efx_tx_buffer { * @channel: The associated channel * @core_txq: The networking core TX queue structure * @buffer: The software buffer ring + * @tsoh_page: Array of pages of TSO header buffers * @txd: The hardware descriptor ring * @ptr_mask: The size of the ring minus 1. * @initialised: Has hardware queue been initialised? @@ -156,9 +163,6 @@ struct efx_tx_buffer { * variable indicates that the queue is full. This is to * avoid cache-line ping-pong between the xmit path and the * completion path. - * @tso_headers_free: A list of TSO headers allocated for this TX queue - * that are not in use, and so available for new TSO sends. The list - * is protected by the TX queue lock. * @tso_bursts: Number of times TSO xmit invoked by kernel * @tso_long_headers: Number of packets with headers too long for standard * blocks @@ -175,6 +179,7 @@ struct efx_tx_queue { struct efx_channel *channel; struct netdev_queue *core_txq; struct efx_tx_buffer *buffer; + struct efx_buffer *tsoh_page; struct efx_special_buffer txd; unsigned int ptr_mask; bool initialised; @@ -187,7 +192,6 @@ struct efx_tx_queue { unsigned int insert_count ____cacheline_aligned_in_smp; unsigned int write_count; unsigned int old_read_count; - struct efx_tso_header *tso_headers_free; unsigned int tso_bursts; unsigned int tso_long_headers; unsigned int tso_packets; @@ -242,6 +246,8 @@ struct efx_rx_page_state { /** * struct efx_rx_queue - An Efx RX queue * @efx: The associated Efx NIC + * @core_index: Index of network core RX queue. Will be >= 0 iff this + * is associated with a real RX queue. * @buffer: The software buffer ring * @rxd: The hardware descriptor ring * @ptr_mask: The size of the ring minus 1. @@ -263,6 +269,7 @@ struct efx_rx_page_state { */ struct efx_rx_queue { struct efx_nic *efx; + int core_index; struct efx_rx_buffer *buffer; struct efx_special_buffer rxd; unsigned int ptr_mask; @@ -390,14 +397,17 @@ struct efx_channel { * @get_name: Generate the channel's name (used for its IRQ handler) * @copy: Copy the channel state prior to reallocation. May be %NULL if * reallocation is not supported. + * @receive_skb: Handle an skb ready to be passed to netif_receive_skb() * @keep_eventq: Flag for whether event queue should be kept initialised * while the device is stopped */ struct efx_channel_type { void (*handle_no_channel)(struct efx_nic *); int (*pre_probe)(struct efx_channel *); + void (*post_remove)(struct efx_channel *); void (*get_name)(struct efx_channel *, char *buf, size_t len); struct efx_channel *(*copy)(const struct efx_channel *); + void (*receive_skb)(struct efx_channel *, struct sk_buff *); bool keep_eventq; }; @@ -430,11 +440,9 @@ enum efx_int_mode { #define EFX_INT_MODE_USE_MSI(x) (((x)->interrupt_mode) <= EFX_INT_MODE_MSI) enum nic_state { - STATE_INIT = 0, - STATE_RUNNING = 1, - STATE_FINI = 2, - STATE_DISABLED = 3, - STATE_MAX, + STATE_UNINIT = 0, /* device being probed/removed or is frozen */ + STATE_READY = 1, /* hardware ready and netdev registered */ + STATE_DISABLED = 2, /* device disabled due to hardware errors */ }; /* @@ -654,7 +662,7 @@ struct vfdi_status; * @irq_rx_adaptive: Adaptive IRQ moderation enabled for RX event queues * @irq_rx_moderation: IRQ moderation time for RX event queues * @msg_enable: Log message enable flags - * @state: Device state flag. Serialised by the rtnl_lock. + * @state: Device state number (%STATE_*). Serialised by the rtnl_lock. * @reset_pending: Bitmask for pending resets * @tx_queue: TX DMA queues * @rx_queue: RX DMA queues @@ -664,6 +672,8 @@ struct vfdi_status; * should be allocated for this NIC * @rxq_entries: Size of receive queues requested by user. * @txq_entries: Size of transmit queues requested by user. + * @txq_stop_thresh: TX queue fill level at or above which we stop it. + * @txq_wake_thresh: TX queue fill level at or below which we wake it. * @tx_dc_base: Base qword address in SRAM of TX queue descriptor caches * @rx_dc_base: Base qword address in SRAM of RX queue descriptor caches * @sram_lim_qw: Qword address limit of SRAM @@ -730,6 +740,7 @@ struct vfdi_status; * %local_addr_list. Protected by %local_lock. * @local_lock: Mutex protecting %local_addr_list and %local_page_list. * @peer_work: Work item to broadcast peer addresses to VMs. + * @ptp_data: PTP state data * @monitor_work: Hardware monitor workitem * @biu_lock: BIU (bus interface unit) lock * @last_irq_cpu: Last CPU to handle a possible test interrupt. This @@ -774,6 +785,9 @@ struct efx_nic { unsigned rxq_entries; unsigned txq_entries; + unsigned int txq_stop_thresh; + unsigned int txq_wake_thresh; + unsigned tx_dc_base; unsigned rx_dc_base; unsigned sram_lim_qw; @@ -854,6 +868,10 @@ struct efx_nic { struct work_struct peer_work; #endif +#ifdef CONFIG_SFC_PTP + struct efx_ptp_data *ptp_data; +#endif + /* The following fields may be written more often */ struct delayed_work monitor_work ____cacheline_aligned_in_smp; @@ -1044,7 +1062,7 @@ static inline bool efx_tx_queue_used(struct efx_tx_queue *tx_queue) static inline bool efx_channel_has_rx_queue(struct efx_channel *channel) { - return channel->channel < channel->efx->n_rx_channels; + return channel->rx_queue.core_index >= 0; } static inline struct efx_rx_queue * @@ -1116,5 +1134,13 @@ static inline void clear_bit_le(unsigned nr, unsigned char *addr) #define EFX_MAX_FRAME_LEN(mtu) \ ((((mtu) + ETH_HLEN + VLAN_HLEN + 4/* FCS */ + 7) & ~7) + 16) +static inline bool efx_xmit_with_hwtstamp(struct sk_buff *skb) +{ + return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP; +} +static inline void efx_xmit_hwtstamp_pending(struct sk_buff *skb) +{ + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; +} #endif /* EFX_NET_DRIVER_H */ diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c index 326d799762d6..cdff40b65729 100644 --- a/drivers/net/ethernet/sfc/nic.c +++ b/drivers/net/ethernet/sfc/nic.c @@ -298,7 +298,7 @@ efx_free_special_buffer(struct efx_nic *efx, struct efx_special_buffer *buffer) /************************************************************************** * * Generic buffer handling - * These buffers are used for interrupt status and MAC stats + * These buffers are used for interrupt status, MAC stats, etc. * **************************************************************************/ @@ -401,8 +401,10 @@ void efx_nic_push_buffers(struct efx_tx_queue *tx_queue) ++tx_queue->write_count; /* Create TX descriptor ring entry */ + BUILD_BUG_ON(EFX_TX_BUF_CONT != 1); EFX_POPULATE_QWORD_4(*txd, - FSF_AZ_TX_KER_CONT, buffer->continuation, + FSF_AZ_TX_KER_CONT, + buffer->flags & EFX_TX_BUF_CONT, FSF_AZ_TX_KER_BYTE_COUNT, buffer->len, FSF_AZ_TX_KER_BUF_REGION, 0, FSF_AZ_TX_KER_BUF_ADDR, buffer->dma_addr); diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index bab5cd9f5740..438cef11f727 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -11,6 +11,7 @@ #ifndef EFX_NIC_H #define EFX_NIC_H +#include <linux/net_tstamp.h> #include <linux/i2c-algo-bit.h> #include "net_driver.h" #include "efx.h" @@ -250,6 +251,41 @@ extern int efx_sriov_get_vf_config(struct net_device *dev, int vf, extern int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf, bool spoofchk); +struct ethtool_ts_info; +#ifdef CONFIG_SFC_PTP +extern void efx_ptp_probe(struct efx_nic *efx); +extern int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd); +extern int efx_ptp_get_ts_info(struct net_device *net_dev, + struct ethtool_ts_info *ts_info); +extern bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb); +extern int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb); +extern void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev); +#else +static inline void efx_ptp_probe(struct efx_nic *efx) {} +static inline int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd) +{ + return -EOPNOTSUPP; +} +static inline int efx_ptp_get_ts_info(struct net_device *net_dev, + struct ethtool_ts_info *ts_info) +{ + ts_info->so_timestamping = (SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE); + ts_info->phc_index = -1; + + return 0; +} +static inline bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb) +{ + return false; +} +static inline int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb) +{ + return NETDEV_TX_OK; +} +static inline void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev) {} +#endif + extern const struct efx_nic_type falcon_a1_nic_type; extern const struct efx_nic_type falcon_b0_nic_type; extern const struct efx_nic_type siena_a0_nic_type; diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c new file mode 100644 index 000000000000..5b3dd028ce85 --- /dev/null +++ b/drivers/net/ethernet/sfc/ptp.c @@ -0,0 +1,1484 @@ +/**************************************************************************** + * Driver for Solarflare Solarstorm network controllers and boards + * Copyright 2011 Solarflare Communications Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation, incorporated herein by reference. + */ + +/* Theory of operation: + * + * PTP support is assisted by firmware running on the MC, which provides + * the hardware timestamping capabilities. Both transmitted and received + * PTP event packets are queued onto internal queues for subsequent processing; + * this is because the MC operations are relatively long and would block + * block NAPI/interrupt operation. + * + * Receive event processing: + * The event contains the packet's UUID and sequence number, together + * with the hardware timestamp. The PTP receive packet queue is searched + * for this UUID/sequence number and, if found, put on a pending queue. + * Packets not matching are delivered without timestamps (MCDI events will + * always arrive after the actual packet). + * It is important for the operation of the PTP protocol that the ordering + * of packets between the event and general port is maintained. + * + * Work queue processing: + * If work waiting, synchronise host/hardware time + * + * Transmit: send packet through MC, which returns the transmission time + * that is converted to an appropriate timestamp. + * + * Receive: the packet's reception time is converted to an appropriate + * timestamp. + */ +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/time.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/net_tstamp.h> +#include <linux/pps_kernel.h> +#include <linux/ptp_clock_kernel.h> +#include "net_driver.h" +#include "efx.h" +#include "mcdi.h" +#include "mcdi_pcol.h" +#include "io.h" +#include "regs.h" +#include "nic.h" + +/* Maximum number of events expected to make up a PTP event */ +#define MAX_EVENT_FRAGS 3 + +/* Maximum delay, ms, to begin synchronisation */ +#define MAX_SYNCHRONISE_WAIT_MS 2 + +/* How long, at most, to spend synchronising */ +#define SYNCHRONISE_PERIOD_NS 250000 + +/* How often to update the shared memory time */ +#define SYNCHRONISATION_GRANULARITY_NS 200 + +/* Minimum permitted length of a (corrected) synchronisation time */ +#define MIN_SYNCHRONISATION_NS 120 + +/* Maximum permitted length of a (corrected) synchronisation time */ +#define MAX_SYNCHRONISATION_NS 1000 + +/* How many (MC) receive events that can be queued */ +#define MAX_RECEIVE_EVENTS 8 + +/* Length of (modified) moving average. */ +#define AVERAGE_LENGTH 16 + +/* How long an unmatched event or packet can be held */ +#define PKT_EVENT_LIFETIME_MS 10 + +/* Offsets into PTP packet for identification. These offsets are from the + * start of the IP header, not the MAC header. Note that neither PTP V1 nor + * PTP V2 permit the use of IPV4 options. + */ +#define PTP_DPORT_OFFSET 22 + +#define PTP_V1_VERSION_LENGTH 2 +#define PTP_V1_VERSION_OFFSET 28 + +#define PTP_V1_UUID_LENGTH 6 +#define PTP_V1_UUID_OFFSET 50 + +#define PTP_V1_SEQUENCE_LENGTH 2 +#define PTP_V1_SEQUENCE_OFFSET 58 + +/* The minimum length of a PTP V1 packet for offsets, etc. to be valid: + * includes IP header. + */ +#define PTP_V1_MIN_LENGTH 64 + +#define PTP_V2_VERSION_LENGTH 1 +#define PTP_V2_VERSION_OFFSET 29 + +/* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2), + * the MC only captures the last six bytes of the clock identity. These values + * reflect those, not the ones used in the standard. The standard permits + * mapping of V1 UUIDs to V2 UUIDs with these same values. + */ +#define PTP_V2_MC_UUID_LENGTH 6 +#define PTP_V2_MC_UUID_OFFSET 50 + +#define PTP_V2_SEQUENCE_LENGTH 2 +#define PTP_V2_SEQUENCE_OFFSET 58 + +/* The minimum length of a PTP V2 packet for offsets, etc. to be valid: + * includes IP header. + */ +#define PTP_V2_MIN_LENGTH 63 + +#define PTP_MIN_LENGTH 63 + +#define PTP_ADDRESS 0xe0000181 /* 224.0.1.129 */ +#define PTP_EVENT_PORT 319 +#define PTP_GENERAL_PORT 320 + +/* Annoyingly the format of the version numbers are different between + * versions 1 and 2 so it isn't possible to simply look for 1 or 2. + */ +#define PTP_VERSION_V1 1 + +#define PTP_VERSION_V2 2 +#define PTP_VERSION_V2_MASK 0x0f + +enum ptp_packet_state { + PTP_PACKET_STATE_UNMATCHED = 0, + PTP_PACKET_STATE_MATCHED, + PTP_PACKET_STATE_TIMED_OUT, + PTP_PACKET_STATE_MATCH_UNWANTED +}; + +/* NIC synchronised with single word of time only comprising + * partial seconds and full nanoseconds: 10^9 ~ 2^30 so 2 bits for seconds. + */ +#define MC_NANOSECOND_BITS 30 +#define MC_NANOSECOND_MASK ((1 << MC_NANOSECOND_BITS) - 1) +#define MC_SECOND_MASK ((1 << (32 - MC_NANOSECOND_BITS)) - 1) + +/* Maximum parts-per-billion adjustment that is acceptable */ +#define MAX_PPB 1000000 + +/* Number of bits required to hold the above */ +#define MAX_PPB_BITS 20 + +/* Number of extra bits allowed when calculating fractional ns. + * EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS + MAX_PPB_BITS should + * be less than 63. + */ +#define PPB_EXTRA_BITS 2 + +/* Precalculate scale word to avoid long long division at runtime */ +#define PPB_SCALE_WORD ((1LL << (PPB_EXTRA_BITS + MC_CMD_PTP_IN_ADJUST_BITS +\ + MAX_PPB_BITS)) / 1000000000LL) + +#define PTP_SYNC_ATTEMPTS 4 + +/** + * struct efx_ptp_match - Matching structure, stored in sk_buff's cb area. + * @words: UUID and (partial) sequence number + * @expiry: Time after which the packet should be delivered irrespective of + * event arrival. + * @state: The state of the packet - whether it is ready for processing or + * whether that is of no interest. + */ +struct efx_ptp_match { + u32 words[DIV_ROUND_UP(PTP_V1_UUID_LENGTH, 4)]; + unsigned long expiry; + enum ptp_packet_state state; +}; + +/** + * struct efx_ptp_event_rx - A PTP receive event (from MC) + * @seq0: First part of (PTP) UUID + * @seq1: Second part of (PTP) UUID and sequence number + * @hwtimestamp: Event timestamp + */ +struct efx_ptp_event_rx { + struct list_head link; + u32 seq0; + u32 seq1; + ktime_t hwtimestamp; + unsigned long expiry; +}; + +/** + * struct efx_ptp_timeset - Synchronisation between host and MC + * @host_start: Host time immediately before hardware timestamp taken + * @seconds: Hardware timestamp, seconds + * @nanoseconds: Hardware timestamp, nanoseconds + * @host_end: Host time immediately after hardware timestamp taken + * @waitns: Number of nanoseconds between hardware timestamp being read and + * host end time being seen + * @window: Difference of host_end and host_start + * @valid: Whether this timeset is valid + */ +struct efx_ptp_timeset { + u32 host_start; + u32 seconds; + u32 nanoseconds; + u32 host_end; + u32 waitns; + u32 window; /* Derived: end - start, allowing for wrap */ +}; + +/** + * struct efx_ptp_data - Precision Time Protocol (PTP) state + * @channel: The PTP channel + * @rxq: Receive queue (awaiting timestamps) + * @txq: Transmit queue + * @evt_list: List of MC receive events awaiting packets + * @evt_free_list: List of free events + * @evt_lock: Lock for manipulating evt_list and evt_free_list + * @rx_evts: Instantiated events (on evt_list and evt_free_list) + * @workwq: Work queue for processing pending PTP operations + * @work: Work task + * @reset_required: A serious error has occurred and the PTP task needs to be + * reset (disable, enable). + * @rxfilter_event: Receive filter when operating + * @rxfilter_general: Receive filter when operating + * @config: Current timestamp configuration + * @enabled: PTP operation enabled + * @mode: Mode in which PTP operating (PTP version) + * @evt_frags: Partly assembled PTP events + * @evt_frag_idx: Current fragment number + * @evt_code: Last event code + * @start: Address at which MC indicates ready for synchronisation + * @host_time_pps: Host time at last PPS + * @last_sync_ns: Last number of nanoseconds between readings when synchronising + * @base_sync_ns: Number of nanoseconds for last synchronisation. + * @base_sync_valid: Whether base_sync_time is valid. + * @current_adjfreq: Current ppb adjustment. + * @phc_clock: Pointer to registered phc device + * @phc_clock_info: Registration structure for phc device + * @pps_work: pps work task for handling pps events + * @pps_workwq: pps work queue + * @nic_ts_enabled: Flag indicating if NIC generated TS events are handled + * @txbuf: Buffer for use when transmitting (PTP) packets to MC (avoids + * allocations in main data path). + * @debug_ptp_dir: PTP debugfs directory + * @missed_rx_sync: Number of packets received without syncrhonisation. + * @good_syncs: Number of successful synchronisations. + * @no_time_syncs: Number of synchronisations with no good times. + * @bad_sync_durations: Number of synchronisations with bad durations. + * @bad_syncs: Number of failed synchronisations. + * @last_sync_time: Number of nanoseconds for last synchronisation. + * @sync_timeouts: Number of synchronisation timeouts + * @fast_syncs: Number of synchronisations requiring short delay + * @min_sync_delta: Minimum time between event and synchronisation + * @max_sync_delta: Maximum time between event and synchronisation + * @average_sync_delta: Average time between event and synchronisation. + * Modified moving average. + * @last_sync_delta: Last time between event and synchronisation + * @mc_stats: Context value for MC statistics + * @timeset: Last set of synchronisation statistics. + */ +struct efx_ptp_data { + struct efx_channel *channel; + struct sk_buff_head rxq; + struct sk_buff_head txq; + struct list_head evt_list; + struct list_head evt_free_list; + spinlock_t evt_lock; + struct efx_ptp_event_rx rx_evts[MAX_RECEIVE_EVENTS]; + struct workqueue_struct *workwq; + struct work_struct work; + bool reset_required; + u32 rxfilter_event; + u32 rxfilter_general; + bool rxfilter_installed; + struct hwtstamp_config config; + bool enabled; + unsigned int mode; + efx_qword_t evt_frags[MAX_EVENT_FRAGS]; + int evt_frag_idx; + int evt_code; + struct efx_buffer start; + struct pps_event_time host_time_pps; + unsigned last_sync_ns; + unsigned base_sync_ns; + bool base_sync_valid; + s64 current_adjfreq; + struct ptp_clock *phc_clock; + struct ptp_clock_info phc_clock_info; + struct work_struct pps_work; + struct workqueue_struct *pps_workwq; + bool nic_ts_enabled; + u8 txbuf[ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN( + MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM), 4)]; + struct efx_ptp_timeset + timeset[MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_MAXNUM]; +}; + +static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta); +static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta); +static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts); +static int efx_phc_settime(struct ptp_clock_info *ptp, + const struct timespec *e_ts); +static int efx_phc_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *request, int on); + +/* Enable MCDI PTP support. */ +static int efx_ptp_enable(struct efx_nic *efx) +{ + u8 inbuf[MC_CMD_PTP_IN_ENABLE_LEN]; + + MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ENABLE); + MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_QUEUE, + efx->ptp_data->channel->channel); + MCDI_SET_DWORD(inbuf, PTP_IN_ENABLE_MODE, efx->ptp_data->mode); + + return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf), + NULL, 0, NULL); +} + +/* Disable MCDI PTP support. + * + * Note that this function should never rely on the presence of ptp_data - + * may be called before that exists. + */ +static int efx_ptp_disable(struct efx_nic *efx) +{ + u8 inbuf[MC_CMD_PTP_IN_DISABLE_LEN]; + + MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_DISABLE); + return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf), + NULL, 0, NULL); +} + +static void efx_ptp_deliver_rx_queue(struct sk_buff_head *q) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(q))) { + local_bh_disable(); + netif_receive_skb(skb); + local_bh_enable(); + } +} + +static void efx_ptp_handle_no_channel(struct efx_nic *efx) +{ + netif_err(efx, drv, efx->net_dev, + "ERROR: PTP requires MSI-X and 1 additional interrupt" + "vector. PTP disabled\n"); +} + +/* Repeatedly send the host time to the MC which will capture the hardware + * time. + */ +static void efx_ptp_send_times(struct efx_nic *efx, + struct pps_event_time *last_time) +{ + struct pps_event_time now; + struct timespec limit; + struct efx_ptp_data *ptp = efx->ptp_data; + struct timespec start; + int *mc_running = ptp->start.addr; + + pps_get_ts(&now); + start = now.ts_real; + limit = now.ts_real; + timespec_add_ns(&limit, SYNCHRONISE_PERIOD_NS); + + /* Write host time for specified period or until MC is done */ + while ((timespec_compare(&now.ts_real, &limit) < 0) && + ACCESS_ONCE(*mc_running)) { + struct timespec update_time; + unsigned int host_time; + + /* Don't update continuously to avoid saturating the PCIe bus */ + update_time = now.ts_real; + timespec_add_ns(&update_time, SYNCHRONISATION_GRANULARITY_NS); + do { + pps_get_ts(&now); + } while ((timespec_compare(&now.ts_real, &update_time) < 0) && + ACCESS_ONCE(*mc_running)); + + /* Synchronise NIC with single word of time only */ + host_time = (now.ts_real.tv_sec << MC_NANOSECOND_BITS | + now.ts_real.tv_nsec); + /* Update host time in NIC memory */ + _efx_writed(efx, cpu_to_le32(host_time), + FR_CZ_MC_TREG_SMEM + MC_SMEM_P0_PTP_TIME_OFST); + } + *last_time = now; +} + +/* Read a timeset from the MC's results and partial process. */ +static void efx_ptp_read_timeset(u8 *data, struct efx_ptp_timeset *timeset) +{ + unsigned start_ns, end_ns; + + timeset->host_start = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTSTART); + timeset->seconds = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_SECONDS); + timeset->nanoseconds = MCDI_DWORD(data, + PTP_OUT_SYNCHRONIZE_NANOSECONDS); + timeset->host_end = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_HOSTEND), + timeset->waitns = MCDI_DWORD(data, PTP_OUT_SYNCHRONIZE_WAITNS); + + /* Ignore seconds */ + start_ns = timeset->host_start & MC_NANOSECOND_MASK; + end_ns = timeset->host_end & MC_NANOSECOND_MASK; + /* Allow for rollover */ + if (end_ns < start_ns) + end_ns += NSEC_PER_SEC; + /* Determine duration of operation */ + timeset->window = end_ns - start_ns; +} + +/* Process times received from MC. + * + * Extract times from returned results, and establish the minimum value + * seen. The minimum value represents the "best" possible time and events + * too much greater than this are rejected - the machine is, perhaps, too + * busy. A number of readings are taken so that, hopefully, at least one good + * synchronisation will be seen in the results. + */ +static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf, + size_t response_length, + const struct pps_event_time *last_time) +{ + unsigned number_readings = (response_length / + MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN); + unsigned i; + unsigned min; + unsigned min_set = 0; + unsigned total; + unsigned ngood = 0; + unsigned last_good = 0; + struct efx_ptp_data *ptp = efx->ptp_data; + bool min_valid = false; + u32 last_sec; + u32 start_sec; + struct timespec delta; + + if (number_readings == 0) + return -EAGAIN; + + /* Find minimum value in this set of results, discarding clearly + * erroneous results. + */ + for (i = 0; i < number_readings; i++) { + efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]); + synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN; + if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) { + if (min_valid) { + if (ptp->timeset[i].window < min_set) + min_set = ptp->timeset[i].window; + } else { + min_valid = true; + min_set = ptp->timeset[i].window; + } + } + } + + if (min_valid) { + if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns)) + min = ptp->base_sync_ns; + else + min = min_set; + } else { + min = SYNCHRONISATION_GRANULARITY_NS; + } + + /* Discard excessively long synchronise durations. The MC times + * when it finishes reading the host time so the corrected window + * time should be fairly constant for a given platform. + */ + total = 0; + for (i = 0; i < number_readings; i++) + if (ptp->timeset[i].window > ptp->timeset[i].waitns) { + unsigned win; + + win = ptp->timeset[i].window - ptp->timeset[i].waitns; + if (win >= MIN_SYNCHRONISATION_NS && + win < MAX_SYNCHRONISATION_NS) { + total += ptp->timeset[i].window; + ngood++; + last_good = i; + } + } + + if (ngood == 0) { + netif_warn(efx, drv, efx->net_dev, + "PTP no suitable synchronisations %dns %dns\n", + ptp->base_sync_ns, min_set); + return -EAGAIN; + } + + /* Average minimum this synchronisation */ + ptp->last_sync_ns = DIV_ROUND_UP(total, ngood); + if (!ptp->base_sync_valid || (ptp->last_sync_ns < ptp->base_sync_ns)) { + ptp->base_sync_valid = true; + ptp->base_sync_ns = ptp->last_sync_ns; + } + + /* Calculate delay from actual PPS to last_time */ + delta.tv_nsec = + ptp->timeset[last_good].nanoseconds + + last_time->ts_real.tv_nsec - + (ptp->timeset[last_good].host_start & MC_NANOSECOND_MASK); + + /* It is possible that the seconds rolled over between taking + * the start reading and the last value written by the host. The + * timescales are such that a gap of more than one second is never + * expected. + */ + start_sec = ptp->timeset[last_good].host_start >> MC_NANOSECOND_BITS; + last_sec = last_time->ts_real.tv_sec & MC_SECOND_MASK; + if (start_sec != last_sec) { + if (((start_sec + 1) & MC_SECOND_MASK) != last_sec) { + netif_warn(efx, hw, efx->net_dev, + "PTP bad synchronisation seconds\n"); + return -EAGAIN; + } else { + delta.tv_sec = 1; + } + } else { + delta.tv_sec = 0; + } + + ptp->host_time_pps = *last_time; + pps_sub_ts(&ptp->host_time_pps, delta); + + return 0; +} + +/* Synchronize times between the host and the MC */ +static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + u8 synch_buf[MC_CMD_PTP_OUT_SYNCHRONIZE_LENMAX]; + size_t response_length; + int rc; + unsigned long timeout; + struct pps_event_time last_time = {}; + unsigned int loops = 0; + int *start = ptp->start.addr; + + MCDI_SET_DWORD(synch_buf, PTP_IN_OP, MC_CMD_PTP_OP_SYNCHRONIZE); + MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_NUMTIMESETS, + num_readings); + MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_LO, + (u32)ptp->start.dma_addr); + MCDI_SET_DWORD(synch_buf, PTP_IN_SYNCHRONIZE_START_ADDR_HI, + (u32)((u64)ptp->start.dma_addr >> 32)); + + /* Clear flag that signals MC ready */ + ACCESS_ONCE(*start) = 0; + efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf, + MC_CMD_PTP_IN_SYNCHRONIZE_LEN); + + /* Wait for start from MCDI (or timeout) */ + timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS); + while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) { + udelay(20); /* Usually start MCDI execution quickly */ + loops++; + } + + if (ACCESS_ONCE(*start)) + efx_ptp_send_times(efx, &last_time); + + /* Collect results */ + rc = efx_mcdi_rpc_finish(efx, MC_CMD_PTP, + MC_CMD_PTP_IN_SYNCHRONIZE_LEN, + synch_buf, sizeof(synch_buf), + &response_length); + if (rc == 0) + rc = efx_ptp_process_times(efx, synch_buf, response_length, + &last_time); + + return rc; +} + +/* Transmit a PTP packet, via the MCDI interface, to the wire. */ +static int efx_ptp_xmit_skb(struct efx_nic *efx, struct sk_buff *skb) +{ + u8 *txbuf = efx->ptp_data->txbuf; + struct skb_shared_hwtstamps timestamps; + int rc = -EIO; + /* MCDI driver requires word aligned lengths */ + size_t len = ALIGN(MC_CMD_PTP_IN_TRANSMIT_LEN(skb->len), 4); + u8 txtime[MC_CMD_PTP_OUT_TRANSMIT_LEN]; + + MCDI_SET_DWORD(txbuf, PTP_IN_OP, MC_CMD_PTP_OP_TRANSMIT); + MCDI_SET_DWORD(txbuf, PTP_IN_TRANSMIT_LENGTH, skb->len); + if (skb_shinfo(skb)->nr_frags != 0) { + rc = skb_linearize(skb); + if (rc != 0) + goto fail; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + rc = skb_checksum_help(skb); + if (rc != 0) + goto fail; + } + skb_copy_from_linear_data(skb, + &txbuf[MC_CMD_PTP_IN_TRANSMIT_PACKET_OFST], + len); + rc = efx_mcdi_rpc(efx, MC_CMD_PTP, txbuf, len, txtime, + sizeof(txtime), &len); + if (rc != 0) + goto fail; + + memset(×tamps, 0, sizeof(timestamps)); + timestamps.hwtstamp = ktime_set( + MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_SECONDS), + MCDI_DWORD(txtime, PTP_OUT_TRANSMIT_NANOSECONDS)); + + skb_tstamp_tx(skb, ×tamps); + + rc = 0; + +fail: + dev_kfree_skb(skb); + + return rc; +} + +static void efx_ptp_drop_time_expired_events(struct efx_nic *efx) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + struct list_head *cursor; + struct list_head *next; + + /* Drop time-expired events */ + spin_lock_bh(&ptp->evt_lock); + if (!list_empty(&ptp->evt_list)) { + list_for_each_safe(cursor, next, &ptp->evt_list) { + struct efx_ptp_event_rx *evt; + + evt = list_entry(cursor, struct efx_ptp_event_rx, + link); + if (time_after(jiffies, evt->expiry)) { + list_del(&evt->link); + list_add(&evt->link, &ptp->evt_free_list); + netif_warn(efx, hw, efx->net_dev, + "PTP rx event dropped\n"); + } + } + } + spin_unlock_bh(&ptp->evt_lock); +} + +static enum ptp_packet_state efx_ptp_match_rx(struct efx_nic *efx, + struct sk_buff *skb) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + bool evts_waiting; + struct list_head *cursor; + struct list_head *next; + struct efx_ptp_match *match; + enum ptp_packet_state rc = PTP_PACKET_STATE_UNMATCHED; + + spin_lock_bh(&ptp->evt_lock); + evts_waiting = !list_empty(&ptp->evt_list); + spin_unlock_bh(&ptp->evt_lock); + + if (!evts_waiting) + return PTP_PACKET_STATE_UNMATCHED; + + match = (struct efx_ptp_match *)skb->cb; + /* Look for a matching timestamp in the event queue */ + spin_lock_bh(&ptp->evt_lock); + list_for_each_safe(cursor, next, &ptp->evt_list) { + struct efx_ptp_event_rx *evt; + + evt = list_entry(cursor, struct efx_ptp_event_rx, link); + if ((evt->seq0 == match->words[0]) && + (evt->seq1 == match->words[1])) { + struct skb_shared_hwtstamps *timestamps; + + /* Match - add in hardware timestamp */ + timestamps = skb_hwtstamps(skb); + timestamps->hwtstamp = evt->hwtimestamp; + + match->state = PTP_PACKET_STATE_MATCHED; + rc = PTP_PACKET_STATE_MATCHED; + list_del(&evt->link); + list_add(&evt->link, &ptp->evt_free_list); + break; + } + } + spin_unlock_bh(&ptp->evt_lock); + + return rc; +} + +/* Process any queued receive events and corresponding packets + * + * q is returned with all the packets that are ready for delivery. + * true is returned if at least one of those packets requires + * synchronisation. + */ +static bool efx_ptp_process_events(struct efx_nic *efx, struct sk_buff_head *q) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + bool rc = false; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&ptp->rxq))) { + struct efx_ptp_match *match; + + match = (struct efx_ptp_match *)skb->cb; + if (match->state == PTP_PACKET_STATE_MATCH_UNWANTED) { + __skb_queue_tail(q, skb); + } else if (efx_ptp_match_rx(efx, skb) == + PTP_PACKET_STATE_MATCHED) { + rc = true; + __skb_queue_tail(q, skb); + } else if (time_after(jiffies, match->expiry)) { + match->state = PTP_PACKET_STATE_TIMED_OUT; + netif_warn(efx, rx_err, efx->net_dev, + "PTP packet - no timestamp seen\n"); + __skb_queue_tail(q, skb); + } else { + /* Replace unprocessed entry and stop */ + skb_queue_head(&ptp->rxq, skb); + break; + } + } + + return rc; +} + +/* Complete processing of a received packet */ +static inline void efx_ptp_process_rx(struct efx_nic *efx, struct sk_buff *skb) +{ + local_bh_disable(); + netif_receive_skb(skb); + local_bh_enable(); +} + +static int efx_ptp_start(struct efx_nic *efx) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + struct efx_filter_spec rxfilter; + int rc; + + ptp->reset_required = false; + + /* Must filter on both event and general ports to ensure + * that there is no packet re-ordering. + */ + efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0, + efx_rx_queue_index( + efx_channel_get_rx_queue(ptp->channel))); + rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP, + htonl(PTP_ADDRESS), + htons(PTP_EVENT_PORT)); + if (rc != 0) + return rc; + + rc = efx_filter_insert_filter(efx, &rxfilter, true); + if (rc < 0) + return rc; + ptp->rxfilter_event = rc; + + efx_filter_init_rx(&rxfilter, EFX_FILTER_PRI_REQUIRED, 0, + efx_rx_queue_index( + efx_channel_get_rx_queue(ptp->channel))); + rc = efx_filter_set_ipv4_local(&rxfilter, IPPROTO_UDP, + htonl(PTP_ADDRESS), + htons(PTP_GENERAL_PORT)); + if (rc != 0) + goto fail; + + rc = efx_filter_insert_filter(efx, &rxfilter, true); + if (rc < 0) + goto fail; + ptp->rxfilter_general = rc; + + rc = efx_ptp_enable(efx); + if (rc != 0) + goto fail2; + + ptp->evt_frag_idx = 0; + ptp->current_adjfreq = 0; + ptp->rxfilter_installed = true; + + return 0; + +fail2: + efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED, + ptp->rxfilter_general); +fail: + efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED, + ptp->rxfilter_event); + + return rc; +} + +static int efx_ptp_stop(struct efx_nic *efx) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + int rc = efx_ptp_disable(efx); + struct list_head *cursor; + struct list_head *next; + + if (ptp->rxfilter_installed) { + efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED, + ptp->rxfilter_general); + efx_filter_remove_id_safe(efx, EFX_FILTER_PRI_REQUIRED, + ptp->rxfilter_event); + ptp->rxfilter_installed = false; + } + + /* Make sure RX packets are really delivered */ + efx_ptp_deliver_rx_queue(&efx->ptp_data->rxq); + skb_queue_purge(&efx->ptp_data->txq); + + /* Drop any pending receive events */ + spin_lock_bh(&efx->ptp_data->evt_lock); + list_for_each_safe(cursor, next, &efx->ptp_data->evt_list) { + list_del(cursor); + list_add(cursor, &efx->ptp_data->evt_free_list); + } + spin_unlock_bh(&efx->ptp_data->evt_lock); + + return rc; +} + +static void efx_ptp_pps_worker(struct work_struct *work) +{ + struct efx_ptp_data *ptp = + container_of(work, struct efx_ptp_data, pps_work); + struct efx_nic *efx = ptp->channel->efx; + struct ptp_clock_event ptp_evt; + + if (efx_ptp_synchronize(efx, PTP_SYNC_ATTEMPTS)) + return; + + ptp_evt.type = PTP_CLOCK_PPSUSR; + ptp_evt.pps_times = ptp->host_time_pps; + ptp_clock_event(ptp->phc_clock, &ptp_evt); +} + +/* Process any pending transmissions and timestamp any received packets. + */ +static void efx_ptp_worker(struct work_struct *work) +{ + struct efx_ptp_data *ptp_data = + container_of(work, struct efx_ptp_data, work); + struct efx_nic *efx = ptp_data->channel->efx; + struct sk_buff *skb; + struct sk_buff_head tempq; + + if (ptp_data->reset_required) { + efx_ptp_stop(efx); + efx_ptp_start(efx); + return; + } + + efx_ptp_drop_time_expired_events(efx); + + __skb_queue_head_init(&tempq); + if (efx_ptp_process_events(efx, &tempq) || + !skb_queue_empty(&ptp_data->txq)) { + + while ((skb = skb_dequeue(&ptp_data->txq))) + efx_ptp_xmit_skb(efx, skb); + } + + while ((skb = __skb_dequeue(&tempq))) + efx_ptp_process_rx(efx, skb); +} + +/* Initialise PTP channel and state. + * + * Setting core_index to zero causes the queue to be initialised and doesn't + * overlap with 'rxq0' because ptp.c doesn't use skb_record_rx_queue. + */ +static int efx_ptp_probe_channel(struct efx_channel *channel) +{ + struct efx_nic *efx = channel->efx; + struct efx_ptp_data *ptp; + int rc = 0; + unsigned int pos; + + channel->irq_moderation = 0; + channel->rx_queue.core_index = 0; + + ptp = kzalloc(sizeof(struct efx_ptp_data), GFP_KERNEL); + efx->ptp_data = ptp; + if (!efx->ptp_data) + return -ENOMEM; + + rc = efx_nic_alloc_buffer(efx, &ptp->start, sizeof(int)); + if (rc != 0) + goto fail1; + + ptp->channel = channel; + skb_queue_head_init(&ptp->rxq); + skb_queue_head_init(&ptp->txq); + ptp->workwq = create_singlethread_workqueue("sfc_ptp"); + if (!ptp->workwq) { + rc = -ENOMEM; + goto fail2; + } + + INIT_WORK(&ptp->work, efx_ptp_worker); + ptp->config.flags = 0; + ptp->config.tx_type = HWTSTAMP_TX_OFF; + ptp->config.rx_filter = HWTSTAMP_FILTER_NONE; + INIT_LIST_HEAD(&ptp->evt_list); + INIT_LIST_HEAD(&ptp->evt_free_list); + spin_lock_init(&ptp->evt_lock); + for (pos = 0; pos < MAX_RECEIVE_EVENTS; pos++) + list_add(&ptp->rx_evts[pos].link, &ptp->evt_free_list); + + ptp->phc_clock_info.owner = THIS_MODULE; + snprintf(ptp->phc_clock_info.name, + sizeof(ptp->phc_clock_info.name), + "%pm", efx->net_dev->perm_addr); + ptp->phc_clock_info.max_adj = MAX_PPB; + ptp->phc_clock_info.n_alarm = 0; + ptp->phc_clock_info.n_ext_ts = 0; + ptp->phc_clock_info.n_per_out = 0; + ptp->phc_clock_info.pps = 1; + ptp->phc_clock_info.adjfreq = efx_phc_adjfreq; + ptp->phc_clock_info.adjtime = efx_phc_adjtime; + ptp->phc_clock_info.gettime = efx_phc_gettime; + ptp->phc_clock_info.settime = efx_phc_settime; + ptp->phc_clock_info.enable = efx_phc_enable; + + ptp->phc_clock = ptp_clock_register(&ptp->phc_clock_info, + &efx->pci_dev->dev); + if (!ptp->phc_clock) + goto fail3; + + INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker); + ptp->pps_workwq = create_singlethread_workqueue("sfc_pps"); + if (!ptp->pps_workwq) { + rc = -ENOMEM; + goto fail4; + } + ptp->nic_ts_enabled = false; + + return 0; +fail4: + ptp_clock_unregister(efx->ptp_data->phc_clock); + +fail3: + destroy_workqueue(efx->ptp_data->workwq); + +fail2: + efx_nic_free_buffer(efx, &ptp->start); + +fail1: + kfree(efx->ptp_data); + efx->ptp_data = NULL; + + return rc; +} + +static void efx_ptp_remove_channel(struct efx_channel *channel) +{ + struct efx_nic *efx = channel->efx; + + if (!efx->ptp_data) + return; + + (void)efx_ptp_disable(channel->efx); + + cancel_work_sync(&efx->ptp_data->work); + cancel_work_sync(&efx->ptp_data->pps_work); + + skb_queue_purge(&efx->ptp_data->rxq); + skb_queue_purge(&efx->ptp_data->txq); + + ptp_clock_unregister(efx->ptp_data->phc_clock); + + destroy_workqueue(efx->ptp_data->workwq); + destroy_workqueue(efx->ptp_data->pps_workwq); + + efx_nic_free_buffer(efx, &efx->ptp_data->start); + kfree(efx->ptp_data); +} + +static void efx_ptp_get_channel_name(struct efx_channel *channel, + char *buf, size_t len) +{ + snprintf(buf, len, "%s-ptp", channel->efx->name); +} + +/* Determine whether this packet should be processed by the PTP module + * or transmitted conventionally. + */ +bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb) +{ + return efx->ptp_data && + efx->ptp_data->enabled && + skb->len >= PTP_MIN_LENGTH && + skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM && + likely(skb->protocol == htons(ETH_P_IP)) && + ip_hdr(skb)->protocol == IPPROTO_UDP && + udp_hdr(skb)->dest == htons(PTP_EVENT_PORT); +} + +/* Receive a PTP packet. Packets are queued until the arrival of + * the receive timestamp from the MC - this will probably occur after the + * packet arrival because of the processing in the MC. + */ +static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) +{ + struct efx_nic *efx = channel->efx; + struct efx_ptp_data *ptp = efx->ptp_data; + struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb; + u8 *data; + unsigned int version; + + match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS); + + /* Correct version? */ + if (ptp->mode == MC_CMD_PTP_MODE_V1) { + if (skb->len < PTP_V1_MIN_LENGTH) { + netif_receive_skb(skb); + return; + } + version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]); + if (version != PTP_VERSION_V1) { + netif_receive_skb(skb); + return; + } + } else { + if (skb->len < PTP_V2_MIN_LENGTH) { + netif_receive_skb(skb); + return; + } + version = skb->data[PTP_V2_VERSION_OFFSET]; + + BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2); + BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET); + BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH); + BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET); + BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH); + + if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) { + netif_receive_skb(skb); + return; + } + } + + /* Does this packet require timestamping? */ + if (ntohs(*(__be16 *)&skb->data[PTP_DPORT_OFFSET]) == PTP_EVENT_PORT) { + struct skb_shared_hwtstamps *timestamps; + + match->state = PTP_PACKET_STATE_UNMATCHED; + + /* Clear all timestamps held: filled in later */ + timestamps = skb_hwtstamps(skb); + memset(timestamps, 0, sizeof(*timestamps)); + + /* Extract UUID/Sequence information */ + data = skb->data + PTP_V1_UUID_OFFSET; + match->words[0] = (data[0] | + (data[1] << 8) | + (data[2] << 16) | + (data[3] << 24)); + match->words[1] = (data[4] | + (data[5] << 8) | + (skb->data[PTP_V1_SEQUENCE_OFFSET + + PTP_V1_SEQUENCE_LENGTH - 1] << + 16)); + } else { + match->state = PTP_PACKET_STATE_MATCH_UNWANTED; + } + + skb_queue_tail(&ptp->rxq, skb); + queue_work(ptp->workwq, &ptp->work); +} + +/* Transmit a PTP packet. This has to be transmitted by the MC + * itself, through an MCDI call. MCDI calls aren't permitted + * in the transmit path so defer the actual transmission to a suitable worker. + */ +int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + + skb_queue_tail(&ptp->txq, skb); + + if ((udp_hdr(skb)->dest == htons(PTP_EVENT_PORT)) && + (skb->len <= MC_CMD_PTP_IN_TRANSMIT_PACKET_MAXNUM)) + efx_xmit_hwtstamp_pending(skb); + queue_work(ptp->workwq, &ptp->work); + + return NETDEV_TX_OK; +} + +static int efx_ptp_change_mode(struct efx_nic *efx, bool enable_wanted, + unsigned int new_mode) +{ + if ((enable_wanted != efx->ptp_data->enabled) || + (enable_wanted && (efx->ptp_data->mode != new_mode))) { + int rc; + + if (enable_wanted) { + /* Change of mode requires disable */ + if (efx->ptp_data->enabled && + (efx->ptp_data->mode != new_mode)) { + efx->ptp_data->enabled = false; + rc = efx_ptp_stop(efx); + if (rc != 0) + return rc; + } + + /* Set new operating mode and establish + * baseline synchronisation, which must + * succeed. + */ + efx->ptp_data->mode = new_mode; + rc = efx_ptp_start(efx); + if (rc == 0) { + rc = efx_ptp_synchronize(efx, + PTP_SYNC_ATTEMPTS * 2); + if (rc != 0) + efx_ptp_stop(efx); + } + } else { + rc = efx_ptp_stop(efx); + } + + if (rc != 0) + return rc; + + efx->ptp_data->enabled = enable_wanted; + } + + return 0; +} + +static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init) +{ + bool enable_wanted = false; + unsigned int new_mode; + int rc; + + if (init->flags) + return -EINVAL; + + if ((init->tx_type != HWTSTAMP_TX_OFF) && + (init->tx_type != HWTSTAMP_TX_ON)) + return -ERANGE; + + new_mode = efx->ptp_data->mode; + /* Determine whether any PTP HW operations are required */ + switch (init->rx_filter) { + case HWTSTAMP_FILTER_NONE: + break; + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + init->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT; + new_mode = MC_CMD_PTP_MODE_V1; + enable_wanted = true; + break; + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + /* Although these three are accepted only IPV4 packets will be + * timestamped + */ + init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; + new_mode = MC_CMD_PTP_MODE_V2; + enable_wanted = true; + break; + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + /* Non-IP + IPv6 timestamping not supported */ + return -ERANGE; + break; + default: + return -ERANGE; + } + + if (init->tx_type != HWTSTAMP_TX_OFF) + enable_wanted = true; + + rc = efx_ptp_change_mode(efx, enable_wanted, new_mode); + if (rc != 0) + return rc; + + efx->ptp_data->config = *init; + + return 0; +} + +int +efx_ptp_get_ts_info(struct net_device *net_dev, struct ethtool_ts_info *ts_info) +{ + struct efx_nic *efx = netdev_priv(net_dev); + struct efx_ptp_data *ptp = efx->ptp_data; + + if (!ptp) + return -EOPNOTSUPP; + + ts_info->so_timestamping = (SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE); + ts_info->phc_index = ptp_clock_index(ptp->phc_clock); + ts_info->tx_types = 1 << HWTSTAMP_TX_OFF | 1 << HWTSTAMP_TX_ON; + ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE | + 1 << HWTSTAMP_FILTER_PTP_V1_L4_EVENT | + 1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC | + 1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ | + 1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT | + 1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC | + 1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); + return 0; +} + +int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd) +{ + struct hwtstamp_config config; + int rc; + + /* Not a PTP enabled port */ + if (!efx->ptp_data) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + rc = efx_ptp_ts_init(efx, &config); + if (rc != 0) + return rc; + + return copy_to_user(ifr->ifr_data, &config, sizeof(config)) + ? -EFAULT : 0; +} + +static void ptp_event_failure(struct efx_nic *efx, int expected_frag_len) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + + netif_err(efx, hw, efx->net_dev, + "PTP unexpected event length: got %d expected %d\n", + ptp->evt_frag_idx, expected_frag_len); + ptp->reset_required = true; + queue_work(ptp->workwq, &ptp->work); +} + +/* Process a completed receive event. Put it on the event queue and + * start worker thread. This is required because event and their + * correspoding packets may come in either order. + */ +static void ptp_event_rx(struct efx_nic *efx, struct efx_ptp_data *ptp) +{ + struct efx_ptp_event_rx *evt = NULL; + + if (ptp->evt_frag_idx != 3) { + ptp_event_failure(efx, 3); + return; + } + + spin_lock_bh(&ptp->evt_lock); + if (!list_empty(&ptp->evt_free_list)) { + evt = list_first_entry(&ptp->evt_free_list, + struct efx_ptp_event_rx, link); + list_del(&evt->link); + + evt->seq0 = EFX_QWORD_FIELD(ptp->evt_frags[2], MCDI_EVENT_DATA); + evt->seq1 = (EFX_QWORD_FIELD(ptp->evt_frags[2], + MCDI_EVENT_SRC) | + (EFX_QWORD_FIELD(ptp->evt_frags[1], + MCDI_EVENT_SRC) << 8) | + (EFX_QWORD_FIELD(ptp->evt_frags[0], + MCDI_EVENT_SRC) << 16)); + evt->hwtimestamp = ktime_set( + EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA), + EFX_QWORD_FIELD(ptp->evt_frags[1], MCDI_EVENT_DATA)); + evt->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS); + list_add_tail(&evt->link, &ptp->evt_list); + + queue_work(ptp->workwq, &ptp->work); + } else { + netif_err(efx, rx_err, efx->net_dev, "No free PTP event"); + } + spin_unlock_bh(&ptp->evt_lock); +} + +static void ptp_event_fault(struct efx_nic *efx, struct efx_ptp_data *ptp) +{ + int code = EFX_QWORD_FIELD(ptp->evt_frags[0], MCDI_EVENT_DATA); + if (ptp->evt_frag_idx != 1) { + ptp_event_failure(efx, 1); + return; + } + + netif_err(efx, hw, efx->net_dev, "PTP error %d\n", code); +} + +static void ptp_event_pps(struct efx_nic *efx, struct efx_ptp_data *ptp) +{ + if (ptp->nic_ts_enabled) + queue_work(ptp->pps_workwq, &ptp->pps_work); +} + +void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev) +{ + struct efx_ptp_data *ptp = efx->ptp_data; + int code = EFX_QWORD_FIELD(*ev, MCDI_EVENT_CODE); + + if (!ptp->enabled) + return; + + if (ptp->evt_frag_idx == 0) { + ptp->evt_code = code; + } else if (ptp->evt_code != code) { + netif_err(efx, hw, efx->net_dev, + "PTP out of sequence event %d\n", code); + ptp->evt_frag_idx = 0; + } + + ptp->evt_frags[ptp->evt_frag_idx++] = *ev; + if (!MCDI_EVENT_FIELD(*ev, CONT)) { + /* Process resulting event */ + switch (code) { + case MCDI_EVENT_CODE_PTP_RX: + ptp_event_rx(efx, ptp); + break; + case MCDI_EVENT_CODE_PTP_FAULT: + ptp_event_fault(efx, ptp); + break; + case MCDI_EVENT_CODE_PTP_PPS: + ptp_event_pps(efx, ptp); + break; + default: + netif_err(efx, hw, efx->net_dev, + "PTP unknown event %d\n", code); + break; + } + ptp->evt_frag_idx = 0; + } else if (MAX_EVENT_FRAGS == ptp->evt_frag_idx) { + netif_err(efx, hw, efx->net_dev, + "PTP too many event fragments\n"); + ptp->evt_frag_idx = 0; + } +} + +static int efx_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) +{ + struct efx_ptp_data *ptp_data = container_of(ptp, + struct efx_ptp_data, + phc_clock_info); + struct efx_nic *efx = ptp_data->channel->efx; + u8 inadj[MC_CMD_PTP_IN_ADJUST_LEN]; + s64 adjustment_ns; + int rc; + + if (delta > MAX_PPB) + delta = MAX_PPB; + else if (delta < -MAX_PPB) + delta = -MAX_PPB; + + /* Convert ppb to fixed point ns. */ + adjustment_ns = (((s64)delta * PPB_SCALE_WORD) >> + (PPB_EXTRA_BITS + MAX_PPB_BITS)); + + MCDI_SET_DWORD(inadj, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST); + MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_LO, (u32)adjustment_ns); + MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_FREQ_HI, + (u32)(adjustment_ns >> 32)); + MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_SECONDS, 0); + MCDI_SET_DWORD(inadj, PTP_IN_ADJUST_NANOSECONDS, 0); + rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inadj, sizeof(inadj), + NULL, 0, NULL); + if (rc != 0) + return rc; + + ptp_data->current_adjfreq = delta; + return 0; +} + +static int efx_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct efx_ptp_data *ptp_data = container_of(ptp, + struct efx_ptp_data, + phc_clock_info); + struct efx_nic *efx = ptp_data->channel->efx; + struct timespec delta_ts = ns_to_timespec(delta); + u8 inbuf[MC_CMD_PTP_IN_ADJUST_LEN]; + + MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_ADJUST); + MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_LO, 0); + MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_FREQ_HI, 0); + MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_SECONDS, (u32)delta_ts.tv_sec); + MCDI_SET_DWORD(inbuf, PTP_IN_ADJUST_NANOSECONDS, (u32)delta_ts.tv_nsec); + return efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf), + NULL, 0, NULL); +} + +static int efx_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts) +{ + struct efx_ptp_data *ptp_data = container_of(ptp, + struct efx_ptp_data, + phc_clock_info); + struct efx_nic *efx = ptp_data->channel->efx; + u8 inbuf[MC_CMD_PTP_IN_READ_NIC_TIME_LEN]; + u8 outbuf[MC_CMD_PTP_OUT_READ_NIC_TIME_LEN]; + int rc; + + MCDI_SET_DWORD(inbuf, PTP_IN_OP, MC_CMD_PTP_OP_READ_NIC_TIME); + + rc = efx_mcdi_rpc(efx, MC_CMD_PTP, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), NULL); + if (rc != 0) + return rc; + + ts->tv_sec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_SECONDS); + ts->tv_nsec = MCDI_DWORD(outbuf, PTP_OUT_READ_NIC_TIME_NANOSECONDS); + return 0; +} + +static int efx_phc_settime(struct ptp_clock_info *ptp, + const struct timespec *e_ts) +{ + /* Get the current NIC time, efx_phc_gettime. + * Subtract from the desired time to get the offset + * call efx_phc_adjtime with the offset + */ + int rc; + struct timespec time_now; + struct timespec delta; + + rc = efx_phc_gettime(ptp, &time_now); + if (rc != 0) + return rc; + + delta = timespec_sub(*e_ts, time_now); + + efx_phc_adjtime(ptp, timespec_to_ns(&delta)); + if (rc != 0) + return rc; + + return 0; +} + +static int efx_phc_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *request, + int enable) +{ + struct efx_ptp_data *ptp_data = container_of(ptp, + struct efx_ptp_data, + phc_clock_info); + if (request->type != PTP_CLK_REQ_PPS) + return -EOPNOTSUPP; + + ptp_data->nic_ts_enabled = !!enable; + return 0; +} + +static const struct efx_channel_type efx_ptp_channel_type = { + .handle_no_channel = efx_ptp_handle_no_channel, + .pre_probe = efx_ptp_probe_channel, + .post_remove = efx_ptp_remove_channel, + .get_name = efx_ptp_get_channel_name, + /* no copy operation; there is no need to reallocate this channel */ + .receive_skb = efx_ptp_rx, + .keep_eventq = false, +}; + +void efx_ptp_probe(struct efx_nic *efx) +{ + /* Check whether PTP is implemented on this NIC. The DISABLE + * operation will succeed if and only if it is implemented. + */ + if (efx_ptp_disable(efx) == 0) + efx->extra_channel_type[EFX_EXTRA_CHANNEL_PTP] = + &efx_ptp_channel_type; +} diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 719319b89d7a..9e0ad1b75c33 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -479,7 +479,7 @@ static void efx_rx_packet_gro(struct efx_channel *channel, skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE); - skb_record_rx_queue(skb, channel->channel); + skb_record_rx_queue(skb, channel->rx_queue.core_index); gro_result = napi_gro_frags(napi); } else { @@ -571,8 +571,14 @@ static void efx_rx_deliver(struct efx_channel *channel, /* Set the SKB flags */ skb_checksum_none_assert(skb); + /* Record the rx_queue */ + skb_record_rx_queue(skb, channel->rx_queue.core_index); + /* Pass the packet up */ - netif_receive_skb(skb); + if (channel->type->receive_skb) + channel->type->receive_skb(channel, skb); + else + netif_receive_skb(skb); /* Update allocation strategy method */ channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; @@ -608,13 +614,14 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf) * at the ethernet header */ skb->protocol = eth_type_trans(skb, efx->net_dev); - skb_record_rx_queue(skb, channel->channel); + skb_record_rx_queue(skb, channel->rx_queue.core_index); } if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM))) rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; - if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED))) + if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) && + !channel->type->receive_skb) efx_rx_packet_gro(channel, rx_buf, eh); else efx_rx_deliver(channel, rx_buf); @@ -624,6 +631,11 @@ void efx_rx_strategy(struct efx_channel *channel) { enum efx_rx_alloc_method method = rx_alloc_method; + if (channel->type->receive_skb) { + channel->rx_alloc_push_pages = false; + return; + } + /* Only makes sense to use page based allocation if GRO is enabled */ if (!(channel->efx->net_dev->features & NETIF_F_GRO)) { method = RX_ALLOC_METHOD_SKB; diff --git a/drivers/net/ethernet/sfc/selftest.c b/drivers/net/ethernet/sfc/selftest.c index 96068d15b601..ce72ae4f399f 100644 --- a/drivers/net/ethernet/sfc/selftest.c +++ b/drivers/net/ethernet/sfc/selftest.c @@ -614,7 +614,8 @@ static int efx_test_loopbacks(struct efx_nic *efx, struct efx_self_tests *tests, { enum efx_loopback_mode mode; struct efx_loopback_state *state; - struct efx_channel *channel = efx_get_channel(efx, 0); + struct efx_channel *channel = + efx_get_channel(efx, efx->tx_channel_offset); struct efx_tx_queue *tx_queue; int rc = 0; diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c index 6bafd216e55e..84b41bf08a38 100644 --- a/drivers/net/ethernet/sfc/siena.c +++ b/drivers/net/ethernet/sfc/siena.c @@ -335,6 +335,7 @@ static int siena_probe_nic(struct efx_nic *efx) goto fail5; efx_sriov_probe(efx); + efx_ptp_probe(efx); return 0; diff --git a/drivers/net/ethernet/sfc/siena_sriov.c b/drivers/net/ethernet/sfc/siena_sriov.c index 9cb3b84ecae9..d49b53dc2a50 100644 --- a/drivers/net/ethernet/sfc/siena_sriov.c +++ b/drivers/net/ethernet/sfc/siena_sriov.c @@ -21,6 +21,9 @@ /* Number of longs required to track all the VIs in a VF */ #define VI_MASK_LENGTH BITS_TO_LONGS(1 << EFX_VI_SCALE_MAX) +/* Maximum number of RX queues supported */ +#define VF_MAX_RX_QUEUES 63 + /** * enum efx_vf_tx_filter_mode - TX MAC filtering behaviour * @VF_TX_FILTER_OFF: Disabled @@ -578,6 +581,7 @@ static int efx_vfdi_init_rxq(struct efx_vf *vf) efx_oword_t reg; if (bad_vf_index(efx, vf_evq) || bad_vf_index(efx, vf_rxq) || + vf_rxq >= VF_MAX_RX_QUEUES || bad_buf_count(buf_count, EFX_MAX_DMAQ_SIZE)) { if (net_ratelimit()) netif_err(efx, hw, efx->net_dev, @@ -683,6 +687,9 @@ static int efx_vfdi_fini_all_queues(struct efx_vf *vf) __le32 *rxqs; int rc; + BUILD_BUG_ON(VF_MAX_RX_QUEUES > + MC_CMD_FLUSH_RX_QUEUES_IN_QID_OFST_MAXNUM); + rxqs = kmalloc(count * sizeof(*rxqs), GFP_KERNEL); if (rxqs == NULL) return VFDI_RC_ENOMEM; @@ -1028,6 +1035,7 @@ efx_sriov_get_channel_name(struct efx_channel *channel, char *buf, size_t len) static const struct efx_channel_type efx_sriov_channel_type = { .handle_no_channel = efx_sriov_handle_no_channel, .pre_probe = efx_sriov_probe_channel, + .post_remove = efx_channel_dummy_op_void, .get_name = efx_sriov_get_channel_name, /* no copy operation; channel must not be reallocated */ .keep_eventq = true, diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 18713436b443..5e090e54298e 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -22,14 +22,6 @@ #include "nic.h" #include "workarounds.h" -/* - * TX descriptor ring full threshold - * - * The tx_queue descriptor ring fill-level must fall below this value - * before we restart the netif queue - */ -#define EFX_TXQ_THRESHOLD(_efx) ((_efx)->txq_entries / 2u) - static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, struct efx_tx_buffer *buffer, unsigned int *pkts_compl, @@ -39,67 +31,32 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, struct device *dma_dev = &tx_queue->efx->pci_dev->dev; dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len - buffer->unmap_len); - if (buffer->unmap_single) + if (buffer->flags & EFX_TX_BUF_MAP_SINGLE) dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len, DMA_TO_DEVICE); else dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len, DMA_TO_DEVICE); buffer->unmap_len = 0; - buffer->unmap_single = false; } - if (buffer->skb) { + if (buffer->flags & EFX_TX_BUF_SKB) { (*pkts_compl)++; (*bytes_compl) += buffer->skb->len; dev_kfree_skb_any((struct sk_buff *) buffer->skb); - buffer->skb = NULL; netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, "TX queue %d transmission id %x complete\n", tx_queue->queue, tx_queue->read_count); + } else if (buffer->flags & EFX_TX_BUF_HEAP) { + kfree(buffer->heap_buf); } -} -/** - * struct efx_tso_header - a DMA mapped buffer for packet headers - * @next: Linked list of free ones. - * The list is protected by the TX queue lock. - * @dma_unmap_len: Length to unmap for an oversize buffer, or 0. - * @dma_addr: The DMA address of the header below. - * - * This controls the memory used for a TSO header. Use TSOH_DATA() - * to find the packet header data. Use TSOH_SIZE() to calculate the - * total size required for a given packet header length. TSO headers - * in the free list are exactly %TSOH_STD_SIZE bytes in size. - */ -struct efx_tso_header { - union { - struct efx_tso_header *next; - size_t unmap_len; - }; - dma_addr_t dma_addr; -}; + buffer->len = 0; + buffer->flags = 0; +} static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, struct sk_buff *skb); -static void efx_fini_tso(struct efx_tx_queue *tx_queue); -static void efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, - struct efx_tso_header *tsoh); - -static void efx_tsoh_free(struct efx_tx_queue *tx_queue, - struct efx_tx_buffer *buffer) -{ - if (buffer->tsoh) { - if (likely(!buffer->tsoh->unmap_len)) { - buffer->tsoh->next = tx_queue->tso_headers_free; - tx_queue->tso_headers_free = buffer->tsoh; - } else { - efx_tsoh_heap_free(tx_queue, buffer->tsoh); - } - buffer->tsoh = NULL; - } -} - static inline unsigned efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr) @@ -138,6 +95,56 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) return max_descs; } +/* Get partner of a TX queue, seen as part of the same net core queue */ +static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue) +{ + if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) + return tx_queue - EFX_TXQ_TYPE_OFFLOAD; + else + return tx_queue + EFX_TXQ_TYPE_OFFLOAD; +} + +static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) +{ + /* We need to consider both queues that the net core sees as one */ + struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1); + struct efx_nic *efx = txq1->efx; + unsigned int fill_level; + + fill_level = max(txq1->insert_count - txq1->old_read_count, + txq2->insert_count - txq2->old_read_count); + if (likely(fill_level < efx->txq_stop_thresh)) + return; + + /* We used the stale old_read_count above, which gives us a + * pessimistic estimate of the fill level (which may even + * validly be >= efx->txq_entries). Now try again using + * read_count (more likely to be a cache miss). + * + * If we read read_count and then conditionally stop the + * queue, it is possible for the completion path to race with + * us and complete all outstanding descriptors in the middle, + * after which there will be no more completions to wake it. + * Therefore we stop the queue first, then read read_count + * (with a memory barrier to ensure the ordering), then + * restart the queue if the fill level turns out to be low + * enough. + */ + netif_tx_stop_queue(txq1->core_txq); + smp_mb(); + txq1->old_read_count = ACCESS_ONCE(txq1->read_count); + txq2->old_read_count = ACCESS_ONCE(txq2->read_count); + + fill_level = max(txq1->insert_count - txq1->old_read_count, + txq2->insert_count - txq2->old_read_count); + EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries); + if (likely(fill_level < efx->txq_stop_thresh)) { + smp_mb(); + if (likely(!efx->loopback_selftest)) + netif_tx_start_queue(txq1->core_txq); + } +} + /* * Add a socket buffer to a TX queue * @@ -151,7 +158,7 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) * This function is split out from efx_hard_start_xmit to allow the * loopback test to direct packets via specific TX queues. * - * Returns NETDEV_TX_OK or NETDEV_TX_BUSY + * Returns NETDEV_TX_OK. * You must hold netif_tx_lock() to call this function. */ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) @@ -160,12 +167,11 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) struct device *dma_dev = &efx->pci_dev->dev; struct efx_tx_buffer *buffer; skb_frag_t *fragment; - unsigned int len, unmap_len = 0, fill_level, insert_ptr; + unsigned int len, unmap_len = 0, insert_ptr; dma_addr_t dma_addr, unmap_addr = 0; unsigned int dma_len; - bool unmap_single; - int q_space, i = 0; - netdev_tx_t rc = NETDEV_TX_OK; + unsigned short dma_flags; + int i = 0; EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count); @@ -183,14 +189,11 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) return NETDEV_TX_OK; } - fill_level = tx_queue->insert_count - tx_queue->old_read_count; - q_space = efx->txq_entries - 1 - fill_level; - /* Map for DMA. Use dma_map_single rather than dma_map_page * since this is more efficient on machines with sparse * memory. */ - unmap_single = true; + dma_flags = EFX_TX_BUF_MAP_SINGLE; dma_addr = dma_map_single(dma_dev, skb->data, len, PCI_DMA_TODEVICE); /* Process all fragments */ @@ -205,39 +208,10 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) /* Add to TX queue, splitting across DMA boundaries */ do { - if (unlikely(q_space-- <= 0)) { - /* It might be that completions have - * happened since the xmit path last - * checked. Update the xmit path's - * copy of read_count. - */ - netif_tx_stop_queue(tx_queue->core_txq); - /* This memory barrier protects the - * change of queue state from the access - * of read_count. */ - smp_mb(); - tx_queue->old_read_count = - ACCESS_ONCE(tx_queue->read_count); - fill_level = (tx_queue->insert_count - - tx_queue->old_read_count); - q_space = efx->txq_entries - 1 - fill_level; - if (unlikely(q_space-- <= 0)) { - rc = NETDEV_TX_BUSY; - goto unwind; - } - smp_mb(); - if (likely(!efx->loopback_selftest)) - netif_tx_start_queue( - tx_queue->core_txq); - } - insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; buffer = &tx_queue->buffer[insert_ptr]; - efx_tsoh_free(tx_queue, buffer); - EFX_BUG_ON_PARANOID(buffer->tsoh); - EFX_BUG_ON_PARANOID(buffer->skb); + EFX_BUG_ON_PARANOID(buffer->flags); EFX_BUG_ON_PARANOID(buffer->len); - EFX_BUG_ON_PARANOID(!buffer->continuation); EFX_BUG_ON_PARANOID(buffer->unmap_len); dma_len = efx_max_tx_len(efx, dma_addr); @@ -247,13 +221,14 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) /* Fill out per descriptor fields */ buffer->len = dma_len; buffer->dma_addr = dma_addr; + buffer->flags = EFX_TX_BUF_CONT; len -= dma_len; dma_addr += dma_len; ++tx_queue->insert_count; } while (len); /* Transfer ownership of the unmapping to the final buffer */ - buffer->unmap_single = unmap_single; + buffer->flags = EFX_TX_BUF_CONT | dma_flags; buffer->unmap_len = unmap_len; unmap_len = 0; @@ -264,20 +239,22 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) len = skb_frag_size(fragment); i++; /* Map for DMA */ - unmap_single = false; + dma_flags = 0; dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len, DMA_TO_DEVICE); } /* Transfer ownership of the skb to the final buffer */ buffer->skb = skb; - buffer->continuation = false; + buffer->flags = EFX_TX_BUF_SKB | dma_flags; netdev_tx_sent_queue(tx_queue->core_txq, skb->len); /* Pass off to hardware */ efx_nic_push_buffers(tx_queue); + efx_tx_maybe_stop_queue(tx_queue); + return NETDEV_TX_OK; dma_err: @@ -289,7 +266,6 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) /* Mark the packet as transmitted, and free the SKB ourselves */ dev_kfree_skb_any(skb); - unwind: /* Work backwards until we hit the original insert pointer value */ while (tx_queue->insert_count != tx_queue->write_count) { unsigned int pkts_compl = 0, bytes_compl = 0; @@ -297,12 +273,11 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; buffer = &tx_queue->buffer[insert_ptr]; efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); - buffer->len = 0; } /* Free the fragment we were mid-way through pushing */ if (unmap_len) { - if (unmap_single) + if (dma_flags & EFX_TX_BUF_MAP_SINGLE) dma_unmap_single(dma_dev, unmap_addr, unmap_len, DMA_TO_DEVICE); else @@ -310,7 +285,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) DMA_TO_DEVICE); } - return rc; + return NETDEV_TX_OK; } /* Remove packets from the TX queue @@ -340,8 +315,6 @@ static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue, } efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl); - buffer->continuation = true; - buffer->len = 0; ++tx_queue->read_count; read_ptr = tx_queue->read_count & tx_queue->ptr_mask; @@ -366,6 +339,12 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, EFX_WARN_ON_PARANOID(!netif_device_present(net_dev)); + /* PTP "event" packet */ + if (unlikely(efx_xmit_with_hwtstamp(skb)) && + unlikely(efx_ptp_is_ptp_tx(efx, skb))) { + return efx_ptp_tx(efx, skb); + } + index = skb_get_queue_mapping(skb); type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0; if (index >= efx->n_tx_channels) { @@ -450,6 +429,7 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) { unsigned fill_level; struct efx_nic *efx = tx_queue->efx; + struct efx_tx_queue *txq2; unsigned int pkts_compl = 0, bytes_compl = 0; EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask); @@ -457,15 +437,18 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl); netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl); - /* See if we need to restart the netif queue. This barrier - * separates the update of read_count from the test of the - * queue state. */ + /* See if we need to restart the netif queue. This memory + * barrier ensures that we write read_count (inside + * efx_dequeue_buffers()) before reading the queue status. + */ smp_mb(); if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) && likely(efx->port_enabled) && likely(netif_device_present(efx->net_dev))) { - fill_level = tx_queue->insert_count - tx_queue->read_count; - if (fill_level < EFX_TXQ_THRESHOLD(efx)) + txq2 = efx_tx_queue_partner(tx_queue); + fill_level = max(tx_queue->insert_count - tx_queue->read_count, + txq2->insert_count - txq2->read_count); + if (fill_level <= efx->txq_wake_thresh) netif_tx_wake_queue(tx_queue->core_txq); } @@ -480,11 +463,26 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) } } +/* Size of page-based TSO header buffers. Larger blocks must be + * allocated from the heap. + */ +#define TSOH_STD_SIZE 128 +#define TSOH_PER_PAGE (PAGE_SIZE / TSOH_STD_SIZE) + +/* At most half the descriptors in the queue at any time will refer to + * a TSO header buffer, since they must always be followed by a + * payload descriptor referring to an skb. + */ +static unsigned int efx_tsoh_page_count(struct efx_tx_queue *tx_queue) +{ + return DIV_ROUND_UP(tx_queue->ptr_mask + 1, 2 * TSOH_PER_PAGE); +} + int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) { struct efx_nic *efx = tx_queue->efx; unsigned int entries; - int i, rc; + int rc; /* Create the smallest power-of-two aligned ring */ entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE); @@ -500,17 +498,28 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) GFP_KERNEL); if (!tx_queue->buffer) return -ENOMEM; - for (i = 0; i <= tx_queue->ptr_mask; ++i) - tx_queue->buffer[i].continuation = true; + + if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) { + tx_queue->tsoh_page = + kcalloc(efx_tsoh_page_count(tx_queue), + sizeof(tx_queue->tsoh_page[0]), GFP_KERNEL); + if (!tx_queue->tsoh_page) { + rc = -ENOMEM; + goto fail1; + } + } /* Allocate hardware ring */ rc = efx_nic_probe_tx(tx_queue); if (rc) - goto fail; + goto fail2; return 0; - fail: +fail2: + kfree(tx_queue->tsoh_page); + tx_queue->tsoh_page = NULL; +fail1: kfree(tx_queue->buffer); tx_queue->buffer = NULL; return rc; @@ -546,8 +555,6 @@ void efx_release_tx_buffers(struct efx_tx_queue *tx_queue) unsigned int pkts_compl = 0, bytes_compl = 0; buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask]; efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); - buffer->continuation = true; - buffer->len = 0; ++tx_queue->read_count; } @@ -568,13 +575,12 @@ void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) efx_nic_fini_tx(tx_queue); efx_release_tx_buffers(tx_queue); - - /* Free up TSO header cache */ - efx_fini_tso(tx_queue); } void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) { + int i; + if (!tx_queue->buffer) return; @@ -582,6 +588,14 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) "destroying TX queue %d\n", tx_queue->queue); efx_nic_remove_tx(tx_queue); + if (tx_queue->tsoh_page) { + for (i = 0; i < efx_tsoh_page_count(tx_queue); i++) + efx_nic_free_buffer(tx_queue->efx, + &tx_queue->tsoh_page[i]); + kfree(tx_queue->tsoh_page); + tx_queue->tsoh_page = NULL; + } + kfree(tx_queue->buffer); tx_queue->buffer = NULL; } @@ -604,22 +618,7 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) #define TSOH_OFFSET NET_IP_ALIGN #endif -#define TSOH_BUFFER(tsoh) ((u8 *)(tsoh + 1) + TSOH_OFFSET) - -/* Total size of struct efx_tso_header, buffer and padding */ -#define TSOH_SIZE(hdr_len) \ - (sizeof(struct efx_tso_header) + TSOH_OFFSET + hdr_len) - -/* Size of blocks on free list. Larger blocks must be allocated from - * the heap. - */ -#define TSOH_STD_SIZE 128 - #define PTR_DIFF(p1, p2) ((u8 *)(p1) - (u8 *)(p2)) -#define ETH_HDR_LEN(skb) (skb_network_header(skb) - (skb)->data) -#define SKB_TCP_OFF(skb) PTR_DIFF(tcp_hdr(skb), (skb)->data) -#define SKB_IPV4_OFF(skb) PTR_DIFF(ip_hdr(skb), (skb)->data) -#define SKB_IPV6_OFF(skb) PTR_DIFF(ipv6_hdr(skb), (skb)->data) /** * struct tso_state - TSO state for an SKB @@ -631,10 +630,12 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) * @in_len: Remaining length in current SKB fragment * @unmap_len: Length of SKB fragment * @unmap_addr: DMA address of SKB fragment - * @unmap_single: DMA single vs page mapping flag + * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0 * @protocol: Network protocol (after any VLAN header) + * @ip_off: Offset of IP header + * @tcp_off: Offset of TCP header * @header_len: Number of bytes of header - * @full_packet_size: Number of bytes to put in each outgoing segment + * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload * * The state used during segmentation. It is put into this data structure * just to make it easy to pass into inline functions. @@ -651,11 +652,13 @@ struct tso_state { unsigned in_len; unsigned unmap_len; dma_addr_t unmap_addr; - bool unmap_single; + unsigned short dma_flags; __be16 protocol; + unsigned int ip_off; + unsigned int tcp_off; unsigned header_len; - int full_packet_size; + unsigned int ip_base_len; }; @@ -687,91 +690,43 @@ static __be16 efx_tso_check_protocol(struct sk_buff *skb) return protocol; } - -/* - * Allocate a page worth of efx_tso_header structures, and string them - * into the tx_queue->tso_headers_free linked list. Return 0 or -ENOMEM. - */ -static int efx_tsoh_block_alloc(struct efx_tx_queue *tx_queue) +static u8 *efx_tsoh_get_buffer(struct efx_tx_queue *tx_queue, + struct efx_tx_buffer *buffer, unsigned int len) { - struct device *dma_dev = &tx_queue->efx->pci_dev->dev; - struct efx_tso_header *tsoh; - dma_addr_t dma_addr; - u8 *base_kva, *kva; + u8 *result; - base_kva = dma_alloc_coherent(dma_dev, PAGE_SIZE, &dma_addr, GFP_ATOMIC); - if (base_kva == NULL) { - netif_err(tx_queue->efx, tx_err, tx_queue->efx->net_dev, - "Unable to allocate page for TSO headers\n"); - return -ENOMEM; - } - - /* dma_alloc_coherent() allocates pages. */ - EFX_BUG_ON_PARANOID(dma_addr & (PAGE_SIZE - 1u)); - - for (kva = base_kva; kva < base_kva + PAGE_SIZE; kva += TSOH_STD_SIZE) { - tsoh = (struct efx_tso_header *)kva; - tsoh->dma_addr = dma_addr + (TSOH_BUFFER(tsoh) - base_kva); - tsoh->next = tx_queue->tso_headers_free; - tx_queue->tso_headers_free = tsoh; - } - - return 0; -} - - -/* Free up a TSO header, and all others in the same page. */ -static void efx_tsoh_block_free(struct efx_tx_queue *tx_queue, - struct efx_tso_header *tsoh, - struct device *dma_dev) -{ - struct efx_tso_header **p; - unsigned long base_kva; - dma_addr_t base_dma; - - base_kva = (unsigned long)tsoh & PAGE_MASK; - base_dma = tsoh->dma_addr & PAGE_MASK; - - p = &tx_queue->tso_headers_free; - while (*p != NULL) { - if (((unsigned long)*p & PAGE_MASK) == base_kva) - *p = (*p)->next; - else - p = &(*p)->next; - } + EFX_BUG_ON_PARANOID(buffer->len); + EFX_BUG_ON_PARANOID(buffer->flags); + EFX_BUG_ON_PARANOID(buffer->unmap_len); - dma_free_coherent(dma_dev, PAGE_SIZE, (void *)base_kva, base_dma); -} + if (likely(len <= TSOH_STD_SIZE - TSOH_OFFSET)) { + unsigned index = + (tx_queue->insert_count & tx_queue->ptr_mask) / 2; + struct efx_buffer *page_buf = + &tx_queue->tsoh_page[index / TSOH_PER_PAGE]; + unsigned offset = + TSOH_STD_SIZE * (index % TSOH_PER_PAGE) + TSOH_OFFSET; + + if (unlikely(!page_buf->addr) && + efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE)) + return NULL; + + result = (u8 *)page_buf->addr + offset; + buffer->dma_addr = page_buf->dma_addr + offset; + buffer->flags = EFX_TX_BUF_CONT; + } else { + tx_queue->tso_long_headers++; -static struct efx_tso_header * -efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len) -{ - struct efx_tso_header *tsoh; - - tsoh = kmalloc(TSOH_SIZE(header_len), GFP_ATOMIC | GFP_DMA); - if (unlikely(!tsoh)) - return NULL; - - tsoh->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev, - TSOH_BUFFER(tsoh), header_len, - DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev, - tsoh->dma_addr))) { - kfree(tsoh); - return NULL; + buffer->heap_buf = kmalloc(TSOH_OFFSET + len, GFP_ATOMIC); + if (unlikely(!buffer->heap_buf)) + return NULL; + result = (u8 *)buffer->heap_buf + TSOH_OFFSET; + buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_HEAP; } - tsoh->unmap_len = header_len; - return tsoh; -} + buffer->len = len; -static void -efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh) -{ - dma_unmap_single(&tx_queue->efx->pci_dev->dev, - tsoh->dma_addr, tsoh->unmap_len, - DMA_TO_DEVICE); - kfree(tsoh); + return result; } /** @@ -781,47 +736,19 @@ efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh) * @len: Length of fragment * @final_buffer: The final buffer inserted into the queue * - * Push descriptors onto the TX queue. Return 0 on success or 1 if - * @tx_queue full. + * Push descriptors onto the TX queue. */ -static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue, - dma_addr_t dma_addr, unsigned len, - struct efx_tx_buffer **final_buffer) +static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue, + dma_addr_t dma_addr, unsigned len, + struct efx_tx_buffer **final_buffer) { struct efx_tx_buffer *buffer; struct efx_nic *efx = tx_queue->efx; - unsigned dma_len, fill_level, insert_ptr; - int q_space; + unsigned dma_len, insert_ptr; EFX_BUG_ON_PARANOID(len <= 0); - fill_level = tx_queue->insert_count - tx_queue->old_read_count; - /* -1 as there is no way to represent all descriptors used */ - q_space = efx->txq_entries - 1 - fill_level; - while (1) { - if (unlikely(q_space-- <= 0)) { - /* It might be that completions have happened - * since the xmit path last checked. Update - * the xmit path's copy of read_count. - */ - netif_tx_stop_queue(tx_queue->core_txq); - /* This memory barrier protects the change of - * queue state from the access of read_count. */ - smp_mb(); - tx_queue->old_read_count = - ACCESS_ONCE(tx_queue->read_count); - fill_level = (tx_queue->insert_count - - tx_queue->old_read_count); - q_space = efx->txq_entries - 1 - fill_level; - if (unlikely(q_space-- <= 0)) { - *final_buffer = NULL; - return 1; - } - smp_mb(); - netif_tx_start_queue(tx_queue->core_txq); - } - insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; buffer = &tx_queue->buffer[insert_ptr]; ++tx_queue->insert_count; @@ -830,12 +757,9 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue, tx_queue->read_count >= efx->txq_entries); - efx_tsoh_free(tx_queue, buffer); EFX_BUG_ON_PARANOID(buffer->len); EFX_BUG_ON_PARANOID(buffer->unmap_len); - EFX_BUG_ON_PARANOID(buffer->skb); - EFX_BUG_ON_PARANOID(!buffer->continuation); - EFX_BUG_ON_PARANOID(buffer->tsoh); + EFX_BUG_ON_PARANOID(buffer->flags); buffer->dma_addr = dma_addr; @@ -845,7 +769,8 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue, if (dma_len >= len) break; - buffer->len = dma_len; /* Don't set the other members */ + buffer->len = dma_len; + buffer->flags = EFX_TX_BUF_CONT; dma_addr += dma_len; len -= dma_len; } @@ -853,7 +778,6 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue, EFX_BUG_ON_PARANOID(!len); buffer->len = len; *final_buffer = buffer; - return 0; } @@ -864,54 +788,42 @@ static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue, * a single fragment, and we know it doesn't cross a page boundary. It * also allows us to not worry about end-of-packet etc. */ -static void efx_tso_put_header(struct efx_tx_queue *tx_queue, - struct efx_tso_header *tsoh, unsigned len) +static int efx_tso_put_header(struct efx_tx_queue *tx_queue, + struct efx_tx_buffer *buffer, u8 *header) { - struct efx_tx_buffer *buffer; - - buffer = &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask]; - efx_tsoh_free(tx_queue, buffer); - EFX_BUG_ON_PARANOID(buffer->len); - EFX_BUG_ON_PARANOID(buffer->unmap_len); - EFX_BUG_ON_PARANOID(buffer->skb); - EFX_BUG_ON_PARANOID(!buffer->continuation); - EFX_BUG_ON_PARANOID(buffer->tsoh); - buffer->len = len; - buffer->dma_addr = tsoh->dma_addr; - buffer->tsoh = tsoh; + if (unlikely(buffer->flags & EFX_TX_BUF_HEAP)) { + buffer->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev, + header, buffer->len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev, + buffer->dma_addr))) { + kfree(buffer->heap_buf); + buffer->len = 0; + buffer->flags = 0; + return -ENOMEM; + } + buffer->unmap_len = buffer->len; + buffer->flags |= EFX_TX_BUF_MAP_SINGLE; + } ++tx_queue->insert_count; + return 0; } -/* Remove descriptors put into a tx_queue. */ +/* Remove buffers put into a tx_queue. None of the buffers must have + * an skb attached. + */ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) { struct efx_tx_buffer *buffer; - dma_addr_t unmap_addr; /* Work backwards until we hit the original insert pointer value */ while (tx_queue->insert_count != tx_queue->write_count) { --tx_queue->insert_count; buffer = &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask]; - efx_tsoh_free(tx_queue, buffer); - EFX_BUG_ON_PARANOID(buffer->skb); - if (buffer->unmap_len) { - unmap_addr = (buffer->dma_addr + buffer->len - - buffer->unmap_len); - if (buffer->unmap_single) - dma_unmap_single(&tx_queue->efx->pci_dev->dev, - unmap_addr, buffer->unmap_len, - DMA_TO_DEVICE); - else - dma_unmap_page(&tx_queue->efx->pci_dev->dev, - unmap_addr, buffer->unmap_len, - DMA_TO_DEVICE); - buffer->unmap_len = 0; - } - buffer->len = 0; - buffer->continuation = true; + efx_dequeue_buffer(tx_queue, buffer, NULL, NULL); } } @@ -919,17 +831,16 @@ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) /* Parse the SKB header and initialise state. */ static void tso_start(struct tso_state *st, const struct sk_buff *skb) { - /* All ethernet/IP/TCP headers combined size is TCP header size - * plus offset of TCP header relative to start of packet. - */ - st->header_len = ((tcp_hdr(skb)->doff << 2u) - + PTR_DIFF(tcp_hdr(skb), skb->data)); - st->full_packet_size = st->header_len + skb_shinfo(skb)->gso_size; - - if (st->protocol == htons(ETH_P_IP)) + st->ip_off = skb_network_header(skb) - skb->data; + st->tcp_off = skb_transport_header(skb) - skb->data; + st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u); + if (st->protocol == htons(ETH_P_IP)) { + st->ip_base_len = st->header_len - st->ip_off; st->ipv4_id = ntohs(ip_hdr(skb)->id); - else + } else { + st->ip_base_len = st->header_len - st->tcp_off; st->ipv4_id = 0; + } st->seqnum = ntohl(tcp_hdr(skb)->seq); EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg); @@ -938,7 +849,7 @@ static void tso_start(struct tso_state *st, const struct sk_buff *skb) st->out_len = skb->len - st->header_len; st->unmap_len = 0; - st->unmap_single = false; + st->dma_flags = 0; } static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx, @@ -947,7 +858,7 @@ static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx, st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE); if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { - st->unmap_single = false; + st->dma_flags = 0; st->unmap_len = skb_frag_size(frag); st->in_len = skb_frag_size(frag); st->dma_addr = st->unmap_addr; @@ -965,7 +876,7 @@ static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx, st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl, len, DMA_TO_DEVICE); if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { - st->unmap_single = true; + st->dma_flags = EFX_TX_BUF_MAP_SINGLE; st->unmap_len = len; st->in_len = len; st->dma_addr = st->unmap_addr; @@ -982,20 +893,19 @@ static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx, * @st: TSO state * * Form descriptors for the current fragment, until we reach the end - * of fragment or end-of-packet. Return 0 on success, 1 if not enough - * space in @tx_queue. + * of fragment or end-of-packet. */ -static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, - const struct sk_buff *skb, - struct tso_state *st) +static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, + const struct sk_buff *skb, + struct tso_state *st) { struct efx_tx_buffer *buffer; - int n, end_of_packet, rc; + int n; if (st->in_len == 0) - return 0; + return; if (st->packet_space == 0) - return 0; + return; EFX_BUG_ON_PARANOID(st->in_len <= 0); EFX_BUG_ON_PARANOID(st->packet_space <= 0); @@ -1006,25 +916,24 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, st->out_len -= n; st->in_len -= n; - rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer); - if (likely(rc == 0)) { - if (st->out_len == 0) - /* Transfer ownership of the skb */ - buffer->skb = skb; + efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer); - end_of_packet = st->out_len == 0 || st->packet_space == 0; - buffer->continuation = !end_of_packet; + if (st->out_len == 0) { + /* Transfer ownership of the skb */ + buffer->skb = skb; + buffer->flags = EFX_TX_BUF_SKB; + } else if (st->packet_space != 0) { + buffer->flags = EFX_TX_BUF_CONT; + } - if (st->in_len == 0) { - /* Transfer ownership of the DMA mapping */ - buffer->unmap_len = st->unmap_len; - buffer->unmap_single = st->unmap_single; - st->unmap_len = 0; - } + if (st->in_len == 0) { + /* Transfer ownership of the DMA mapping */ + buffer->unmap_len = st->unmap_len; + buffer->flags |= st->dma_flags; + st->unmap_len = 0; } st->dma_addr += n; - return rc; } @@ -1035,36 +944,25 @@ static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, * @st: TSO state * * Generate a new header and prepare for the new packet. Return 0 on - * success, or -1 if failed to alloc header. + * success, or -%ENOMEM if failed to alloc header. */ static int tso_start_new_packet(struct efx_tx_queue *tx_queue, const struct sk_buff *skb, struct tso_state *st) { - struct efx_tso_header *tsoh; + struct efx_tx_buffer *buffer = + &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask]; struct tcphdr *tsoh_th; unsigned ip_length; u8 *header; + int rc; - /* Allocate a DMA-mapped header buffer. */ - if (likely(TSOH_SIZE(st->header_len) <= TSOH_STD_SIZE)) { - if (tx_queue->tso_headers_free == NULL) { - if (efx_tsoh_block_alloc(tx_queue)) - return -1; - } - EFX_BUG_ON_PARANOID(!tx_queue->tso_headers_free); - tsoh = tx_queue->tso_headers_free; - tx_queue->tso_headers_free = tsoh->next; - tsoh->unmap_len = 0; - } else { - tx_queue->tso_long_headers++; - tsoh = efx_tsoh_heap_alloc(tx_queue, st->header_len); - if (unlikely(!tsoh)) - return -1; - } + /* Allocate and insert a DMA-mapped header buffer. */ + header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len); + if (!header) + return -ENOMEM; - header = TSOH_BUFFER(tsoh); - tsoh_th = (struct tcphdr *)(header + SKB_TCP_OFF(skb)); + tsoh_th = (struct tcphdr *)(header + st->tcp_off); /* Copy and update the headers. */ memcpy(header, skb->data, st->header_len); @@ -1073,19 +971,19 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue, st->seqnum += skb_shinfo(skb)->gso_size; if (st->out_len > skb_shinfo(skb)->gso_size) { /* This packet will not finish the TSO burst. */ - ip_length = st->full_packet_size - ETH_HDR_LEN(skb); + st->packet_space = skb_shinfo(skb)->gso_size; tsoh_th->fin = 0; tsoh_th->psh = 0; } else { /* This packet will be the last in the TSO burst. */ - ip_length = st->header_len - ETH_HDR_LEN(skb) + st->out_len; + st->packet_space = st->out_len; tsoh_th->fin = tcp_hdr(skb)->fin; tsoh_th->psh = tcp_hdr(skb)->psh; } + ip_length = st->ip_base_len + st->packet_space; if (st->protocol == htons(ETH_P_IP)) { - struct iphdr *tsoh_iph = - (struct iphdr *)(header + SKB_IPV4_OFF(skb)); + struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off); tsoh_iph->tot_len = htons(ip_length); @@ -1094,16 +992,16 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue, st->ipv4_id++; } else { struct ipv6hdr *tsoh_iph = - (struct ipv6hdr *)(header + SKB_IPV6_OFF(skb)); + (struct ipv6hdr *)(header + st->ip_off); - tsoh_iph->payload_len = htons(ip_length - sizeof(*tsoh_iph)); + tsoh_iph->payload_len = htons(ip_length); } - st->packet_space = skb_shinfo(skb)->gso_size; - ++tx_queue->tso_packets; + rc = efx_tso_put_header(tx_queue, buffer, header); + if (unlikely(rc)) + return rc; - /* Form a descriptor for this header. */ - efx_tso_put_header(tx_queue, tsoh, st->header_len); + ++tx_queue->tso_packets; return 0; } @@ -1118,13 +1016,13 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue, * * Add socket buffer @skb to @tx_queue, doing TSO or return != 0 if * @skb was not enqueued. In all cases @skb is consumed. Return - * %NETDEV_TX_OK or %NETDEV_TX_BUSY. + * %NETDEV_TX_OK. */ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, struct sk_buff *skb) { struct efx_nic *efx = tx_queue->efx; - int frag_i, rc, rc2 = NETDEV_TX_OK; + int frag_i, rc; struct tso_state state; /* Find the packet protocol and sanity-check it */ @@ -1156,11 +1054,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, goto mem_err; while (1) { - rc = tso_fill_packet_with_fragment(tx_queue, skb, &state); - if (unlikely(rc)) { - rc2 = NETDEV_TX_BUSY; - goto unwind; - } + tso_fill_packet_with_fragment(tx_queue, skb, &state); /* Move onto the next fragment? */ if (state.in_len == 0) { @@ -1184,6 +1078,8 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, /* Pass off to hardware */ efx_nic_push_buffers(tx_queue); + efx_tx_maybe_stop_queue(tx_queue); + tx_queue->tso_bursts++; return NETDEV_TX_OK; @@ -1192,10 +1088,9 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, "Out of memory for TSO headers, or DMA mapping error\n"); dev_kfree_skb_any(skb); - unwind: /* Free the DMA mapping we were in the process of writing out */ if (state.unmap_len) { - if (state.unmap_single) + if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE) dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr, state.unmap_len, DMA_TO_DEVICE); else @@ -1204,25 +1099,5 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, } efx_enqueue_unwind(tx_queue); - return rc2; -} - - -/* - * Free up all TSO datastructures associated with tx_queue. This - * routine should be called only once the tx_queue is both empty and - * will no longer be used. - */ -static void efx_fini_tso(struct efx_tx_queue *tx_queue) -{ - unsigned i; - - if (tx_queue->buffer) { - for (i = 0; i <= tx_queue->ptr_mask; ++i) - efx_tsoh_free(tx_queue, &tx_queue->buffer[i]); - } - - while (tx_queue->tso_headers_free != NULL) - efx_tsoh_block_free(tx_queue, tx_queue->tso_headers_free, - &tx_queue->efx->pci_dev->dev); + return NETDEV_TX_OK; } |