From e8d4d34df715133c319fabcf63fdec684be75ff8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 23 Sep 2024 23:22:41 +0200 Subject: net: Add netif_get_gro_max_size helper for GRO Add a small netif_get_gro_max_size() helper which returns the maximum IPv4 or IPv6 GRO size of the netdevice. We later add a netif_get_gso_max_size() equivalent as well for GSO, so that these helpers can be used consistently instead of open-coded checks. Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240923212242.15669-1-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87b5e488325..d571451638de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5029,6 +5029,15 @@ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); +static inline unsigned int +netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb) +{ + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ + return skb->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(dev->gro_max_size) : + READ_ONCE(dev->gro_ipv4_max_size); +} + static inline bool netif_is_macsec(const struct net_device *dev) { return dev->priv_flags & IFF_MACSEC; -- cgit v1.2.3 From e609c959a939660c7519895f853dfa5624c6827a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 23 Sep 2024 23:22:42 +0200 Subject: net: Fix gso_features_check to check for both dev->gso_{ipv4_,}max_size Commit 24ab059d2ebd ("net: check dev->gso_max_size in gso_features_check()") added a dev->gso_max_size test to gso_features_check() in order to fall back to GSO when needed. This was added as it was noticed that some drivers could misbehave if TSO packets get too big. However, the check doesn't respect dev->gso_ipv4_max_size limit. For instance, a device could be configured with BIG TCP for IPv4, but not IPv6. Therefore, add a netif_get_gso_max_size() equivalent to netif_get_gro_max_size() and use the helper to respect both limits before falling back to GSO engine. Fixes: 24ab059d2ebd ("net: check dev->gso_max_size in gso_features_check()") Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240923212242.15669-2-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 9 +++++++++ net/core/dev.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d571451638de..4d20c776a4ff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5038,6 +5038,15 @@ netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb) READ_ONCE(dev->gro_ipv4_max_size); } +static inline unsigned int +netif_get_gso_max_size(const struct net_device *dev, const struct sk_buff *skb) +{ + /* pairs with WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + return skb->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); +} + static inline bool netif_is_macsec(const struct net_device *dev) { return dev->priv_flags & IFF_MACSEC; diff --git a/net/core/dev.c b/net/core/dev.c index cd479f5f22f6..74cf78a6b512 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3512,7 +3512,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; - if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) + if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { -- cgit v1.2.3 From 95ecba62e2fd201bcdcca636f5d774f1cd4f1458 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Oct 2024 19:41:18 +0000 Subject: net: fix races in netdev_tx_sent_queue()/dev_watchdog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some workloads hit the infamous dev_watchdog() message: "NETDEV WATCHDOG: eth0 (xxxx): transmit queue XX timed out" It seems possible to hit this even for perfectly normal BQL enabled drivers: 1) Assume a TX queue was idle for more than dev->watchdog_timeo (5 seconds unless changed by the driver) 2) Assume a big packet is sent, exceeding current BQL limit. 3) Driver ndo_start_xmit() puts the packet in TX ring, and netdev_tx_sent_queue() is called. 4) QUEUE_STATE_STACK_XOFF could be set from netdev_tx_sent_queue() before txq->trans_start has been written. 5) txq->trans_start is written later, from netdev_start_xmit() if (rc == NETDEV_TX_OK) txq_trans_update(txq) dev_watchdog() running on another cpu could read the old txq->trans_start, and then see QUEUE_STATE_STACK_XOFF, because 5) did not happen yet. To solve the issue, write txq->trans_start right before one XOFF bit is set : - _QUEUE_STATE_DRV_XOFF from netif_tx_stop_queue() - __QUEUE_STATE_STACK_XOFF from netdev_tx_sent_queue() From dev_watchdog(), we have to read txq->state before txq->trans_start. Add memory barriers to enforce correct ordering. In the future, we could avoid writing over txq->trans_start for normal operations, and rename this field to txq->xoff_start_time. Fixes: bec251bc8b6a ("net: no longer stop all TX queues in dev_watchdog()") Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241015194118.3951657-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 12 ++++++++++++ net/sched/sch_generic.c | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..8896705ccd63 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3325,6 +3325,12 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } @@ -3451,6 +3457,12 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, if (likely(dql_avail(&dev_queue->dql) >= 0)) return; + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2af24547a82c..38ec18f73de4 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -512,9 +512,15 @@ static void dev_watchdog(struct timer_list *t) struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - trans_start = READ_ONCE(txq->trans_start); if (!netif_xmit_stopped(txq)) continue; + + /* Paired with WRITE_ONCE() + smp_mb...() in + * netdev_tx_sent_queue() and netif_tx_stop_queue(). + */ + smp_mb(); + trans_start = READ_ONCE(txq->trans_start); + if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); -- cgit v1.2.3