diff options
| author | Paolo Abeni <pabeni@redhat.com> | 2026-06-09 13:42:22 +0300 |
|---|---|---|
| committer | Paolo Abeni <pabeni@redhat.com> | 2026-06-09 13:42:22 +0300 |
| commit | 193b76eb109ed3ed77400c6d493b2f495ca64d9e (patch) | |
| tree | 3556494521f91c61a8a59959b10f4cc80da7e297 | |
| parent | 69a4e74e4cc46bdc65586f600b8552cc10a2811a (diff) | |
| parent | 0360976d7ed5c69484d873afa34a22db4d04996f (diff) | |
| download | linux-193b76eb109ed3ed77400c6d493b2f495ca64d9e.tar.xz | |
Merge branch 'netconsole-fix-reported-problems'
Breno Leitao says:
====================
netconsole: Fix reported problems
These are some of the issues that LLM reported to netconsole, and they
are being addressed here before big refactors.
I was doing some big refactors, and got some "pre-existent-issues"
during LLM review of the refactor, that make them hard to guarantee that
refactor is not introducing any bug, so, let's clean these pre-existent
bugs first, and then submit the refactor.
The issues fixed in this patchset were reported during the review of
https://lore.kernel.org/all/20260524-netconsole_move_more-v1-0-909d1ab398b4@debian.org/
Not all of them got fixed, but, those that were easy to reason about.
Why net-next and not 'net' tree.
Most of the functions that are being fixed here moved from netpoll to
netconsole, thus, fixing this on net will cause merge conflicts from
'net' to 'net-next', thus I decided to fix it on 'net-next', given we
are on 7.1-rc6 already. Sorry if that is not the right approach.
====================
Link: https://patch.msgid.link/20260604-netcons_fix_before_move-v3-0-ab055b3a6aa5@debian.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
| -rw-r--r-- | drivers/net/netconsole.c | 76 | ||||
| -rw-r--r-- | include/linux/netpoll.h | 16 | ||||
| -rw-r--r-- | net/core/netpoll.c | 7 |
3 files changed, 87 insertions, 12 deletions
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 8ecc2c71c699..606e265cdfd7 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -335,6 +335,24 @@ static void process_resume_target(struct work_struct *work) resume_target(nt); + /* netpoll_setup() took a net_device reference and dropped the RTNL + * before returning, all while this target was off target_list and + * thus invisible to netconsole_netdev_event(). If the device was + * unregistered in that window the NETDEV_UNREGISTER notifier could not + * tear this target down, which would leak the reference and hang + * unregister_netdevice(). Re-check under the RTNL before re-publishing: + * taking it across the check and the list_add() serialises against the + * notifier (which also runs under the RTNL), so the device is either + * still registered (the notifier will find the re-added target) or + * already unregistering (we drop the reference here). + */ + rtnl_lock(); + if (nt->state == STATE_ENABLED && nt->np.dev && + nt->np.dev->reg_state != NETREG_REGISTERED) { + do_netpoll_cleanup(&nt->np); + nt->state = STATE_DISABLED; + } + /* At this point the target is either enabled or disabled and * was cleaned up before getting deactivated. Either way, add it * back to target list. @@ -342,6 +360,7 @@ static void process_resume_target(struct work_struct *work) spin_lock_irqsave(&target_list_lock, flags); list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); + rtnl_unlock(); out_unlock: dynamic_netconsole_mutex_unlock(); @@ -1449,10 +1468,21 @@ static void drop_netconsole_target(struct config_group *group, { struct netconsole_target *nt = to_target(item); unsigned long flags; + bool needs_cleanup; dynamic_netconsole_mutex_lock(); + mutex_lock(&target_cleanup_list_lock); spin_lock_irqsave(&target_list_lock, flags); + /* A STATE_DEACTIVATED target may have been moved to + * target_cleanup_list by netconsole_netdev_event() but not yet + * processed by netconsole_process_cleanups_core(). Unlinking it below + * hides it from the cleanup worker, so this path has to clean it up + * itself. Record that the target still owns a netpoll before the + * state is downgraded. + */ + needs_cleanup = nt->state == STATE_ENABLED || + nt->state == STATE_DEACTIVATED; /* Disable deactivated target to prevent races between resume attempt * and target removal. */ @@ -1460,6 +1490,7 @@ static void drop_netconsole_target(struct config_group *group, nt->state = STATE_DISABLED; list_del(&nt->list); spin_unlock_irqrestore(&target_list_lock, flags); + mutex_unlock(&target_cleanup_list_lock); dynamic_netconsole_mutex_unlock(); @@ -1473,8 +1504,10 @@ static void drop_netconsole_target(struct config_group *group, /* * The target may have never been enabled, or was manually disabled * before being removed so netpoll may have already been cleaned up. + * netpoll_cleanup() is idempotent (it skips when np->dev is NULL), so + * it is safe even if the cleanup worker already tore the netpoll down. */ - if (nt->state == STATE_ENABLED) + if (needs_cleanup) netpoll_cleanup(&nt->np); config_item_put(&nt->group.cg_item); @@ -1654,6 +1687,41 @@ static struct notifier_block netconsole_netdev_notifier = { .notifier_call = netconsole_netdev_event, }; +/* Pop a pre-allocated skb from the pool and request a refill. + * + * The pool is refilled with MAX_SKB_SIZE buffers, so a pooled skb cannot + * satisfy a larger request. Return NULL in that case rather than handing + * back a too-small skb that would later trip skb_over_panic() in skb_put(); + * the caller still polls and retries, and alloc_skb() itself can satisfy the + * oversized request once memory frees up. + * + * The refill is requested via schedule_work(), which takes the workqueue + * pool locks and is therefore not NMI-safe. Skip the refill when called + * from NMI context; the next non-NMI caller will top the pool back up. + */ +static struct sk_buff *netcons_skb_pop(struct netpoll *np, int len) +{ + struct sk_buff *skb; + + if (len > MAX_SKB_SIZE) { + /* net_warn_ratelimited() pulls in printk machinery that is not + * NMI-safe and could recurse into the nbcon console we are + * servicing, so only warn outside NMI. + */ + if (!in_nmi()) + net_warn_ratelimited("netconsole: dropping message, requested skb len %d exceeds pool buffer size %zu on %s\n", + len, (size_t)MAX_SKB_SIZE, + np->dev->name); + return NULL; + } + + skb = skb_dequeue(&np->skb_pool); + if (!in_nmi()) + schedule_work(&np->refill_wq); + + return skb; +} + static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) { int count = 0; @@ -1663,10 +1731,8 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) { - skb = skb_dequeue(&np->skb_pool); - schedule_work(&np->refill_wq); - } + if (!skb) + skb = netcons_skb_pop(np, len); if (!skb) { if (++count < 10) { diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e4b8f1f91e54..88f7daa8560e 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -13,12 +13,28 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/refcount.h> +#include <linux/ip.h> +#include <linux/udp.h> union inet_addr { __be32 ip; struct in6_addr in6; }; +/* + * Maximum payload netpoll's preallocated skb pool can carry. Keep this in + * sync with the buffer size used by refill_skbs() in net/core/netpoll.c; + * callers (e.g. netconsole) use it to detect requests the pool can never + * satisfy and avoid dequeuing a pooled skb that would later trip + * skb_over_panic() in skb_put(). + */ +#define MAX_UDP_CHUNK 1460 +#define MAX_SKB_SIZE \ + (sizeof(struct ethhdr) + \ + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + \ + MAX_UDP_CHUNK) + struct netpoll { struct net_device *dev; netdevice_tracker dev_tracker; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b3fe59445f2d..229dde818ab3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -41,16 +41,9 @@ * message gets out even in extreme OOM situations. */ -#define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 #define USEC_PER_POLL 50 -#define MAX_SKB_SIZE \ - (sizeof(struct ethhdr) + \ - sizeof(struct iphdr) + \ - sizeof(struct udphdr) + \ - MAX_UDP_CHUNK) - static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); |
