From 06b693d2eb6651a63ad85bad8673de3b7d4edd6d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Jun 2026 06:17:18 +0000 Subject: ipv4: fib: Don't dump dying fib_info in fib_leaf_notify(). syzbot reported use-after-free in nsim_fib4_prepare_event(). [0] The problem is that the following functions call fib_info_hold() / refcount_inc() while dumping fib_info under RCU, which is unsafe. * mlxsw_sp_router_fib4_event() * rocker_router_fib_event() * nsim_fib4_prepare_event() refcount_inc_not_zero() must be used, but it would be too late there. Let's guarantee the lifetime of fib_info in fib_leaf_notify(). Note that IPv6 does not need the corresponding change since fib6_table_dump() holds fib6_table.tb6_lock. [0]: refcount_t: addition on 0; use-after-free. WARNING: lib/refcount.c:25 at refcount_warn_saturate+0x9f/0x110 lib/refcount.c:25, CPU#0: kworker/u8:15/3420 Modules linked in: CPU: 0 UID: 0 PID: 3420 Comm: kworker/u8:15 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Workqueue: netns cleanup_net RIP: 0010:refcount_warn_saturate+0x9f/0x110 lib/refcount.c:25 Code: eb 66 85 db 74 3e 83 fb 01 75 4c e8 1b f1 22 fd 48 8d 3d 84 cb f1 0a 67 48 0f b9 3a eb 4a e8 08 f1 22 fd 48 8d 3d 81 cb f1 0a <67> 48 0f b9 3a eb 37 e8 f5 f0 22 fd 48 8d 3d 7e cb f1 0a 67 48 0f RSP: 0018:ffffc9000f2c7270 EFLAGS: 00010293 RAX: ffffffff84a18858 RBX: 0000000000000002 RCX: ffff888032ff9ec0 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff8f9353e0 RBP: 0000000000000000 R08: ffff888032ff9ec0 R09: 0000000000000005 R10: 0000000000000100 R11: 0000000000000004 R12: ffff8880570cc000 R13: dffffc0000000000 R14: ffff88802b40563c R15: ffff8880570cc000 FS: 0000000000000000(0000) GS:ffff888126173000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb1f4d5d000 CR3: 000000006072a000 CR4: 00000000003526f0 Call Trace: __refcount_add include/linux/refcount.h:-1 [inline] 
__refcount_inc include/linux/refcount.h:366 [inline] refcount_inc include/linux/refcount.h:383 [inline] fib_info_hold include/net/ip_fib.h:629 [inline] nsim_fib4_prepare_event drivers/net/netdevsim/fib.c:930 [inline] nsim_fib_event_schedule_work drivers/net/netdevsim/fib.c:1000 [inline] nsim_fib_event_nb+0x1055/0x1240 drivers/net/netdevsim/fib.c:1043 call_fib_notifier+0x45/0x80 net/core/fib_notifier.c:25 call_fib_entry_notifier net/ipv4/fib_trie.c:90 [inline] fib_leaf_notify net/ipv4/fib_trie.c:2176 [inline] fib_table_notify net/ipv4/fib_trie.c:2194 [inline] fib_notify+0x36b/0x5e0 net/ipv4/fib_trie.c:2217 fib_net_dump net/core/fib_notifier.c:70 [inline] register_fib_notifier+0x184/0x360 net/core/fib_notifier.c:108 nsim_fib_create+0x85d/0x9f0 drivers/net/netdevsim/fib.c:1596 nsim_dev_reload_create drivers/net/netdevsim/dev.c:1604 [inline] nsim_dev_reload_up+0x374/0x7c0 drivers/net/netdevsim/dev.c:1058 devlink_reload+0x501/0x8d0 net/devlink/dev.c:475 devlink_pernet_pre_exit+0x1ff/0x420 net/devlink/core.c:558 ops_pre_exit_list net/core/net_namespace.c:161 [inline] ops_undo_list+0x187/0x940 net/core/net_namespace.c:234 cleanup_net+0x56e/0x800 net/core/net_namespace.c:702 process_one_work kernel/workqueue.c:3314 [inline] process_scheduled_works+0xb5d/0x1860 kernel/workqueue.c:3397 worker_thread+0xa53/0xfc0 kernel/workqueue.c:3478 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Fixes: 0ae3eb7b4611 ("netdevsim: fib: Perform the route programming in a non-atomic context") Fixes: c3852ef7f2f8 ("ipv4: fib: Replay events when registering FIB notifier") Reported-by: syzbot+cb2aa2390ac024e25f5c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6a290011.39669fcc.33b062.00b1.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260610061744.2030996-2-kuniyu@google.com Signed-off-by: 
Jakub Kicinski --- include/net/ip_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 318593743b6e..541da2dde626 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -629,6 +629,11 @@ static inline void fib_info_hold(struct fib_info *fi) refcount_inc(&fi->fib_clntref); } +static inline bool fib_info_hold_safe(struct fib_info *fi) +{ + return refcount_inc_not_zero(&fi->fib_clntref); +} + static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) -- cgit v1.2.3 From 2821e85c058f81c9948a2fb1a634f7b47457d51c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 10 Jun 2026 06:17:19 +0000 Subject: net: fib_rules: Don't dump dying fib_rule in fib_rules_dump(). rocker_router_fib_event() calls fib_rule_get() during RCU dump. If the fib_rule is dying, refcount_inc() will complain about it. Let's call refcount_inc_not_zero() in fib_rules_dump(). Fixes: 5d7bfd141924 ("ipv4: fib_rules: Dump FIB rules when registering FIB notifier") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260610061744.2030996-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 5 +++++ net/core/fib_rules.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 6e68e359ad18..7dee0ae616e3 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -111,6 +111,11 @@ static inline void fib_rule_get(struct fib_rule *rule) refcount_inc(&rule->refcnt); } +static inline bool fib_rule_get_safe(struct fib_rule *rule) +{ + return refcount_inc_not_zero(&rule->refcnt); +} + static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 8ca634964e36..cf374c208732 100644 --- 
a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -349,7 +349,7 @@ jumped: if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || - likely(refcount_inc_not_zero(&rule->refcnt))) { + likely(fib_rule_get_safe(rule))) { arg->rule = rule; goto out; } @@ -410,8 +410,12 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { + if (!fib_rule_get_safe(rule)) + continue; + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); + fib_rule_put(rule); if (err) break; } -- cgit v1.2.3 From 990348e5bb457697c2f1f7f7b65154a3334d9d2b Mon Sep 17 00:00:00 2001 From: Sechang Lim Date: Thu, 11 Jun 2026 09:29:18 +0000 Subject: tcp: clear sock_ops cb flags before force-closing a child socket A child socket inherits the listener's bpf_sock_ops_cb_flags via sk_clone_lock(). If its setup fails in tcp_v4_syn_recv_sock() / tcp_v6_syn_recv_sock(), the child is freed through put_and_exit, where inet_csk_prepare_forced_close() drops the socket lock and tcp_done() runs without it. If BPF_SOCK_OPS_STATE_CB_FLAG was inherited, tcp_done() -> tcp_set_state() calls tcp_call_bpf(), which expects the lock and trips sock_owned_by_me(): WARNING: include/net/sock.h:1799 at tcp_set_state+0x433/0x550 RIP: 0010:tcp_set_state+0x433/0x550 include/net/sock.h:1799 Call Trace: tcp_done+0xba/0x250 net/ipv4/tcp.c:5095 tcp_v4_syn_recv_sock+0x850/0xa50 net/ipv4/tcp_ipv4.c:1787 tcp_check_req+0xf30/0x1360 net/ipv4/tcp_minisocks.c:926 tcp_v4_rcv+0x1047/0x1b50 net/ipv4/tcp_ipv4.c:2164 The child is freed before it is ever established, so it should run no sock_ops callback. Clear its cb flags in inet_csk_prepare_for_destroy_sock(), the common point for the IPv4, IPv6 and chtls forced-close paths and for the MPTCP ->syn_recv_sock() failure path (dispose_child), which reaches tcp_done() on a child that was never established too. 
Suggested-by: Jiayuan Chen Fixes: d44874910a26 ("bpf: Add BPF_SOCK_OPS_STATE_CB") Signed-off-by: Sechang Lim Reviewed-by: Jiayuan Chen Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260611092923.1895982-1-rhkrqnwk98@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++++ net/ipv4/inet_connection_sock.c | 1 + 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 98848db62894..607298501e12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2942,6 +2942,11 @@ static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, return tcp_call_bpf(sk, op, 3, args); } +static inline void tcp_clear_sock_ops_cb_flags(struct sock *sk) +{ + tcp_sk(sk)->bpf_sock_ops_cb_flags = 0; +} + #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { @@ -2959,6 +2964,10 @@ static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, return -EPERM; } +static inline void tcp_clear_sock_ops_cb_flags(struct sock *sk) +{ +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5b934ce8d98a..690f7fb3f029 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1285,6 +1285,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock); void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ + tcp_clear_sock_ops_cb_flags(sk); sock_set_flag(sk, SOCK_DEAD); tcp_orphan_count_inc(); } -- cgit v1.2.3 From 8eed5519e496b7a07f441a0f579cb228a33189f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2026 15:27:37 +0000 Subject: net: watchdog: fix refcount tracking races Blamed commit converted the untracked dev_hold()/dev_put() calls in the watchdog code to use the tracked dev_hold_track()/dev_put_track() (which were later 
renamed/interfaced to netdev_hold() and netdev_put()). By introducing dev->watchdog_dev_tracker to store the reference tracking information without adding synchronization between netdev_watchdog_up() and dev_watchdog(), it enabled the race condition where this pointer could be overwritten or freed concurrently, leading to the list corruption crash syzbot reported: list_del corruption, ffff888114a18c00->next is NULL kernel BUG at lib/list_debug.c:52 ! Oops: invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 1 UID: 0 PID: 91 Comm: kworker/u8:5 Not tainted syzkaller #0 PREEMPT(lazy) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026 Workqueue: events_unbound linkwatch_event RIP: 0010:__list_del_entry_valid_or_report.cold+0x22/0x2a lib/list_debug.c:52 Call Trace: __list_del_entry_valid include/linux/list.h:132 [inline] __list_del_entry include/linux/list.h:246 [inline] list_move_tail include/linux/list.h:341 [inline] ref_tracker_free+0x1a7/0x6c0 lib/ref_tracker.c:329 netdev_tracker_free include/linux/netdevice.h:4491 [inline] netdev_put include/linux/netdevice.h:4508 [inline] netdev_put include/linux/netdevice.h:4504 [inline] netdev_watchdog_down net/sched/sch_generic.c:600 [inline] dev_deactivate_many+0x28c/0xfe0 net/sched/sch_generic.c:1363 dev_deactivate+0x109/0x1d0 net/sched/sch_generic.c:1397 linkwatch_do_dev net/core/link_watch.c:184 [inline] linkwatch_do_dev+0xd3/0x120 net/core/link_watch.c:166 __linkwatch_run_queue+0x3a5/0x810 net/core/link_watch.c:240 linkwatch_event+0x8f/0xc0 net/core/link_watch.c:314 process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314 process_scheduled_works kernel/workqueue.c:3397 [inline] worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478 kthread+0x370/0x450 kernel/kthread.c:436 ret_from_fork+0x69a/0xc80 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 This patch has three coordinated parts: 1) Add dev->watchdog_lock and dev->watchdog_ref_held to serialize watchdog 
operations. 2) Remove netdev_watchdog_up() call from netif_carrier_on(): This ensures netdev_watchdog_up() is only called from process/BH context (via linkwatch workqueue dev_activate()), allowing us to use spin_lock_bh() for synchronization. 3) Synchronize watchdog up and watchdog timer: Protect netdev_watchdog_up() with tx_global_lock and watchdog_lock. Only allocate a new tracker in netdev_watchdog_up() if one is not already present. In dev_watchdog(), ensure we don't release the tracker if the timer was rescheduled either by dev_watchdog() itself or concurrently by netdev_watchdog_up(). Fixes: f12bf6f3f942 ("net: watchdog: add net device refcount tracker") Reported-by: syzbot+381d82bbf0253710b35d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6a26b751.c25708ab.1b19ef.0013.GAE@google.com/T/#u Tested-by: syzbot+3479efbc2821cb2a79f2@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260611152737.2580480-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 3 ++- net/sched/sch_generic.c | 44 +++++++++++++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0e1e581efc5a..4a0e83709f29 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1980,6 +1980,8 @@ enum netdev_reg_state { * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) + * @watchdog_lock: protect watchdog_ref_held + * @watchdog_ref_held: True if the watchdog device ref is taken. * @watchdog_timer: List of timers * * @proto_down_reason: reason a netdev interface is held down @@ -2392,6 +2394,8 @@ struct net_device { /* These may be needed for future network-power-down code. 
*/ struct timer_list watchdog_timer; int watchdog_timeo; + spinlock_t watchdog_lock; + bool watchdog_ref_held; u32 proto_down_reason; diff --git a/net/core/dev.c b/net/core/dev.c index 0c6c270d9f7d..731e661d7be6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11217,7 +11217,8 @@ static int netif_alloc_netdev_queues(struct net_device *dev) netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); - + spin_lock_init(&dev->watchdog_lock); + dev->watchdog_ref_held = false; return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a93321db8fd7..6cdf2ccfb093 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -568,16 +568,24 @@ static void dev_watchdog(struct timer_list *t) dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(oldest_start + - dev->watchdog_timeo))) - release = false; + spin_lock(&dev->watchdog_lock); + mod_timer(&dev->watchdog_timer, + round_jiffies(oldest_start + + dev->watchdog_timeo)); + release = false; + spin_unlock(&dev->watchdog_lock); } } spin_unlock(&dev->tx_global_lock); - if (release) + spin_lock(&dev->watchdog_lock); + if (timer_pending(&dev->watchdog_timer)) + release = false; + if (release && dev->watchdog_ref_held) { netdev_put(dev, &dev->watchdog_dev_tracker); + dev->watchdog_ref_held = false; + } + spin_unlock(&dev->watchdog_lock); } void netdev_watchdog_up(struct net_device *dev) @@ -586,18 +594,34 @@ void netdev_watchdog_up(struct net_device *dev) return; if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; + spin_lock_bh(&dev->tx_global_lock); + + spin_lock(&dev->watchdog_lock); if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + dev->watchdog_timeo))) - netdev_hold(dev, &dev->watchdog_dev_tracker, - GFP_ATOMIC); + round_jiffies(jiffies + dev->watchdog_timeo))) { + if (!dev->watchdog_ref_held) { + netdev_hold(dev, &dev->watchdog_dev_tracker, + GFP_ATOMIC); + 
dev->watchdog_ref_held = true; + } + } + spin_unlock(&dev->watchdog_lock); + + spin_unlock_bh(&dev->tx_global_lock); } EXPORT_SYMBOL_GPL(netdev_watchdog_up); static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); - if (timer_delete(&dev->watchdog_timer)) + + spin_lock(&dev->watchdog_lock); + if (timer_delete(&dev->watchdog_timer)) { netdev_put(dev, &dev->watchdog_dev_tracker); + dev->watchdog_ref_held = false; + } + spin_unlock(&dev->watchdog_lock); + netif_tx_unlock_bh(dev); } @@ -614,8 +638,6 @@ void netif_carrier_on(struct net_device *dev) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); - if (netif_running(dev)) - netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); -- cgit v1.2.3 From 406e8a651a7b854c41fecd5117bb282b3a6c2c6b Mon Sep 17 00:00:00 2001 From: Yiming Qian Date: Wed, 10 Jun 2026 06:21:36 +0000 Subject: net: skmsg: preserve sg.copy across SG transforms The sk_msg sg.copy bitmap is part of the scatterlist entry ownership state. A set bit tells sk_msg_compute_data_pointers() not to expose the entry through writable BPF ctx->data. This protects entries backed by pages that are not private to the sk_msg, such as splice-backed file page-cache pages. Several sk_msg transform paths move, copy, split, or compact msg->sg.data[] entries without moving the matching sg.copy bit. This can make an externally backed entry arrive at a new slot with a clear copy bit. A later SK_MSG verdict can then expose sg_virt(sge) as writable ctx->data and BPF stores can modify the original page cache. Keep sg.copy synchronized with sg.data[] whenever entries are transferred, shifted, split, or copied into a new sk_msg. Clear the bit when an entry is replaced by a newly allocated private page or freed. This covers the BPF pull/push/pop helpers, sk_msg_shift_left/right(), sk_msg_xfer(), and tls_split_open_record(), including the partial tail entry created during TLS open-record splitting. 
Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Cc: stable@vger.kernel.org Reported-by: Yiming Qian Reported-by: Keenan Dong Signed-off-by: Yiming Qian Link: https://patch.msgid.link/20260610062137.49075-1-yimingqian591@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skmsg.h | 15 +++++++++++---- net/core/filter.c | 27 +++++++++++++++++++++++++++ net/core/skmsg.c | 2 ++ net/tls/tls_sw.c | 4 ++++ 4 files changed, 44 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 19f4f253b4f9..937823856de5 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -4,6 +4,7 @@ #ifndef _LINUX_SKMSG_H #define _LINUX_SKMSG_H +#include <linux/bitops.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/scatterlist.h> @@ -199,11 +200,14 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, int which, u32 size) { dst->sg.data[which] = src->sg.data[which]; + __assign_bit(which, dst->sg.copy, test_bit(which, src->sg.copy)); dst->sg.data[which].length = size; dst->sg.size += size; src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; + if (!src->sg.data[which].length) + __clear_bit(which, src->sg.copy); } static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) @@ -273,16 +277,19 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { - if (copy_state) - __set_bit(i, msg->sg.copy); - else - __clear_bit(i, msg->sg.copy); + __assign_bit(i, msg->sg.copy, copy_state); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; } while (1); } +static inline void sk_msg_sg_copy_assign(struct sk_msg *dst, u32 dst_i, + const struct sk_msg *src, u32 src_i) +{ + __assign_bit(dst_i, dst->sg.copy, test_bit(src_i, src->sg.copy)); +} + static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) { sk_msg_sg_copy(msg, start, true); diff --git a/net/core/filter.c
b/net/core/filter.c index 80439767e0ee..40037413dd4e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2733,11 +2733,13 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, poffset += len; sge->length = 0; put_page(sg_page(sge)); + __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); + __clear_bit(first_sge, msg->sg.copy); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and @@ -2763,9 +2765,11 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, break; msg->sg.data[i] = msg->sg.data[move_from]; + sk_msg_sg_copy_assign(msg, i, msg, move_from); msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; + __clear_bit(move_from, msg->sg.copy); sk_msg_iter_var_next(i); } while (1); @@ -2794,6 +2798,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; + bool sge_copy, nsge_copy, nnsge_copy, rsge_copy = false; u8 *raw, *to, *from; struct page *page; @@ -2866,6 +2871,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); + rsge_copy = test_bit(i, msg->sg.copy); psge->length = start - offset; rsge.length -= psge->length; @@ -2890,24 +2896,32 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); + sge_copy = test_bit(new, msg->sg.copy); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); + nsge_copy = test_bit(i, msg->sg.copy); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); + nnsge_copy = test_bit(i, msg->sg.copy); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; + __assign_bit(i, 
msg->sg.copy, sge_copy); sge = nsge; + sge_copy = nsge_copy; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; + nsge_copy = nnsge_copy; nnsge = sk_msg_elem_cpy(msg, i); + nnsge_copy = test_bit(i, msg->sg.copy); } else { nsge = sk_msg_elem_cpy(msg, i); + nsge_copy = test_bit(i, msg->sg.copy); } } @@ -2921,6 +2935,7 @@ place_new: get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; + __assign_bit(new, msg->sg.copy, rsge_copy); } sk_msg_reset_curr(msg); @@ -2948,25 +2963,33 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i) prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; + sk_msg_sg_copy_assign(msg, prev, msg, i); } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); + __clear_bit(msg->sg.end, msg->sg.copy); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; + bool tmp_copy, sge_copy; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); + sge_copy = test_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); + tmp_copy = test_bit(i, msg->sg.copy); while (i != msg->sg.end) { msg->sg.data[i] = sge; + __assign_bit(i, msg->sg.copy, sge_copy); sk_msg_iter_var_next(i); sge = tmp; + sge_copy = tmp_copy; tmp = sk_msg_elem_cpy(msg, i); + tmp_copy = test_bit(i, msg->sg.copy); } } @@ -3026,6 +3049,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start - offset; int b = sge->length - pop - a; + u32 sge_i = i; + bool sge_copy = test_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); @@ -3038,6 +3063,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); + __assign_bit(i, msg->sg.copy, sge_copy); } else { struct page *page, *orig; u8 *to, *from; @@ -3054,6 +3080,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, memcpy(to, from, a); memcpy(to + a, from + a + pop, b); 
sg_set_page(sge, page, a + b, 0); + __clear_bit(sge_i, msg->sg.copy); put_page(orig); } pop = 0; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e1850caf1a71..30c3b9a2681c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -66,6 +66,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, sge = &msg->sg.data[msg->sg.end]; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); + __clear_bit(msg->sg.end, msg->sg.copy); get_page(pfrag->page); sk_msg_iter_next(msg, end); } @@ -186,6 +187,7 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, sk_mem_uncharge(sk, len); put_page(sg_page(sge)); } + __clear_bit(i, msg->sg.copy); memset(sge, 0, sizeof(*sge)); return len; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 964ebc268ee4..a47f6a1e2c77 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -623,6 +623,7 @@ static int tls_split_open_record(struct sock *sk, struct tls_rec *from, struct scatterlist *sge, *osge, *nsge; u32 orig_size = msg_opl->sg.size; struct scatterlist tmp = { }; + u32 tmp_i = 0; struct sk_msg *msg_npl; struct tls_rec *new; int ret; @@ -644,6 +645,7 @@ static int tls_split_open_record(struct sock *sk, struct tls_rec *from, if (sge->length > apply) { u32 len = sge->length - apply; + tmp_i = i; get_page(sg_page(sge)); sg_set_page(&tmp, sg_page(sge), len, sge->offset + apply); @@ -675,6 +677,7 @@ static int tls_split_open_record(struct sock *sk, struct tls_rec *from, nsge = sk_msg_elem(msg_npl, j); if (tmp.length) { memcpy(nsge, &tmp, sizeof(*nsge)); + sk_msg_sg_copy_assign(msg_npl, j, msg_opl, tmp_i); sk_msg_iter_var_next(j); nsge = sk_msg_elem(msg_npl, j); } @@ -682,6 +685,7 @@ static int tls_split_open_record(struct sock *sk, struct tls_rec *from, osge = sk_msg_elem(msg_opl, i); while (osge->length) { memcpy(nsge, osge, sizeof(*nsge)); + sk_msg_sg_copy_assign(msg_npl, j, msg_opl, i); sg_unmark_end(nsge); sk_msg_iter_var_next(i); sk_msg_iter_var_next(j); -- cgit v1.2.3