From e4da8c78973c1e307c0431e0b99a969ffb8aa3f1 Mon Sep 17 00:00:00 2001 From: Heng Guo Date: Fri, 25 Aug 2023 15:55:05 +0800 Subject: net: ipv4, ipv6: fix IPSTATS_MIB_OUTOCTETS increment duplicated Commit edf391ff1723 ("snmp: add missing counters for RFC 4293") had already added OutOctets for RFC 4293. In commit 2d8dbb04c63e ("snmp: fix OutOctets counter to include forwarded datagrams"), OutOctets was counted again, but not removed from ip_output(). According to RFC 4293 "3.2.3. IP Statistics Tables", ipIfStatsOutTransmits is not equal to ipIfStatsOutForwDatagrams. So "IPSTATS_MIB_OUTOCTETS must be incremented when incrementing" is not accurate. And IPSTATS_MIB_OUTOCTETS should be counted after fragmentation. This patch reverts commit 2d8dbb04c63e ("snmp: fix OutOctets counter to include forwarded datagrams") and moves IPSTATS_MIB_OUTOCTETS to ip_finish_output2 for ipv4. Reviewed-by: Filip Pudak Signed-off-by: Heng Guo Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 1 - net/ipv4/ip_output.c | 7 +++---- net/ipv4/ipmr.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e18931a6d153..66fac1216d46 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,7 +67,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s struct ip_options *opt = &(IPCB(skb)->opt); __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); - __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); #ifdef CONFIG_NET_SWITCHDEV if (skb->offload_l3_fwd_mark) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 43ba4b77b248..b2e0ad312028 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -207,6 +207,9 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); + /* OUTOCTETS should be counted after fragment */ + 
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) @@ -366,8 +369,6 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); - skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -424,8 +425,6 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; - IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); - skb->dev = dev; skb->protocol = htons(ETH_P_IP); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3f0c6d602fb7..9e222a57bc2b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1804,7 +1804,6 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); -- cgit v1.2.3 From fce92af1c29d90184dfec638b5738831097d66e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Aug 2023 09:55:20 +0000 Subject: ipv4: annotate data-races around fi->fib_dead syzbot complained about a data-race in fib_table_lookup() [1] Add appropriate annotations to document it. 
[1] BUG: KCSAN: data-race in fib_release_info / fib_table_lookup write to 0xffff888150f31744 of 1 bytes by task 1189 on cpu 0: fib_release_info+0x3a0/0x460 net/ipv4/fib_semantics.c:281 fib_table_delete+0x8d2/0x900 net/ipv4/fib_trie.c:1777 fib_magic+0x1c1/0x1f0 net/ipv4/fib_frontend.c:1106 fib_del_ifaddr+0x8cf/0xa60 net/ipv4/fib_frontend.c:1317 fib_inetaddr_event+0x77/0x200 net/ipv4/fib_frontend.c:1448 notifier_call_chain kernel/notifier.c:93 [inline] blocking_notifier_call_chain+0x90/0x200 kernel/notifier.c:388 __inet_del_ifa+0x4df/0x800 net/ipv4/devinet.c:432 inet_del_ifa net/ipv4/devinet.c:469 [inline] inetdev_destroy net/ipv4/devinet.c:322 [inline] inetdev_event+0x553/0xaf0 net/ipv4/devinet.c:1606 notifier_call_chain kernel/notifier.c:93 [inline] raw_notifier_call_chain+0x6b/0x1c0 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1962 [inline] call_netdevice_notifiers_mtu+0xd2/0x130 net/core/dev.c:2037 dev_set_mtu_ext+0x30b/0x3e0 net/core/dev.c:8673 do_setlink+0x5be/0x2430 net/core/rtnetlink.c:2837 rtnl_setlink+0x255/0x300 net/core/rtnetlink.c:3177 rtnetlink_rcv_msg+0x807/0x8c0 net/core/rtnetlink.c:6445 netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2549 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6463 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1914 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] sock_write_iter+0x1aa/0x230 net/socket.c:1129 do_iter_write+0x4b4/0x7b0 fs/read_write.c:860 vfs_writev+0x1a8/0x320 fs/read_write.c:933 do_writev+0xf8/0x220 fs/read_write.c:976 __do_sys_writev fs/read_write.c:1049 [inline] __se_sys_writev fs/read_write.c:1046 [inline] __x64_sys_writev+0x45/0x50 fs/read_write.c:1046 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888150f31744 of 
1 bytes by task 21839 on cpu 1: fib_table_lookup+0x2bf/0xd50 net/ipv4/fib_trie.c:1585 fib_lookup include/net/ip_fib.h:383 [inline] ip_route_output_key_hash_rcu+0x38c/0x12c0 net/ipv4/route.c:2751 ip_route_output_key_hash net/ipv4/route.c:2641 [inline] __ip_route_output_key include/net/route.h:134 [inline] ip_route_output_flow+0xa6/0x150 net/ipv4/route.c:2869 send4+0x1e7/0x500 drivers/net/wireguard/socket.c:61 wg_socket_send_skb_to_peer+0x94/0x130 drivers/net/wireguard/socket.c:175 wg_socket_send_buffer_to_peer+0xd6/0x100 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline] wg_packet_handshake_send_worker+0x10c/0x150 drivers/net/wireguard/send.c:51 process_one_work+0x434/0x860 kernel/workqueue.c:2600 worker_thread+0x5f2/0xa10 kernel/workqueue.c:2751 kthread+0x1d7/0x210 kernel/kthread.c:389 ret_from_fork+0x2e/0x40 arch/x86/kernel/process.c:145 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 value changed: 0x00 -> 0x01 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 21839 Comm: kworker/u4:18 Tainted: G W 6.5.0-syzkaller #0 Fixes: dccd9ecc3744 ("ipv4: Do not use dead fib_info entries.") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230830095520.1046984-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/ipv4/fib_semantics.c | 5 ++++- net/ipv4/fib_trie.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 65ba18a91865..eafa4a033515 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -278,7 +278,8 @@ void fib_release_info(struct fib_info *fi) hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } - fi->fib_dead = 1; + /* Paired with READ_ONCE() from fib_table_lookup() */ + WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } spin_unlock_bh(&fib_info_lock); @@ -1581,6 +1582,7 @@ struct fib_info 
*fib_create_info(struct fib_config *cfg, link_it: ofi = fib_find_info(fi); if (ofi) { + /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); refcount_inc(&ofi->fib_treeref); @@ -1619,6 +1621,7 @@ err_inval: failure: if (fi) { + /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 74d403dbd2b4..d13fb9e76b97 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1582,7 +1582,8 @@ found: if (fa->fa_dscp && inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos) continue; - if (fi->fib_dead) + /* Paired with WRITE_ONCE() in fib_release_info() */ + if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; -- cgit v1.2.3 From 5e6300e7b3a4ab5b72a82079753868e91fbf9efc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:09 +0000 Subject: net: annotate data-races around sk->sk_forward_alloc Every time sk->sk_forward_alloc is read locklessly, add a READ_ONCE(). Add sk_forward_alloc_add() helper to centralize updates, to reduce number of WRITE_ONCE(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sock.h | 12 +++++++++--- net/core/sock.c | 8 ++++---- net/ipv4/tcp_output.c | 2 +- net/ipv4/udp.c | 6 +++--- net/mptcp/protocol.c | 6 +++--- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock.h b/include/net/sock.h index 11d503417591..f04869ac1d92 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1053,6 +1053,12 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val) WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } +static inline void sk_forward_alloc_add(struct sock *sk, int val) +{ + /* Paired with lockless reads of sk->sk_forward_alloc */ + WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); +} + void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ @@ -1377,7 +1383,7 @@ static inline int sk_forward_alloc_get(const struct sock *sk) if (sk->sk_prot->forward_alloc_get) return sk->sk_prot->forward_alloc_get(sk); #endif - return sk->sk_forward_alloc; + return READ_ONCE(sk->sk_forward_alloc); } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) @@ -1673,14 +1679,14 @@ static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc -= size; + sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; - sk->sk_forward_alloc += size; + sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } diff --git a/net/core/sock.c b/net/core/sock.c index a61ec97098ad..40e1bda4bde0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1045,7 +1045,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << PAGE_SHIFT; + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + (pages << PAGE_SHIFT)); @@ -3139,10 +3139,10 @@ int __sk_mem_schedule(struct sock 
*sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -3174,7 +3174,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) void __sk_mem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; - sk->sk_forward_alloc -= amount << PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e6b4fbd642f7..ccfc8bbf7455 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3474,7 +3474,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) if (delta <= 0) return; amt = sk_mem_pages(delta); - sk->sk_forward_alloc += amt << PAGE_SHIFT; + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0794a2c46a56..f39b9c844580 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1414,9 +1414,9 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, spin_lock(&sk_queue->lock); - sk->sk_forward_alloc += size; + sk_forward_alloc_add(sk, size); amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk->sk_forward_alloc -= amt; + sk_forward_alloc_add(sk, -amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1527,7 +1527,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) goto uncharge_drop; } - sk->sk_forward_alloc -= size; + sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 
933b257eee02..625df3a36c46 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1800,7 +1800,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } /* data successfully copied into the write queue */ - sk->sk_forward_alloc -= total_ts; + sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; @@ -3257,7 +3257,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ - sk->sk_forward_alloc += msk->rmem_fwd_alloc; + sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -3522,7 +3522,7 @@ static void mptcp_shutdown(struct sock *sk, int how) static int mptcp_forward_alloc_get(const struct sock *sk) { - return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; + return READ_ONCE(sk->sk_forward_alloc) + mptcp_sk(sk)->rmem_fwd_alloc; } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) -- cgit v1.2.3 From e3390b30a5dfb112e8e802a59c0f68f947b638b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:11 +0000 Subject: net: annotate data-races around sk->sk_tsflags sk->sk_tsflags can be read locklessly, add corresponding annotations. Fixes: b9f40e21ef42 ("net-timestamp: move timestamp flags out of sk_flags") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ip.h | 2 +- include/net/sock.h | 17 ++++++++++------- net/can/j1939/socket.c | 10 ++++++---- net/core/skbuff.c | 10 ++++++---- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/socket.c | 13 +++++++------ 13 files changed, 40 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 19adacd5ece0..9276cea775cc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,7 +94,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.tsflags = inet->sk.sk_tsflags; + ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; diff --git a/include/net/sock.h b/include/net/sock.h index f04869ac1d92..b770261fbdaf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1906,7 +1906,9 @@ struct sockcm_cookie { static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { - *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags) + }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, @@ -2701,9 +2703,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - ktime_t kt = skb->tstamp; struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); - + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested @@ -2711,10 +2713,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * - hardware time stamps available 
and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || - (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || - (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || + (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); @@ -2736,7 +2738,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) + if (sk->sk_flags & FLAGS_RECV_CMSGS || + READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index feaec4ad6d16..b28c976f52a0 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -974,6 +974,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; + u32 tsflags; int err; jsk = j1939_sk(sk); @@ -981,13 +982,14 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; + tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: @@ -997,7 +999,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: - if (!(sk->sk_tsflags & 
SOF_TIMESTAMPING_RX_SOFTWARE)) + if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: @@ -1054,7 +1056,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, } serr->opt_stats = true; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 45707059082f..24f26e816184 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5207,7 +5207,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -5263,21 +5263,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 40e1bda4bde0..d05a290300b6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -937,7 +937,7 @@ int sock_set_timestamping(struct sock *sk, int optname, return ret; } - sk->sk_tsflags = val; + 
WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) @@ -1719,7 +1719,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); - v.timestamping.flags = sk->sk_tsflags; + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = sk->sk_bind_phc; break; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b2e0ad312028..4ab877cf6d35 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -981,7 +981,7 @@ static int __ip_append_data(struct sock *sk, paged = !!cork->gso_size; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d1c73660b844..cce9cb25f3b3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -511,7 +511,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). 
*/ info = PKTINFO_SKB_CB(skb); - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) || + if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cee1e548660c..cc4b250262c1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2259,14 +2259,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } - if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4ab50169a5a9..54fc4c711f2c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1501,7 +1501,7 @@ static int __ip6_append_data(struct sock *sk, orig_mtu = mtu; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 1b2772834972..5831aaa53d75 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, np); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0eae7661a85c..42fcec3ecf5e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -772,7 +772,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); - 
ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ebc6ae47cfea..86b5d509a468 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1339,7 +1339,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ diff --git a/net/socket.c b/net/socket.c index 848116d06b51..98ffffab949e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -825,7 +825,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { - bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC; + bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; @@ -877,12 +877,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; - int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; + u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. 
*/ @@ -924,11 +924,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } memset(&tss, 0, sizeof(tss)); - if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && + tsflags = READ_ONCE(sk->sk_tsflags); + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) @@ -936,14 +937,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, else hwtstamp = shhwtstamps->hwtstamp; - if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, sk->sk_bind_phc); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } -- cgit v1.2.3 From 6ac66cb03ae306c2e288a9be18226310529f5b25 Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Thu, 31 Aug 2023 10:03:30 +0200 Subject: ipv4: ignore dst hint for multipath routes Using route hints when the nexthop is part of a multipath group causes packets in the same receive batch to be sent to the same nexthop irrespective of the multipath hash of the packet. So, do not extract the route hint for packets whose destination is part of a multipath group. A new SKB flag, IPSKB_MULTIPATH, is introduced for this purpose: it is set when the route is looked up in ip_mkroute_input() and checked in ip_extract_route_hint(). Fixes: 02b24941619f ("ipv4: use dst hint for ipv4 list receive") Signed-off-by: Sriram Yagnaraman Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip.h | 1 + net/ipv4/ip_input.c | 3 ++- net/ipv4/route.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 9276cea775cc..3489a1cca5e7 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -57,6 +57,7 @@ struct inet_skb_parm { #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) +#define IPSKB_MULTIPATH BIT(9) u16 frag_max_size; }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fe9ead9ee863..5e9c8156656a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -584,7 +584,8 @@ static void ip_sublist_rcv_finish(struct list_head *head) static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { - if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || + IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a4e153dd615b..6a3f57a3fa41 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2144,6 +2144,7 @@ static int ip_mkroute_input(struct sk_buff *skb, int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); + IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif -- cgit v1.2.3 From c3b704d4a4a265660e665df51b129e8425216ed1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Sep 2023 04:23:38 +0000 Subject: igmp: limit igmpv3_newpack() packet size to IP_MAX_MTU This is a follow up of commit 915d975b2ffa ("net: deal with integer overflows in kmalloc_reserve()") based on David Laight feedback. Back in 2010, I failed to realize malicious users could set dev->mtu to arbitrary values. This mtu has been since limited to 0x7fffffff but regardless of how big dev->mtu is, it makes no sense for igmpv3_newpack() to allocate more than IP_MAX_MTU and risk various skb fields overflows. 
Fixes: 57e1ab6eaddc ("igmp: refine skb allocations") Link: https://lore.kernel.org/netdev/d273628df80f45428e739274ab9ecb72@AcuMS.aculab.com/ Signed-off-by: Eric Dumazet Reported-by: David Laight Cc: Kyle Zeng Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 0c9e768e5628..418e5fb58fd3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -353,8 +353,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - unsigned int size = mtu; + unsigned int size; + size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); -- cgit v1.2.3