From 4faeee0cf8a5d88d63cdbc3bab124fb0e6aed08c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 May 2023 16:34:58 +0000 Subject: tcp: deny tcp_disconnect() when threads are waiting Historically connect(AF_UNSPEC) has been abused by syzkaller and other fuzzers to trigger various bugs. A recent one triggers a divide-by-zero [1], and Paolo Abeni was able to diagnose the issue. tcp_recvmsg_locked() has tests about sk_state being not TCP_LISTEN and TCP REPAIR mode being not used. Then later if socket lock is released in sk_wait_data(), another thread can call connect(AF_UNSPEC), then make this socket a TCP listener. When recvmsg() is resumed, it can eventually call tcp_cleanup_rbuf() and attempt a divide by 0 in tcp_rcv_space_adjust() [1] This patch adds a new socket field, counting number of threads blocked in sk_wait_event() and inet_wait_for_connect(). If this counter is not zero, tcp_disconnect() returns an error. This patch adds code in blocking socket system calls, thus should not hurt performance of non blocking ones. Note that we probably could revert commit 499350a5a6e7 ("tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0") to restore original tcpi_rcv_mss meaning (was 0 if no payload was ever received on a socket) [1] divide error: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 13832 Comm: syz-executor.5 Not tainted 6.3.0-rc4-syzkaller-00224-g00c7b5f4ddc5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 RIP: 0010:tcp_rcv_space_adjust+0x36e/0x9d0 net/ipv4/tcp_input.c:740 Code: 00 00 00 00 fc ff df 4c 89 64 24 48 8b 44 24 04 44 89 f9 41 81 c7 80 03 00 00 c1 e1 04 44 29 f0 48 63 c9 48 01 e9 48 0f af c1 <49> f7 f6 48 8d 04 41 48 89 44 24 40 48 8b 44 24 30 48 c1 e8 03 48 RSP: 0018:ffffc900033af660 EFLAGS: 00010206 RAX: 4a66b76cbade2c48 RBX: ffff888076640cc0 RCX: 00000000c334e4ac RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000001 RBP: 00000000c324e86c R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8880766417f8 R13: ffff888028fbb980 R14: 0000000000000000 R15: 0000000000010344 FS: 00007f5bffbfe700(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32f25000 CR3: 000000007ced0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_recvmsg_locked+0x100e/0x22e0 net/ipv4/tcp.c:2616 tcp_recvmsg+0x117/0x620 net/ipv4/tcp.c:2681 inet6_recvmsg+0x114/0x640 net/ipv6/af_inet6.c:670 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg+0xe2/0x160 net/socket.c:1038 ____sys_recvmsg+0x210/0x5a0 net/socket.c:2720 ___sys_recvmsg+0xf2/0x180 net/socket.c:2762 do_recvmmsg+0x25e/0x6e0 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0x20f/0x260 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5c0108c0f9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5bffbfe168 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00007f5c011ac050 RCX: 00007f5c0108c0f9 RDX: 0000000000000001 RSI: 0000000020000bc0 RDI: 0000000000000003 RBP: 00007f5c010e7b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000122 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f5c012cfb1f R14: 00007f5bffbfe300 R15: 0000000000022000 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Reported-by: Paolo Abeni Diagnosed-by: Paolo Abeni Signed-off-by: Eric Dumazet Tested-by: Paolo Abeni Link: https://lore.kernel.org/r/20230526163458.2880232-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/inet_connection_sock.c') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65ad4251f6fd..1386787eaf1a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1142,6 +1142,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); + newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; -- cgit v1.2.3 From 4b095281cacab0b62e7f11d6d8c4a6d49ecaae42 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 5 Jun 2023 23:55:25 +0200 Subject: ipv4: Set correct scope in inet_csk_route_*(). RT_CONN_FLAGS(sk) overloads the tos parameter with the RTO_ONLINK bit when sk has the SOCK_LOCALROUTE flag set. This is only useful for ip_route_output_key_hash() to eventually adjust the route scope. Let's drop RTO_ONLINK and set the correct scope directly to avoid this special case in the future and to allow converting ->flowi4_tos to dscp_t. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/inet_connection_sock.c') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1386787eaf1a..15424dec4584 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -792,7 +792,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, - RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, @@ -830,7 +830,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, - RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, -- cgit v1.2.3 From 53bf91641ae19fe51fb24422de6d07565a3536d0 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Tue, 20 Jun 2023 17:27:11 +0800 Subject: inet: Cleanup on charging memory for newly accepted sockets If there is no net-memcg associated with the sock, don't bother calculating its memory usage for charge. Signed-off-by: Abel Wu Link: https://lore.kernel.org/r/20230620092712.16217-1-wuyun.abel@bytedance.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net/ipv4/inet_connection_sock.c') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 15424dec4584..0cc19cfbb673 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -706,20 +706,23 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { - int amt; + int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); - /* The socket has not been accepted yet, no need to look at - * newsk->sk_wmem_queued. - */ - amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg && amt) + if (newsk->sk_memcg) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. + */ + amt = sk_mem_pages(newsk->sk_forward_alloc + + atomic_read(&newsk->sk_rmem_alloc)); + } + + if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); -- cgit v1.2.3