| author | Jakub Kicinski <kuba@kernel.org> | 2024-05-30 03:21:37 +0300 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2024-05-30 03:21:38 +0300 |
| commit | 0f4b437b5fbf5141ff886bb47581123eb222c543 | |
| tree | fc8014baaf67f7eb33fa5249a7c068a43438c7cc | |
| parent | c3390677f6258748a91bf37b9bb21eab89f63b42 | |
| parent | fde6f897f2a184546bf5516ac736523ef24dc6a7 | |
Merge branch 'tcp-fix-tcp_poll-races'
Eric Dumazet says:
====================
tcp: fix tcp_poll() races
Flakes in packetdrill tests stressing epoll_wait()
were root-caused to bad ordering in tcp_write_err().
Precisely, we have to call sk_error_report() after
tcp_done().

When fixing this issue, we discovered that tcp_abort(),
tcp_v4_err() and tcp_v6_err() had similar issues.

Since tcp_reset() has the correct ordering, the first
patch takes part of it and creates the
tcp_done_with_error() helper.
====================
Link: https://lore.kernel.org/r/20240528125253.1966136-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
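
For context on how the reordering is visible to applications: below is a minimal user-space sketch (not part of the series; loopback port 1 is only assumed to be closed, and error handling is omitted for brevity) of an epoll-based caller whose wakeup path goes through tcp_poll(). With sk_error_report() called after tcp_done(), the event mask returned by epoll_wait() and the SO_ERROR value read afterwards both reflect the final, closed socket state.

```c
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(1),			/* assumed-closed port */
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	struct epoll_event ev = { .events = EPOLLOUT };
	int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
	int ep = epoll_create1(0);

	/* Non-blocking connect() returns with EINPROGRESS; the failure is
	 * delivered asynchronously through the poll/error machinery.
	 */
	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
	ev.data.fd = fd;
	epoll_ctl(ep, EPOLL_CTL_ADD, fd, &ev);

	/* The wakeup comes from sk_error_report(); with the fix it runs
	 * after tcp_done(), so the reported events and SO_ERROR agree.
	 */
	if (epoll_wait(ep, &ev, 1, 5000) == 1) {
		int err = 0;
		socklen_t len = sizeof(err);

		getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
		printf("events=0x%x SO_ERROR=%d (%s)\n",
		       ev.events, err, strerror(err));
	}
	close(ep);
	close(fd);
	return 0;
}
```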
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | include/net/tcp.h | 1 |
| -rw-r--r-- | net/ipv4/tcp.c | 8 |
| -rw-r--r-- | net/ipv4/tcp_input.c | 32 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 11 |
| -rw-r--r-- | net/ipv4/tcp_timer.c | 6 |
| -rw-r--r-- | net/ipv6/tcp_ipv6.c | 10 |
6 files changed, 31 insertions, 37 deletions
```diff
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 060e95b331a2..32815a40dea1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -677,6 +677,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 /* tcp_input.c */
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
+void tcp_done_with_error(struct sock *sk, int err);
 void tcp_reset(struct sock *sk, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
 void tcp_check_space(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 681b54e1f3a6..5fa68e7f6ddb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -598,7 +598,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		 */
 		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
-	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
 	smp_rmb();
 	if (READ_ONCE(sk->sk_err) ||
 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
@@ -4576,14 +4576,10 @@ int tcp_abort(struct sock *sk, int err)
 	bh_lock_sock(sk);
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
-		WRITE_ONCE(sk->sk_err, err);
-		/* This barrier is coupled with smp_rmb() in tcp_poll() */
-		smp_wmb();
-		sk_error_report(sk);
 		if (tcp_need_reset(sk->sk_state))
 			tcp_send_active_reset(sk, GFP_ATOMIC,
 					      SK_RST_REASON_NOT_SPECIFIED);
-		tcp_done(sk);
+		tcp_done_with_error(sk, err);
 	}
 
 	bh_unlock_sock(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c04a9c8be9d..5aadf64e554d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4436,9 +4436,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
 	return SKB_NOT_DROPPED_YET;
 }
 
+
+void tcp_done_with_error(struct sock *sk, int err)
+{
+	/* This barrier is coupled with smp_rmb() in tcp_poll() */
+	WRITE_ONCE(sk->sk_err, err);
+	smp_wmb();
+
+	tcp_write_queue_purge(sk);
+	tcp_done(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk_error_report(sk);
+}
+EXPORT_SYMBOL(tcp_done_with_error);
+
 /* When we get a reset we do this. */
 void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
+	int err;
+
 	trace_tcp_receive_reset(sk);
 
 	/* mptcp can't tell us to ignore reset pkts,
@@ -4450,24 +4467,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
 	case TCP_SYN_SENT:
-		WRITE_ONCE(sk->sk_err, ECONNREFUSED);
+		err = ECONNREFUSED;
 		break;
 	case TCP_CLOSE_WAIT:
-		WRITE_ONCE(sk->sk_err, EPIPE);
+		err = EPIPE;
 		break;
 	case TCP_CLOSE:
 		return;
 	default:
-		WRITE_ONCE(sk->sk_err, ECONNRESET);
+		err = ECONNRESET;
 	}
-	/* This barrier is coupled with smp_rmb() in tcp_poll() */
-	smp_wmb();
-
-	tcp_write_queue_purge(sk);
-	tcp_done(sk);
-
-	if (!sock_flag(sk, SOCK_DEAD))
-		sk_error_report(sk);
+	tcp_done_with_error(sk, err);
 }
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8f70b8d1d1e5..041c7eda9abe 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -611,15 +611,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 
 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 
-		if (!sock_owned_by_user(sk)) {
-			WRITE_ONCE(sk->sk_err, err);
-
-			sk_error_report(sk);
-
-			tcp_done(sk);
-		} else {
+		if (!sock_owned_by_user(sk))
+			tcp_done_with_error(sk, err);
+		else
 			WRITE_ONCE(sk->sk_err_soft, err);
-		}
 		goto out;
 	}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 83fe7f62f7f1..3e8604ae7d06 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
 
 static void tcp_write_err(struct sock *sk)
 {
-	WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
-	sk_error_report(sk);
-
-	tcp_write_queue_purge(sk);
-	tcp_done(sk);
+	tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
 
 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 750aa681779c..1ac7502e1bf5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -490,14 +490,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 		ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);
 
-		if (!sock_owned_by_user(sk)) {
-			WRITE_ONCE(sk->sk_err, err);
-			sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
-
-			tcp_done(sk);
-		} else {
+		if (!sock_owned_by_user(sk))
+			tcp_done_with_error(sk, err);
+		else
 			WRITE_ONCE(sk->sk_err_soft, err);
-		}
 		goto out;
 	case TCP_LISTEN:
 		break;
```
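
The smp_wmb()/smp_rmb() comments in the diff describe a publish/observe pairing: the error value is stored before the state change and the wakeup, and tcp_poll() derives its mask from the socket state before it reads sk->sk_err past the barrier. A rough user-space analogue using C11 fences (an illustration only, not kernel code; the variable names merely mimic the socket fields) might look like this:

```c
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int sk_err;	/* stands in for sk->sk_err              */
static atomic_int sk_closed;	/* stands in for the TCP_CLOSE state bit */

/* Writer: models tcp_done_with_error() - publish the error, then the
 * state change that a poller can observe.
 */
static void *writer(void *arg)
{
	atomic_store_explicit(&sk_err, ETIMEDOUT, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* smp_wmb() counterpart */
	atomic_store_explicit(&sk_closed, 1, memory_order_relaxed);
	return NULL;
}

/* Reader: models tcp_poll() - once it has seen the state change, the
 * fence guarantees it also sees the error published before it.
 */
static void *reader(void *arg)
{
	while (!atomic_load_explicit(&sk_closed, memory_order_relaxed))
		;					/* wait for the "wakeup" */
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() counterpart */
	printf("observed err=%d\n",
	       atomic_load_explicit(&sk_err, memory_order_relaxed));
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
```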
