diff options
author | Paolo Abeni <pabeni@redhat.com> | 2024-04-09 12:47:43 +0300 |
---|---|---|
committer | Paolo Abeni <pabeni@redhat.com> | 2024-04-09 12:47:44 +0300 |
commit | d2fd6cf39a14283da1a6892438f9685ddd93a387 (patch) | |
tree | 3536f6f2395edecdf28f4a9ce3357d0aba56c68f | |
parent | 1c25fe9a044d5334153a3585754b26553f8287b9 (diff) | |
parent | 41eecbd712b73f0d5dcf1152b9a1c27b1f238028 (diff) | |
download | linux-d2fd6cf39a14283da1a6892438f9685ddd93a387.tar.xz |
Merge branch 'tcp-fix-isn-selection-in-timewait-syn_recv'
Eric Dumazet says:
====================
tcp: fix ISN selection in TIMEWAIT -> SYN_RECV
TCP can transform a TIMEWAIT socket into a SYN_RECV one from
a SYN packet, and the ISN of the SYNACK packet is normally
generated using TIMEWAIT tw_snd_nxt.
This SYN packet also bypasses the normal checks on whether the
listen queue is full.
Unfortunately, this was broken almost a decade ago.
This series fixes the issue, in two patches.
First patch refactors code to add tcp_tw_isn as a parameter
to ->route_req(), to make the second patch smaller.
Second patch fixes the issue, by no longer using TCP_SKB_CB(skb)
to store the tcp_tw_isn.
The following packetdrill test passes after this series:
// Set up a server listening socket.
0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0
// Establish connection
+0 < S 0:0(0) win 32792 <mss 1460,nop,nop,sackOK>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
+.01 < . 1:1(0) ack 1 win 32792
+0 accept(3, ..., ...) = 4
// We close(), send a FIN, and get an ACK and FIN, in order to get into TIME_WAIT.
+.01 close(4) = 0
+0 > F. 1:1(0) ack 1
+.01 < F. 1:1(0) ack 2 win 32792
+0 > . 2:2(0) ack 2
// SYN hitting a TIME_WAIT -> should use an ISN based on TIMEWAIT tw_snd_nxt
+.01 < S 1000:1000(0) win 65535 <mss 1460,nop,nop,sackOK>
+0 > S. 65539:65539(0) ack 1001 <mss 1460,nop,nop,sackOK>
====================
Link: https://lore.kernel.org/r/20240407093322.3172088-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-rw-r--r-- | include/net/tcp.h | 13 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 28 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 15 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 10 |
7 files changed, 49 insertions, 32 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index 9ab5b37e9d53..ba6c5ae86e22 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -52,6 +52,8 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -392,7 +394,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); @@ -935,13 +938,10 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; @@ -2284,7 +2284,8 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req); + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 664c8ecb076b..b07aa71b24ec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -290,6 +290,9 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); + long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); diff --git a/net/ipv4/tcp_input.c 
b/net/ipv4/tcp_input.c index 1f28a2561795..5a45a0923a1f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7097,7 +7097,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7107,21 +7106,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; + u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); + isn = __this_cpu_read(tcp_tw_isn); + if (isn) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + __this_cpu_write(tcp_tw_isn, 0); + } else { + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. 
- */ - if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); - if (!want_cookie) - goto drop; + if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { + want_cookie = tcp_syn_flood_action(sk, + rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } } if (sk_acceptq_is_full(sk)) { @@ -7160,7 +7166,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - dst = af_ops->route_req(sk, skb, &fl, req); + dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 52963c3bb8ca..1e650ec71d2f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1666,7 +1666,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); @@ -2145,7 +2146,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2167,6 +2167,7 @@ int tcp_v4_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2382,7 +2383,7 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, 
@@ -2396,6 +2397,7 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5b21a07ddf9a..f53c7ada2ace 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -95,7 +95,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -228,7 +228,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->tcp_tw_isn = isn; + *tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cffebaec66f1..3aa9da5c9a66 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -793,7 +793,8 @@ clear_hash_nostart: static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -807,7 +808,7 @@ static void tcp_v6_init_req(struct request_sock *req, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || @@ -820,9 +821,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if 
(security_inet_conn_request(sk, skb, req)) return NULL; @@ -1739,7 +1741,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1756,6 +1757,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -1965,7 +1967,7 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; @@ -1983,6 +1985,7 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 162b218d9858..b94d1dca1094 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -289,7 +289,8 @@ EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -297,7 +298,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -356,7 +357,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock 
*req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -364,7 +366,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; |