From 58169ec9c40309541509181e068177eab72e6caa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 14:49:31 +0000 Subject: inet: preserve const qualifier in inet_csk() We can change inet_csk() to propagate its argument const qualifier, thanks to container_of_const(). We have to fix few places that had mistakes, like tcp_bound_rto(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240329144931.295800-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ae35199d3b3..6c27195619ab 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -742,7 +742,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) +static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; -- cgit v1.2.3 From 1eeb5043573981f3a1278876515851b7f6b1df1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:32:03 +0000 Subject: tcp/dccp: do not care about families in inet_twsk_purge() We lost ability to unload ipv6 module a long time ago. Instead of calling expensive inet_twsk_purge() twice, we can handle all families in one round. Also remove an extra line added in my prior patch, per Kuniyuki Iwashima feedback. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/netdev/20240327192934.6843-1-kuniyu@amazon.com/ Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240329153203.345203-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_timewait_sock.h | 2 +- include/net/tcp.h | 2 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 6 ------ net/ipv4/inet_timewait_sock.c | 9 +++------ net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 ------ 8 files changed, 10 insertions(+), 25 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index f28da08a37b4..2a536eea9424 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -111,7 +111,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); +void inet_twsk_purge(struct inet_hashinfo *hashinfo); static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6c27195619ab..9ab5b37e9d53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -353,7 +353,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); -void tcp_twsk_purge(struct list_head *net_exit_list, int family); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 44b033fe1ef6..9fc9cea4c251 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1039,7 +1039,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net) static void __net_exit dccp_v4_exit_batch(struct list_head 
*net_exit_list) { - inet_twsk_purge(&dccp_hashinfo, AF_INET); + inet_twsk_purge(&dccp_hashinfo); } static struct pernet_operations dccp_v4_ops = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ded07e09f813..c8ca703dc331 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1119,15 +1119,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } -static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET6); -} - static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, - .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index b0cc07d9a568..e28075f0006e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -264,7 +264,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo) { struct inet_ehash_bucket *head = &hashinfo->ehash[0]; unsigned int ehash_mask = hashinfo->ehash_mask; @@ -273,7 +273,6 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) struct sock *sk; for (slot = 0; slot <= ehash_mask; slot++, head++) { - if (hlist_nulls_empty(&head->chain)) continue; @@ -288,15 +287,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count)) + if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely(sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count))) { + if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a22ee5838751..1e0a9762f92e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3501,7 +3501,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0761f060a83..5b21a07ddf9a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -388,7 +388,7 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -void tcp_twsk_purge(struct list_head *net_exit_list, int family) +void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; @@ -396,9 +396,9 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ - inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { - inet_twsk_purge(&tcp_hashinfo, family); + inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3f4cba49e9ee..5ae74f661d25 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2389,15 +2389,9 @@ static void 
__net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) -- cgit v1.2.3 From b9e810405880c99baafd550ada7043e86465396e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 7 Apr 2024 09:33:21 +0000 Subject: tcp: propagate tcp_tw_isn via an extra parameter to ->route_req() tcp_v6_init_req() reads TCP_SKB_CB(skb)->tcp_tw_isn to find out if the request socket is created by a SYN hitting a TIMEWAIT socket. This has been buggy for a decade, lets directly pass the information from tcp_conn_request(). This is a preparatory patch to make the following one easier to review. Signed-off-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/tcp.h | 3 ++- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 10 ++++++---- net/mptcp/subflow.c | 10 ++++++---- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9ab5b37e9d53..fa0ab77acee2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2284,7 +2284,8 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req); + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1f28a2561795..48c275e6ef02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7160,7 +7160,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - dst = af_ops->route_req(sk, skb, &fl, req); + dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 52963c3bb8ca..81e2f05c244d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1666,7 +1666,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cffebaec66f1..5141f7033abd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -793,7 +793,8 @@ clear_hash_nostart: static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -807,7 +808,7 @@ static void tcp_v6_init_req(struct request_sock *req, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || @@ -820,9 +821,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct 
dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 162b218d9858..b94d1dca1094 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -289,7 +289,8 @@ EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -297,7 +298,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -356,7 +357,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -364,7 +366,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; -- cgit v1.2.3 From 41eecbd712b73f0d5dcf1152b9a1c27b1f238028 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 7 Apr 2024 09:33:22 +0000 Subject: tcp: replace TCP_SKB_CB(skb)->tcp_tw_isn with a per-cpu field TCP can transform a TIMEWAIT socket into a SYN_RECV one from a SYN packet, and the ISN of the SYNACK packet is normally generated using TIMEWAIT tw_snd_nxt : tcp_timewait_state_process() ... u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; This SYN packet also bypasses normal checks against listen queue being full or not. tcp_conn_request() ... __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; ... /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } This was using TCP_SKB_CB(skb)->tcp_tw_isn field in skb. Unfortunately this field has been accidentally cleared after the call to tcp_timewait_state_process() returning TCP_TW_SYN. Using a field in TCP_SKB_CB(skb) for a temporary state is overkill. Switch instead to a per-cpu variable. As a bonus, we do not have to clear tcp_tw_isn in TCP receive fast path. It is temporarily set then cleared only in the TCP_TW_SYN dance. 
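The handoff is safe without locking because both ends run back to back in the same softirq on the same CPU: tcp_v4_rcv()/tcp_v6_rcv() write the value just before jumping to the process label, and tcp_conn_request() consumes it within the same call chain. A minimal sketch of the per-cpu pattern, condensed from the diffs below:

	DEFINE_PER_CPU(u32, tcp_tw_isn);

	/* producer: tcp_v{4,6}_rcv(), TCP_TW_SYN case */
	__this_cpu_write(tcp_tw_isn, isn);
	goto process;

	/* consumer: tcp_conn_request(), same softirq, same CPU */
	isn = __this_cpu_read(tcp_tw_isn);
	if (isn)
		__this_cpu_write(tcp_tw_isn, 0);	/* consume exactly once */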
Fixes: 4ad19de8774e ("net: tcp6: fix double call of tcp_v6_fill_cb()") Fixes: eeea10b83a13 ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()") Signed-off-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/tcp.h | 10 +++++----- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 26 ++++++++++++++++---------- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv6/tcp_ipv6.c | 5 +++-- 6 files changed, 32 insertions(+), 21 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index fa0ab77acee2..ba6c5ae86e22 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -52,6 +52,8 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -392,7 +394,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); @@ -935,13 +938,10 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 664c8ecb076b..b07aa71b24ec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -290,6 +290,9 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); + long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 48c275e6ef02..5a45a0923a1f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7097,7 +7097,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7107,21 +7106,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; + u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); + isn = __this_cpu_read(tcp_tw_isn); + if (isn) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + __this_cpu_write(tcp_tw_isn, 0); + } else { + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. 
- */ - if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); - if (!want_cookie) - goto drop; + if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { + want_cookie = tcp_syn_flood_action(sk, + rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } } if (sk_acceptq_is_full(sk)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 81e2f05c244d..1e650ec71d2f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2146,7 +2146,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2168,6 +2167,7 @@ int tcp_v4_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2383,7 +2383,7 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2397,6 +2397,7 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5b21a07ddf9a..f53c7ada2ace 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -95,7 +95,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -228,7 +228,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->tcp_tw_isn = isn; + *tw_isn = isn; return TCP_TW_SYN; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5141f7033abd..3aa9da5c9a66 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1741,7 +1741,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1758,6 +1757,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -1967,7 +1967,7 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; @@ -1985,6 +1985,7 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } -- cgit v1.2.3 From 697a6c8cec03c2299f850fa50322641a8bf6b915 Mon Sep 17 00:00:00 2001 From: Hechao Li Date: Tue, 9 Apr 2024 09:43:55 -0700 Subject: tcp: increase the default TCP scaling ratio After commit dfa2f0483360 ("tcp: get 
rid of sysctl_tcp_adv_win_scale"), we noticed an application-level timeout due to reduced throughput. Before the commit, for a client that sets SO_RCVBUF to 65k, it takes around 22 seconds to transfer 10M data. After the commit, it takes 40 seconds. Because our application has a 30-second timeout, this regression broke the application. The reason that it takes longer to transfer data is that tp->scaling_ratio is initialized to a value that results in ~0.25 of rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k initial receive window. Later, even though the scaling_ratio is updated to a more accurate skb->len/skb->truesize, which is ~0.66 in our environment, the window stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not change together with the tp->scaling_ratio update when autotuning is disabled due to SO_RCVBUF. As a result, the window size is capped at the initial window_clamp, which is also ~0.25 * rcvbuf, and never grows bigger. Most modern applications let the kernel do autotuning, and benefit from the increased scaling_ratio. But there are applications such as kafka that has a default setting of SO_RCVBUF=64k. This patch increases the initial scaling_ratio from ~25% to 50% in order to make it backward compatible with the original default sysctl_tcp_adv_win_scale for applications setting SO_RCVBUF. Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") Signed-off-by: Hechao Li Reviewed-by: Tycho Andersen Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/netdev/20240402215405.432863-1-hli@netflix.com/ Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index ba6c5ae86e22..b935e1ae4caf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1539,11 +1539,10 @@ static inline int tcp_space_from_win(const struct sock *sk, int win) return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } -/* Assume a conservative default of 1200 bytes of payload per 4K page. +/* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). */ -#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \ - SKB_TRUESIZE(4096)) +#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) static inline void tcp_scaling_ratio_init(struct sock *sk) { -- cgit v1.2.3 From 14b5fb2145caeb909a1cd57d9cd5e0c3cd005642 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Sun, 21 Apr 2024 12:20:08 +0800 Subject: tcp: move tcp_skb_cb->sacked flags to enum Move the flag definitions for tcp_skb_cb->sacked into a new enum named tcp_skb_cb_sacked_flags, then we can get access to them in bpf via vmlinux.h, e.g., in tracepoints. This patch does not change any existing functionality. Signed-off-by: Philo Lu Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index b935e1ae4caf..ffc9371fe9de 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -928,6 +928,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +/* State flags for sacked in struct tcp_skb_cb */ +enum tcp_skb_cb_sacked_flags { + TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ + TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ + TCPCB_LOST = (1 << 2), /* SKB is lost */ + TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | + TCPCB_LOST), /* All tag bits */ + TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ + TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ + TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | + TCPCB_REPAIRED), +}; + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. @@ -950,15 +963,6 @@ struct tcp_skb_cb { __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ -#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ -#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ -#define TCPCB_LOST 0x04 /* SKB is lost */ -#define TCPCB_TAGBITS 0x07 /* All tag bits */ -#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ -#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ - TCPCB_REPAIRED) - __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ -- cgit v1.2.3 From 48e2cd3e3dcfe04f212df4fb189fa04c2a87b980 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 26 Apr 2024 00:17:23 +0800 Subject: bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args Two important arguments in RTT estimation, mrtt and srtt, are passed to tcp_bpf_rtt(), so that bpf programs get more information about RTT computation in BPF_SOCK_OPS_RTT_CB. The difference between bpf_sock_ops->srtt_us and the srtt here is: the former is an old rtt before update, while srtt passed by tcp_bpf_rtt() is that after update. 
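From a BPF program's point of view, the two values arrive in the args[] array of struct bpf_sock_ops, as with other multi-arg callbacks. An illustrative sockops sketch (the program name and the suggested map usage are hypothetical, not part of this patch):

	/* illustrative only */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("sockops")
	int rtt_cb(struct bpf_sock_ops *skops)
	{
		if (skops->op == BPF_SOCK_OPS_RTT_CB) {
			__u32 mrtt = skops->args[0];	/* Arg1: measured RTT input */
			__u32 srtt = skops->args[1];	/* Arg2: updated srtt */

			/* e.g. record both in a map for userspace analysis */
		}
		return 1;
	}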
Signed-off-by: Philo Lu Link: https://lore.kernel.org/r/20240425161724.73707-2-lulie@linux.alibaba.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 4 ++-- include/uapi/linux/bpf.h | 2 ++ net/ipv4/tcp_input.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ae35199d3b3..0f75d03287c2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2706,10 +2706,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78d..d1115d7c3936 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -911,7 +911,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ @@ -921,7 +921,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e4ae83550fb3..d94a72593ead 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6947,6 +6947,8 @@ enum { * socket transition to LISTEN state. */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt */ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. * It will be called to handle -- cgit v1.2.3 From 5691276b39daf90294c6a81fb6d62d667f634c92 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 25 Apr 2024 11:13:36 +0800 Subject: rstreason: prepare for active reset Like what we did to passive reset: only passing possible reset reason in each active reset path. No functional changes. 
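Every call site is converted mechanically for now, always passing SK_RST_REASON_NOT_SPECIFIED (from net/rstreason.h), so the new parameter only becomes informative once later patches pass specific values. The pattern, sketched:

	/* this series: no functional change */
	tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);

	/* later: each caller can report why the RST was generated */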
Signed-off-by: Jason Xing Acked-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 15 ++++++++++----- net/ipv4/tcp_output.c | 3 ++- net/ipv4/tcp_timer.c | 9 ++++++--- net/mptcp/protocol.c | 4 +++- net/mptcp/subflow.c | 5 +++-- 6 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index ffc9371fe9de..a9eb21251195 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f23b97777ea5..4ec0f4feee00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -275,6 +275,7 @@ #include #include #include +#include #include #include @@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_send_active_reset(sk, sk->sk_allocation, + SK_RST_REASON_NOT_SPECIFIED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2885,7 +2887,8 @@ adjudge_to_death: struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2903,7 +2906,8 @@ adjudge_to_death: if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err) smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce59e4499b66..41c352bf3394 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3615,7 +3615,8 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. 
-DaveM */ -void tcp_send_active_reset(struct sock *sk, gfp_t priority) +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason) { struct sk_buff *skb; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 976db57b95d4..83fe7f62f7f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,7 @@ #include #include #include +#include static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { @@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -768,7 +770,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); goto death; } @@ -795,7 +797,8 @@ static void tcp_keepalive_timer (struct timer_list *t) icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_write_err(sk); goto out; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f8bc34f0d973..065967086492 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #endif #include #include +#include #include #include "protocol.h" #include "mib.h" @@ -2569,7 +2570,8 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { - tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_send_active_reset(tcp_sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 32fe2ef36d56..ac867d277860 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -414,7 +414,7 @@ void mptcp_subflow_reset(struct sock *ssk) /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); @@ -1350,7 +1350,8 @@ reset: tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); - tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_send_active_reset(ssk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(subflow->data_avail, false); return false; } -- cgit v1.2.3 From dda4d96acb20c02920f6d9a20fdc3f4846192aeb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Apr 2024 13:40:23 +0000 Subject: tcp: move tcp_out_of_memory() to net/ipv4/tcp.c tcp_out_of_memory() has a single caller: tcp_check_oom(). The following patch will also make sk_memory_allocated() no longer visible from <net/tcp.h> and <net/sock.h>. Add a const qualifier to the sock argument of tcp_out_of_memory() and tcp_check_oom().
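With tcp_out_of_memory() made static in net/ipv4/tcp.c, outside code only sees the tcp_check_oom() predicate. Its calling pattern, condensed from __tcp_close() for illustration:

	if (tcp_check_oom(sk, 0)) {
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, GFP_ATOMIC,
				      SK_RST_REASON_NOT_SPECIFIED);
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPABORTONMEMORY);
	}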
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240429134025.1233626-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +--------- net/ipv4/tcp.c | 10 +++++++++- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index fe98fb01879b..0a51e6a45bce 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -296,14 +296,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) -{ - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; -} - static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); @@ -316,7 +308,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) void sk_forced_mem_schedule(struct sock *sk, int size); -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 388f6e115bf1..0a3aa3047083 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2752,7 +2752,15 @@ static bool tcp_too_many_orphans(int shift) READ_ONCE(sysctl_tcp_max_orphans); } -bool tcp_check_oom(struct sock *sk, int shift) +static bool tcp_out_of_memory(const struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + +bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; -- cgit v1.2.3 From 57bfc7605ca5b102ba336779ae9adbc5bbba1d96 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:16 -0700 Subject: tcp: Add new args for cong_control in tcp_congestion_ops This patch adds two new arguments for cong_control of struct tcp_congestion_ops: - ack - flag These two arguments are inherited from the caller tcp_cong_control in tcp_input.c. One use case of them is to update cwnd and pacing rate inside cong_control based on the info they provide. For example, the flag can be used to decide if it is the right time to raise or reduce a sender's cwnd. Reviewed-by: Eric Dumazet Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-2-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 2 +- net/ipv4/bpf_tcp_ca.c | 3 ++- net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_input.c | 2 +- tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 6 +++--- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index fe98fb01879b..7294da8fb780 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1172,7 +1172,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing.
(optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 7f518ea5f4ac..6bd7f8db189a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -307,7 +307,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) return 0; } -static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 7e52ab24e40a..760941e55153 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53e1150f706f..23ccfc7b1d3c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { - icsk->icsk_ca_ops->cong_control(sk, rs); + icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c index fcfbfe0336b4..52b610357309 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -5,7 +5,7 @@ #include extern void bbr_init(struct sock *sk) __ksym; -extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym; extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; @@ -42,9 +42,9 @@ void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) } SEC("struct_ops/cong_control") -void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { - bbr_main(sk, rs); + bbr_main(sk, ack, flag, rs); } SEC("struct_ops/cong_avoid") -- cgit v1.2.3 From 80e85fbdf19ecc4dfa31ecf639adb55555db02fe Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 2 May 2024 10:44:45 +0200 Subject: net: create tcp_gro_lookup helper function This pulls the flow port matching out of tcp_gro_receive, so that it can be reused for the next change, which adds the TCP fraglist GRO heuristic. 
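One detail worth noting: source and dest are adjacent __be16 fields in struct tcphdr, so the helper compares both ports with a single aligned 32-bit load and XOR. The test in tcp_gro_lookup():

	if (*(u32 *)&th->source ^ *(u32 *)&th2->source)

is equivalent to the more obvious form:

	if (th->source != th2->source || th->dest != th2->dest)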
Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Felix Fietkau Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + net/ipv4/tcp_offload.c | 41 +++++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0a51e6a45bce..79c4e8947b5d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2191,6 +2191,7 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index aa7508676315..4a194a9d36cd 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -245,6 +245,27 @@ out: return segs; } +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) +{ + struct tcphdr *th2; + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + return p; + } + + return NULL; +} + struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; @@ -282,24 +303,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - list_for_each_entry(p, head, list) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - th2 = tcp_hdr(p); - - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - goto found; - } - p = NULL; - goto out_check_final; + p = tcp_gro_lookup(head, th); + if (!p) + goto out_check_final; -found: /* Include the IP ID check below from the inner most IP hdr */ + th2 = tcp_hdr(p); flush = NAPI_GRO_CB(p)->flush; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & -- cgit v1.2.3 From 7516b27c555c1711ec17a5d891befb6986e573a3 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 2 May 2024 10:44:46 +0200 Subject: net: create tcp_gro_header_pull helper function Pull the code out of tcp_gro_receive in order to access the tcp header from tcp4/6_gro_receive. 
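Together with tcp_gro_lookup() from the previous patch, a caller can now parse the TCP header once and reuse it. A sketch of the intended shape of such a caller (the fraglist GRO heuristic itself lands in a later patch):

	th = tcp_gro_pull_header(skb);
	if (!th)
		goto flush;

	/* header is validated and pulled; look up a matching flow */
	p = tcp_gro_lookup(head, th);
	/* ... inspect th and p before deciding how to aggregate ... */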
Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Felix Fietkau Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/net/tcp.h | 4 +++- net/ipv4/tcp_offload.c | 55 ++++++++++++++++++++++++++++++------------------ net/ipv6/tcpv6_offload.c | 18 +++++++++++----- 3 files changed, 50 insertions(+), 27 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 79c4e8947b5d..8f63a163c7de 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2191,8 +2191,10 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4a194a9d36cd..87ae9808e260 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -266,40 +266,46 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) return NULL; } -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) { - struct sk_buff *pp = NULL; - struct sk_buff *p; + unsigned int thlen, hlen, off; struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; off = skb_gro_offset(skb); hlen = off + sizeof(*th); th = skb_gro_header(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; thlen = th->doff * 4; if (thlen < sizeof(*th)) - goto out; + return NULL; hlen = off + thlen; if (!skb_gro_may_pull(skb, hlen)) { th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; } skb_gro_pull(skb, thlen); + return th; +} + +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ + unsigned int thlen = th->doff * 4; + struct sk_buff *pp = NULL; + struct sk_buff *p; + struct tcphdr *th2; + unsigned int len; + __be32 flags; + unsigned int mss = 1; + int flush = 1; + int i; + len = skb_gro_len(skb); flags = tcp_flag_word(th); @@ -376,7 +382,6 @@ out_check_final: if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = p; -out: NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; @@ -405,15 +410,23 @@ EXPORT_SYMBOL(tcp_gro_complete); INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. 
*/ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - inet_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + inet_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; - return tcp_gro_receive(head, skb); + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 575e2743e331..e73a4f74fd96 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -16,15 +16,23 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - ip6_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + ip6_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; + + return tcp_gro_receive(head, skb, th); - return tcp_gro_receive(head, skb); +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) -- cgit v1.2.3