diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2025-05-15 21:30:11 +0300 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2025-05-15 21:30:12 +0300 |
| commit | 2da35e4b4df99d3dd29bacf0c054e6988013d4ec (patch) | |
| tree | b903298160aa31a09c0d70a56eaac6b1d2f6cf51 /include | |
| parent | bebd7b262638af611a0e699ba37c43ec2238801b (diff) | |
| parent | 572be9bf9d0d96242dd7977ce456009b6c690dce (diff) | |
| download | linux-2da35e4b4df99d3dd29bacf0c054e6988013d4ec.tar.xz | |
Merge branch 'tcp-receive-side-improvements'
Eric Dumazet says:
====================
tcp: receive side improvements
We have set tcp_rmem[2] to 15 MB for about 8 years at Google,
but have had some issues with high-speed flows on very small RTTs.
TCP rx autotuning has a tendency to overestimate the RTT,
and thus tp->rcvq_space.space and sk->sk_rcvbuf.
This makes TCP receive queues much bigger than necessary,
to the point that, on CPUs using DDIO, CPU caches are evicted
before the application can copy the data.
This series aims to fix this.
- The first patch adds the tcp_rcvbuf_grow() tracepoint, which was very
convenient for studying the various issues fixed in this series.
- Seven patches fix receiver autotune issues.
- Two patches fix sender side issues.
- The final patch increases tcp_rmem[2] so that TCP speed over a WAN
can meet modern needs.
Tested on a 200Gbit NIC, average max throughput of a single flow:
Before:
73593 Mbit.
After:
122514 Mbit.
====================
Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/tcp.h | 2 | ||||
| -rw-r--r-- | include/trace/events/tcp.h | 73 |
2 files changed, 74 insertions, 1 deletion
```diff
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a8af71623ba7..29f59d50dc73 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -340,7 +340,7 @@ struct tcp_sock {
 	} rcv_rtt_est;
 /* Receiver queue space */
 	struct {
-		u32 space;
+		int space;
 		u32 seq;
 		u64 time;
 	} rcvq_space;
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 53e878fa14d1..006c2116c8f6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
 	TP_ARGS(sk)
 );
 
+TRACE_EVENT(tcp_rcvbuf_grow,
+
+	TP_PROTO(struct sock *sk, int time),
+
+	TP_ARGS(sk, time),
+
+	TP_STRUCT__entry(
+		__field(int, time)
+		__field(__u32, rtt_us)
+		__field(__u32, copied)
+		__field(__u32, inq)
+		__field(__u32, space)
+		__field(__u32, ooo_space)
+		__field(__u32, rcvbuf)
+		__field(__u8, scaling_ratio)
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, 4)
+		__array(__u8, daddr, 4)
+		__array(__u8, saddr_v6, 16)
+		__array(__u8, daddr_v6, 16)
+		__field(const void *, skaddr)
+		__field(__u64, sock_cookie)
+	),
+
+	TP_fast_assign(
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		__be32 *p32;
+
+		__entry->time = time;
+		__entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+		__entry->copied = tp->copied_seq - tp->rcvq_space.seq;
+		__entry->inq = tp->rcv_nxt - tp->copied_seq;
+		__entry->space = tp->rcvq_space.space;
+		__entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 :
+				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
+				     tp->rcv_nxt;
+
+		__entry->rcvbuf = sk->sk_rcvbuf;
+		__entry->scaling_ratio = tp->scaling_ratio;
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->family = sk->sk_family;
+
+		p32 = (__be32 *) __entry->saddr;
+		*p32 = inet->inet_saddr;
+
+		p32 = (__be32 *) __entry->daddr;
+		*p32 = inet->inet_daddr;
+
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->skaddr = sk;
+		__entry->sock_cookie = sock_gen_cookie(sk);
+	),
+
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
+		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
+		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+		  __entry->time, __entry->rtt_us, __entry->copied,
+		  __entry->inq, __entry->space, __entry->ooo_space,
+		  __entry->scaling_ratio, __entry->rcvbuf,
+		  show_family_name(__entry->family),
+		  __entry->sport, __entry->dport,
+		  __entry->saddr, __entry->daddr,
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->skaddr,
+		  __entry->sock_cookie)
+);
+
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
```
