diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2026-01-23 05:28:51 +0300 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-01-23 05:28:52 +0300 |
| commit | 2849d179d442b1f3d2cb9ffdf8c0cdc8127f3b95 (patch) | |
| tree | f99f7e79688f7a3d096d100caf04010deb82db44 | |
| parent | 3b87882bb399bf6d1900a1c2cc8dde65d336680a (diff) | |
| parent | bc1f0b1c98f518867efc5cc9c22181722be14532 (diff) | |
| download | linux-2849d179d442b1f3d2cb9ffdf8c0cdc8127f3b95.tar.xz | |
Merge branch 'tcp-remove-tcp_rate-c'
Eric Dumazet says:
====================
tcp: remove tcp_rate.c
Move tcp_rate_gen() to tcp_input.c and tcp_rate_check_app_limited()
to tcp.c for better code generation.
tcp_rate.c was interesting from a code maintenance perspective
but was adding cpu costs.
====================
Link: https://patch.msgid.link/20260121095923.3134639-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | include/net/tcp.h | 5 | ||||
| -rw-r--r-- | net/ipv4/Makefile | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 18 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 110 | ||||
| -rw-r--r-- | net/ipv4/tcp_rate.c | 130 |
5 files changed, 130 insertions, 135 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h index b38327606454..efff433de9a4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -809,6 +809,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); +void tcp_rate_check_app_limited(struct sock *sk); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -1355,10 +1356,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) /* From tcp_cong.c */ void tcp_set_ca_state(struct sock *sk, const u8 ca_state); -/* From tcp_rate.c */ -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs); -void tcp_rate_check_app_limited(struct sock *sk); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ec36d2ec059e..18108a6f0499 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o tcp_ulp.o \ + tcp_recovery.o tcp_ulp.o \ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d5319ebe2452..148cdf3cd623 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1074,6 +1074,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return err; } +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. 
*/ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} +EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc8e256321b0..9e91ddbc6253 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1637,6 +1637,116 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. 
Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. + */ + +/* Update the connection delivery information and generate a rate sample. */ +static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + bool is_sack_reneg, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = tp->tcp_mstamp; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. 
Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. 
+ */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; + } +} + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) * delivery information when the skb was last transmitted. * diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c deleted file mode 100644 index f0f2ef377043..000000000000 --- a/net/ipv4/tcp_rate.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include <net/tcp.h> - -/* The bandwidth estimator estimates the rate at which the network - * can currently deliver outbound data packets for this flow. At a high - * level, it operates by taking a delivery rate sample for each ACK. - * - * A rate sample records the rate at which the network delivered packets - * for this flow, calculated over the time interval between the transmission - * of a data packet and the acknowledgment of that packet. - * - * Specifically, over the interval between each transmit and corresponding ACK, - * the estimator generates a delivery rate sample. Typically it uses the rate - * at which packets were acknowledged. However, the approach of using only the - * acknowledgment rate faces a challenge under the prevalent ACK decimation or - * compression: packets can temporarily appear to be delivered much quicker - * than the bottleneck rate. 
Since it is physically impossible to do that in a - * sustained fashion, when the estimator notices that the ACK rate is faster - * than the transmit rate, it uses the latter: - * - * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) - * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) - * bw = min(send_rate, ack_rate) - * - * Notice the estimator essentially estimates the goodput, not always the - * network bottleneck link rate when the sending or receiving is limited by - * other factors like applications or receiver window limits. The estimator - * deliberately avoids using the inter-packet spacing approach because that - * approach requires a large number of samples and sophisticated filtering. - * - * TCP flows can often be application-limited in request/response workloads. - * The estimator marks a bandwidth sample as application-limited if there - * was some moment during the sampled window of packets when there was no data - * ready to send in the write queue. - */ - -/* Update the connection delivery information and generate a rate sample. */ -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 snd_us, ack_us; - - /* Clear app limited if bubble is acked and gone. */ - if (tp->app_limited && after(tp->delivered, tp->app_limited)) - tp->app_limited = 0; - - /* TODO: there are multiple places throughout tcp_ack() to get - * current time. Refactor the code using a new "tcp_acktag_state" - * to carry current time, flags, stats like "tcp_sacktag_state". - */ - if (delivered) - tp->delivered_mstamp = tp->tcp_mstamp; - - rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ - rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available or - * in recovery from loss with SACK reneging. 
Rate samples taken during - * a SACK reneging event may overestimate bw by including packets that - * were SACKed before the reneg. - */ - if (!rs->prior_mstamp || is_sack_reneg) { - rs->delivered = -1; - rs->interval_us = -1; - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; - - rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; - /* delivered_ce occupies less than 32 bits in the skb control block */ - rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK - * compression the send phase can be longer. To be safe we use the - * longer phase. - */ - snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - - /* Record both segment send and ack receive intervals */ - rs->snd_interval_us = snd_us; - rs->rcv_interval_us = ack_us; - - /* Normally we expect interval_us >= min-rtt. - * Note that rate may still be over-estimated when a spuriously - * retransmistted skb was first (s)acked because "interval_us" - * is under-estimated (up to an RTT). However continuously - * measuring the delivery rate during loss recovery is crucial - * for connections suffer heavy or prolonged losses. 
- */ - if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - if (!rs->is_retrans) - pr_debug("tcp rate: %ld %d %u %u %u\n", - rs->interval_us, rs->delivered, - inet_csk(sk)->icsk_ca_state, - tp->rx_opt.sack_ok, tcp_min_rtt(tp)); - rs->interval_us = -1; - return; - } - - /* Record the last non-app-limited or the highest app-limited bw */ - if (!rs->is_app_limited || - ((u64)rs->delivered * tp->rate_interval_us >= - (u64)tp->rate_delivered * rs->interval_us)) { - tp->rate_delivered = rs->delivered; - tp->rate_interval_us = rs->interval_us; - tp->rate_app_limited = rs->is_app_limited; - } -} - -/* If a gap is detected between sends, mark the socket application-limited. */ -void tcp_rate_check_app_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (/* We have less than one packet to send. */ - tp->write_seq - tp->snd_nxt < tp->mss_cache && - /* Nothing in sending host's qdisc queues or NIC tx queue. */ - sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && - /* We are not limited by CWND. */ - tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && - /* All lost packets have been retransmitted. */ - tp->lost_out <= tp->retrans_out) - tp->app_limited = - (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; -} -EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); |