diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_rate.c | 29 |
3 files changed, 39 insertions, 1 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de02fb4b1349..2250f891f931 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ @@ -1014,6 +1017,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; @@ -1115,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 568947110b60..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -487,6 +487,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1daed6af6e80..52ff84be59ab 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -26,9 +26,13 @@ * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. */ - /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -58,6 +62,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -80,6 +85,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, after(scb->tx.delivered, rs->prior_delivered)) { rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; /* Find the duration of the "send phase" of this window: */ @@ -105,6 +111,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". @@ -147,3 +157,20 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); } } + +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tp->snd_cwnd && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; +} |