From 5f852eb536ad651b8734559dcf4353514cb0bea3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Feb 2015 14:10:18 -0800 Subject: tcp: tso: remove tp->tso_deferred TSO relies on ability to defer sending a small amount of packets. Heuristic is to wait for future ACKS in hope to send more packets at once. Current algorithm uses a per socket tso_deferred field as a pseudo timer. This pseudo timer relies on future ACK, but there is no guarantee we receive them in time. Fix would be to use a real timer, but cost of such timer is probably too expensive for typical cases. This patch changes the logic to test the time of last transmit, because we should not add bursts of more than 1ms for any given flow. We've used this patch for about two years at Google, before FQ/pacing as it would reduce a fair amount of bursts. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1a7adb411647..97dbf16f7d9d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -236,7 +236,6 @@ struct tcp_sock { u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u32 fackets_out; /* FACK'd packets */ - u32 tso_deferred; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; -- cgit v1.2.3 From 9439ce00f208d95703a6725e4ea986dd90e37ffd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Mar 2015 18:32:29 -0700 Subject: tcp: rename struct tcp_request_sock listener The listener field in struct tcp_request_sock is a pointer back to the listener. We now have req->rsk_listener, so TCP only needs one boolean and not a full pointer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/core/request_sock.c | 18 +++++++----------- net/ipv4/inet_connection_sock.c | 7 +++---- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_fastopen.c | 7 +------ net/ipv4/tcp_input.c | 2 +- net/ipv6/syncookies.c | 2 +- 7 files changed, 15 insertions(+), 25 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97dbf16f7d9d..f869ae8afbaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -111,7 +111,7 @@ struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; - struct sock *listener; /* needed for TFO */ + bool tfo_listener; u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ diff --git a/net/core/request_sock.c b/net/core/request_sock.c index e910317ef6d9..cc39a2aa663a 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -153,24 +153,22 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * - * When a TFO req is created, it needs to sock_hold its listener to prevent - * the latter data structure from going away. - * - * This function also sets "treq->listener" to NULL and unreference listener - * socket. treq->listener is used by the listener so it is protected by the + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { - struct sock *lsk = tcp_rsk(req)->listener; - struct fastopen_queue *fastopenq = - inet_csk(lsk)->icsk_accept_queue.fastopenq; + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; @@ -179,7 +177,6 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); reqsk_put(req); return; } @@ -201,5 +198,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3390ba6f96b2..741f0d96a7f7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -325,7 +325,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) sk_acceptq_removed(sk); if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { spin_lock_bh(&queue->fastopenq->lock); - if (tcp_rsk(req)->listener) { + if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken @@ -817,9 +817,9 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != tcp_rsk(req)->listener); + BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is @@ -828,7 +828,6 @@ void inet_csk_listen_stop(struct sock *sk) * tcp_v4_destroy_sock(). */ tcp_sk(child)->fastopen_rsk = NULL; - sock_put(sk); } inet_csk_destroy_sock(child); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb940750bb1b..574b67765a06 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -345,7 +345,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; - treq->listener = NULL; + treq->tfo_listener = false; ireq->ireq_family = AF_INET; ireq->ir_iif = sk->sk_bound_dev_if; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 84381319e1bc..186fd394ec0a 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp = tcp_sk(child); tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; + tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fbe518981d36..a94ddb96fc85 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6120,7 +6120,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (err || want_cookie) goto drop_and_free; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 039e74dd29fe..1ef0c926ce9d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -195,7 +195,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); treq = tcp_rsk(req); - treq->listener = NULL; + treq->tfo_listener = false; ireq->ireq_family = AF_INET6; if (security_inet_conn_request(sk, skb, req)) -- cgit v1.2.3 From 7f9b838b71eb78a27de27a12ca5de8542fac3115 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 6 Apr 2015 14:37:26 -0700 Subject: tcp: RFC7413 option support for Fast Open server Fast Open has been using the experimental option with a magic number (RFC6994) to request and grant Fast Open cookies. This patch enables the server to support the official IANA option 34 in RFC7413 in addition. The change has passed all existing Fast Open tests with both old and new options at Google. Signed-off-by: Daniel Lee Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 2 ++ net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 42 ++++++++++++++++++++++++++++++------------ net/ipv4/tcp_output.c | 34 +++++++++++++++++++++++----------- 5 files changed, 57 insertions(+), 23 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f869ae8afbaf..a48d00318683 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -58,6 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) struct tcp_fastopen_cookie { s8 len; u8 val[TCP_FASTOPEN_COOKIE_MAX]; + bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 963303fb96ae..7292c3c575bc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -179,6 +179,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -194,6 +195,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 +#define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 /* But this is what stacks really send out. */ diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 5da55e2b5cd2..e3d87aca6be8 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -303,6 +303,7 @@ fastopen: } else if (foc->len > 0) /* Client presents an invalid cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + valid_foc.exp = foc->exp; *foc = valid_foc; return false; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c1ce304ba8d2..24f1630b2afb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3603,6 +3603,23 @@ old_ack: return 0; } +static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, + bool syn, struct tcp_fastopen_cookie *foc, + bool exp_opt) +{ + /* Valid only in SYN or SYN-ACK with an even length. */ + if (!foc || !syn || len < 0 || (len & 1)) + return; + + if (len >= TCP_FASTOPEN_COOKIE_MIN && + len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, cookie, len); + else if (len != 0) + len = -1; + foc->len = len; + foc->exp = exp_opt; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3692,21 +3709,22 @@ void tcp_parse_options(const struct sk_buff *skb, */ break; #endif + case TCPOPT_FASTOPEN: + tcp_parse_fastopen_option( + opsize - TCPOLEN_FASTOPEN_BASE, + ptr, th->syn, foc, false); + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a - * 16 bits magic number. It's valid only in - * SYN or SYN-ACK with an even size. + * 16 bits magic number. */ - if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || - get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || - !foc || !th->syn || (opsize & 1)) - break; - foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; - if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && - foc->len <= TCP_FASTOPEN_COOKIE_MAX) - memcpy(foc->val, ptr + 2, foc->len); - else if (foc->len != 0) - foc->len = -1; + if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && + get_unaligned_be16(ptr) == + TCPOPT_FASTOPEN_MAGIC) + tcp_parse_fastopen_option(opsize - + TCPOLEN_EXP_FASTOPEN_BASE, + ptr + 2, th->syn, foc, true); break; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7404e5238e00..464bd8c5de69 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + u8 *p = (u8 *)ptr; + u32 len; /* Fast Open option length */ + + if (foc->exp) { + len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | + TCPOPT_FASTOPEN_MAGIC); + p += TCPOLEN_EXP_FASTOPEN_BASE; + } else { + len = TCPOLEN_FASTOPEN_BASE + foc->len; + *p++ = TCPOPT_FASTOPEN; + *p++ = len; + } - *ptr++ = htonl((TCPOPT_EXP << 24) | - ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | - TCPOPT_FASTOPEN_MAGIC); - - memcpy(ptr, foc->val, foc->len); - if ((foc->len & 3) == 2) { - u8 *align = ((u8 *)ptr) + foc->len; - align[0] = align[1] = TCPOPT_NOP; + memcpy(p, foc->val, foc->len); + if ((len & 3) == 2) { + p[foc->len] = TCPOPT_NOP; + p[foc->len + 1] = TCPOPT_NOP; } - ptr += (foc->len + 3) >> 2; + ptr += (len + 3) >> 2; } } @@ -641,8 +650,11 @@ static unsigned int tcp_synack_options(struct sock *sk, if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } - if (foc && foc->len >= 0) { - u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + if (foc != NULL && foc->len >= 0) { + u32 need = foc->len; + + need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : + TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; -- cgit v1.2.3 From 2646c831c00c5d22aa72b79d24069c1b412cda7c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 6 Apr 2015 14:37:27 -0700 Subject: tcp: RFC7413 option support for Fast Open client Fast Open has been using an experimental option with a magic number (RFC6994). This patch makes the client by default use the RFC7413 option (34) to get and send Fast Open cookies. This patch makes the client solicit cookies from a given server first with the RFC7413 option. If that fails to elicit a cookie, then it tries the RFC6994 experimental option. If that also fails, it uses the RFC7413 option on all subsequent connect attempts. If the server returns a Fast Open cookie then the client caches the form of the option that successfully elicited a cookie, and uses that form on later connects when it presents that cookie. The idea is to gradually obsolete the use of experimental options as the servers and clients upgrade, while keeping the interoperability meanwhile. Signed-off-by: Daniel Lee Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 3 ++- net/ipv4/tcp_input.c | 29 +++++++++++++++++++---------- net/ipv4/tcp_metrics.c | 13 +++++++++++-- net/ipv4/tcp_output.c | 6 +++++- net/ipv4/tcp_timer.c | 2 +- 6 files changed, 39 insertions(+), 15 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a48d00318683..0caa3a2d4106 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -189,6 +189,7 @@ struct tcp_sock { u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ + syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 7292c3c575bc..9598871485ce 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1339,7 +1339,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie, int *syn_loss, unsigned long *last_syn_loss); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, - struct tcp_fastopen_cookie *cookie, bool syn_lost); + struct tcp_fastopen_cookie *cookie, bool syn_lost, + u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 24f1630b2afb..031cf72cd05c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5378,8 +5378,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; - u16 mss = tp->rx_opt.mss_clamp; - bool syn_drop; + u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; + bool syn_drop = false; if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; @@ -5391,16 +5391,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } - if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + if (!tp->syn_fastopen) { + /* Ignore an unsolicited cookie */ cookie->len = -1; + } else if (tp->total_retrans) { + /* SYN timed out and the SYN-ACK neither has a cookie nor + * acknowledges data. Presumably the remote received only + * the retransmitted (regular) SYNs: either the original + * SYN-data or the corresponding SYN-ACK was dropped. + */ + syn_drop = (cookie->len < 0 && data); + } else if (cookie->len < 0 && !tp->syn_data) { + /* We requested a cookie but didn't get it. If we did not use + * the (old) exp opt format then try so next time (try_exp=1). + * Otherwise we go back to use the RFC7413 opt (try_exp=2). + */ + try_exp = tp->syn_fastopen_exp ? 2 : 1; + } - /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably - * the remote receives only the retransmitted (regular) SYNs: either - * the original SYN-data or the corresponding SYN-ACK is lost. - */ - syn_drop = (cookie->len <= 0 && data && tp->total_retrans); - - tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ tcp_for_write_queue_from(data, sk) { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 78ecc4a01712..a51d63a43e33 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -28,7 +28,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s struct tcp_fastopen_metrics { u16 mss; - u16 syn_loss:10; /* Recurring Fast Open SYN losses */ + u16 syn_loss:10, /* Recurring Fast Open SYN losses */ + try_exp:2; /* Request w/ exp. option (once) */ unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; @@ -131,6 +132,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, if (fastopen_clear) { tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.try_exp = 0; + tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; } } @@ -713,6 +716,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; + if (cookie->len <= 0 && tfom->try_exp == 1) + cookie->exp = true; *syn_loss = tfom->syn_loss; *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); @@ -721,7 +726,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, } void tcp_fastopen_cache_set(struct sock *sk, u16 mss, - struct tcp_fastopen_cookie *cookie, bool syn_lost) + struct tcp_fastopen_cookie *cookie, bool syn_lost, + u16 try_exp) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; @@ -738,6 +744,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, tfom->mss = mss; if (cookie && cookie->len > 0) tfom->cookie = *cookie; + else if (try_exp > tfom->try_exp && + tfom->cookie.len <= 0 && !tfom->cookie.exp) + tfom->try_exp = try_exp; if (syn_lost) { ++tfom->syn_loss; tfom->last_syn_loss = jiffies; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 464bd8c5de69..e662d85d1635 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -592,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } if (fastopen && fastopen->cookie.len >= 0) { - u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + u32 need = fastopen->cookie.len; + + need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : + TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; + tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 2568fd282873..8c65dc147d8b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -167,7 +167,7 @@ static int tcp_write_timeout(struct sock *sk) if (icsk->icsk_retransmits) { dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true); + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); -- cgit v1.2.3 From 0df48c26d8418c5c9fba63fac15b660d70ca2f1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:17 -0700 Subject: tcp: add tcpi_bytes_acked to tcp_info This patch tracks total number of bytes acked for a TCP socket. This is the sum of all changes done to tp->snd_una, and allows for precise tracking of delivered data. RFC4898 named this : tcpEStatsAppHCThruOctetsAcked This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_acked was placed near tp->snd_una for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ include/net/tcp.h | 2 +- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 6 +++++- net/ipv4/tcp_input.c | 13 +++++++++++-- 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0caa3a2d4106..0f73b43171da 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -150,6 +150,10 @@ struct tcp_sock { u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ + u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 051dc5c2802d..dd7b4ea6a10c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -576,7 +576,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) } /* tcp.c */ -void tcp_get_info(const struct sock *, struct tcp_info *); +void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3b9718328d8b..6666e98a0af9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -189,6 +189,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c5cd9efebbc..4bf0e8ca7b5b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2592,7 +2592,7 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2663,6 +2663,10 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) rate = READ_ONCE(sk->sk_max_pacing_rate); info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; + + spin_lock_bh(&sk->sk_lock.slock); + info->tcpi_bytes_acked = tp->bytes_acked; + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a4d9b34bed4..378d3f4d4dc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3280,6 +3280,15 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } +/* If we update tp->snd_una, also update tp->bytes_acked */ +static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + tp->bytes_acked += delta; + tp->snd_una = ack; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -3315,7 +3324,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } } - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); return flag; } @@ -3497,7 +3506,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack_seq); - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); -- cgit v1.2.3 From bdd1f9edacb5f5835d1e6276571bbbe5b88ded48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:18 -0700 Subject: tcp: add tcpi_bytes_received to tcp_info This patch tracks total number of payload bytes received on a TCP socket. This is the sum of all changes done to tp->rcv_nxt RFC4898 named this : tcpEStatsAppHCThruOctetsReceived This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_received was placed near tp->rcv_nxt for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 17 +++++++++++++---- 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux/tcp.h') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0f73b43171da..3b2911502a8c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -145,6 +145,10 @@ struct tcp_sock { * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ + u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6666e98a0af9..a48f93f3207b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -190,6 +190,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bf0e8ca7b5b..99fcc0b22c92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2666,6 +2666,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) spin_lock_bh(&sk->sk_lock.slock); info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3d87aca6be8..3c673d5e6cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 378d3f4d4dc3..7e6962bcfc30 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3289,6 +3289,15 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) tp->snd_una = ack; } +/* If we update tp->rcv_nxt, also update tp->bytes_received */ +static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) +{ + u32 delta = seq - tp->rcv_nxt; + + tp->bytes_received += delta; + tp->rcv_nxt = seq; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -4245,7 +4254,7 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4413,7 +4422,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); @@ -4506,7 +4515,7 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5254,7 +5263,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } -- cgit v1.2.3