diff options
| -rw-r--r-- | Documentation/networking/net_cachelines/tcp_sock.rst | 2 | ||||
| -rw-r--r-- | include/linux/tcp.h | 3 | ||||
| -rw-r--r-- | include/net/tcp.h | 15 | ||||
| -rw-r--r-- | include/net/tcp_ecn.h | 53 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 5 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 100 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 9 |
7 files changed, 175 insertions, 12 deletions
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 7bbda5944ee2..31313a9adccc 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -101,6 +101,8 @@ u32 prr_delivered u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) +u32 received_ce read_mostly read_write +u8:4 received_ce_pending read_mostly read_write u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index e25340459ce4..bc5159fe842e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -973,6 +973,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 + /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ @@ -1782,11 +1790,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + u32 ace; + /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b3430557676b..b0ed89dbad41 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -12,6 +12,7 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { + /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -19,8 +20,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK @@ -36,6 +39,52 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + } +} + +static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +{ + u32 wire_ace; + + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; +} + static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f643faa8b93..16456c10e5e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/xfrm.h> @@ -3406,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5138,6 +5140,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5145,7 +5148,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2793e749cfd..98782134c2f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,16 +360,25 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } @@ -385,10 +394,64 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int flag) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -3744,7 +3807,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3752,8 +3816,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3774,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; @@ -3881,6 +3950,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3905,7 +3979,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3914,12 +3989,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3940,7 +4019,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -6071,6 +6150,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb); + /* We know that such packets are checksummed * on entry. */ @@ -6119,6 +6200,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6159,6 +6241,8 @@ validate: return; step5: + tcp_ecn_received_counters(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be8ceefa5332..a3a6d3e91d84 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -328,7 +328,14 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + INET_ECN_xmit(sk); + tcp_accecn_set_ace(th, tp); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { |
