summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/linux/tcp.h28
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--include/net/tcp.h33
-rw-r--r--include/net/tcp_ecn.h554
-rw-r--r--include/uapi/linux/tcp.h9
5 files changed, 609 insertions, 17 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d103cc0e7a35..3ca5ed02de6d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -122,8 +122,9 @@ struct tcp_options_received {
smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
- u8 saw_unknown:1, /* Received unknown option */
- unused:7;
+ u8 accecn:6, /* AccECN index in header, 0=no options */
+ saw_unknown:1, /* Received unknown option */
+ unused:1;
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
@@ -168,6 +169,11 @@ struct tcp_request_sock {
* after data-in-SYN.
*/
u8 syn_tos;
+ bool accecn_ok;
+ u8 syn_ect_snt: 2,
+ syn_ect_rcv: 2,
+ accecn_fail_mode:4;
+ u8 saw_accecn_opt :2;
#ifdef CONFIG_TCP_AO
u8 ao_keyid;
u8 ao_rcv_next;
@@ -270,6 +276,7 @@ struct tcp_sock {
u32 mdev_us; /* medium deviation */
u32 rtt_seq; /* sequence number to update rttvar */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
+ u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
struct sk_buff *highest_sack; /* skb just after the highest
* skb with SACKed bit set
@@ -287,6 +294,12 @@ struct tcp_sock {
*/
u8 nonagle : 4,/* Disable Nagle algorithm? */
rate_app_limited:1; /* rate_{delivered,interval_us} limited? */
+ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */
+ unused2:4;
+ u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
+ est_ecnfield:2,/* ECN field for AccECN delivered estimates */
+ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
+ prev_ecnfield:2; /* ECN bits from the previous segment */
__be32 pred_flags;
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
u64 tcp_mstamp; /* most recent packet received/sent */
@@ -299,6 +312,11 @@ struct tcp_sock {
u32 snd_up; /* Urgent pointer */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 delivered_ce; /* Like the above but only ECE marked packets */
+ u32 received_ce; /* Like the above but for rcvd CE marked pkts */
+ u32 received_ecn_bytes[3]; /* received byte counters for three ECN
+ * types: INET_ECN_ECT_1, INET_ECN_ECT_0,
+ * and INET_ECN_CE
+ */
u32 app_limited; /* limited until "delivered" reaches this val */
u32 rcv_wnd; /* Current receiver window */
/*
@@ -326,6 +344,7 @@ struct tcp_sock {
u32 rate_delivered; /* saved rate sample: packets delivered */
u32 rate_interval_us; /* saved rate sample: time elapsed */
u32 rcv_rtt_last_tsecr;
+ u32 delivered_ecn_bytes[3];
u64 first_tx_mstamp; /* start of window send phase */
u64 delivered_mstamp; /* time we reached "delivered" */
u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
@@ -372,7 +391,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
- unused:5;
+ syn_ect_snt:2, /* AccECN ECT memory, only */
+ syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
@@ -388,6 +408,8 @@ struct tcp_sock {
syn_fastopen_child:1; /* created TFO passive child socket */
u8 keepalive_probes; /* num of allowed keep alive probes */
+ u8 accecn_fail_mode:4, /* AccECN failure handling */
+ saw_accecn_opt:2; /* An AccECN option was seen */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
/* RTT measurement */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 54a7d187f62a..34eb3aecb3f2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -148,6 +148,8 @@ struct netns_ipv4 {
struct local_ports ip_local_ports;
u8 sysctl_tcp_ecn;
+ u8 sysctl_tcp_ecn_option;
+ u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e25340459ce4..7c51a0a5ace8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
+/* Default sending frequency of accurate ECN option per RTT */
+#define TCP_ACCECN_OPTION_BEACON 3
+
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
@@ -213,6 +216,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOPT_AO 29 /* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
+#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */
+#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */
#define TCPOPT_EXP 254 /* Experimental */
/* Magic number to be after the option value for sharing TCP
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -230,6 +235,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
+#define TCPOLEN_ACCECN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOLEN_EXP_SMC_BASE 6
@@ -243,6 +249,14 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
+#define TCPOLEN_ACCECN_PERFIELD 3
+
+/* Maximum number of byte counters in AccECN option + size */
+#define TCP_ACCECN_NUMFIELDS 3
+#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \
+ TCPOLEN_ACCECN_PERFIELD * \
+ TCP_ACCECN_NUMFIELDS)
+#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -972,6 +986,18 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
+#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)
+
+#define TCP_ACCECN_CEP_ACE_MASK 0x7
+#define TCP_ACCECN_ACE_MAX_DELTA 6
+
+/* To avoid/detect middlebox interference, not all counters start at 0.
+ * See draft-ietf-tcpm-accurate-ecn for the latest values.
+ */
+#define TCP_ACCECN_CEP_INIT_OFFSET 5
+#define TCP_ACCECN_E1B_INIT_OFFSET 1
+#define TCP_ACCECN_E0B_INIT_OFFSET 1
+#define TCP_ACCECN_CEB_INIT_OFFSET 0
/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
@@ -1782,11 +1808,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
+ u32 ace;
+
/* mptcp hooks are only on the slow path */
if (sk_is_mptcp((struct sock *)tp))
return;
+ ace = tcp_ecn_mode_accecn(tp) ?
+ ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
+ TCP_ACCECN_CEP_ACE_MASK) : 0;
+
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+ (ace << 22) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index b3430557676b..f13e5cd2b1ac 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -4,14 +4,36 @@
#include <linux/tcp.h>
#include <linux/skbuff.h>
+#include <linux/bitfield.h>
#include <net/inet_connection_sock.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/inet_ecn.h>
+/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
+ * attemped to be negotiated and requested for incoming connection
+ * and outgoing connection, respectively.
+ */
+enum tcp_ecn_mode {
+ TCP_ECN_IN_NOECN_OUT_NOECN = 0,
+ TCP_ECN_IN_ECN_OUT_ECN = 1,
+ TCP_ECN_IN_ECN_OUT_NOECN = 2,
+ TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
+ TCP_ECN_IN_ACCECN_OUT_ECN = 4,
+ TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
+};
+
+/* AccECN option sending when AccECN has been successfully negotiated */
+enum tcp_accecn_option {
+ TCP_ACCECN_OPTION_DISABLED = 0,
+ TCP_ACCECN_OPTION_MINIMUM = 1,
+ TCP_ACCECN_OPTION_FULL = 2,
+};
+
static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
+ /* Do not set CWR if in AccECN mode! */
if (tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
@@ -19,8 +41,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
static inline void tcp_ecn_accept_cwr(struct sock *sk,
const struct sk_buff *skb)
{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
@@ -36,16 +60,485 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
-static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp,
- const struct tcphdr *th)
+/* tp->accecn_fail_mode */
+#define TCP_ACCECN_ACE_FAIL_SEND BIT(0)
+#define TCP_ACCECN_ACE_FAIL_RECV BIT(1)
+#define TCP_ACCECN_OPT_FAIL_SEND BIT(2)
+#define TCP_ACCECN_OPT_FAIL_RECV BIT(3)
+
+static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
+}
+
+static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
+}
+
+static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
+}
+
+static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
+}
+
+static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
+{
+ tp->accecn_fail_mode |= mode;
+}
+
+#define TCP_ACCECN_OPT_NOT_SEEN 0x0
+#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1
+#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2
+#define TCP_ACCECN_OPT_FAIL_SEEN 0x3
+
+static inline u8 tcp_accecn_ace(const struct tcphdr *th)
+{
+ return (th->ae << 2) | (th->cwr << 1) | th->ece;
+}
+
+/* Infer the ECT value our SYN arrived with from the echoed ACE field */
+static inline int tcp_accecn_extract_syn_ect(u8 ace)
+{
+ /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
+ static const int ace_to_ecn[8] = {
+ INET_ECN_ECT_0, /* 0b000 (Undefined) */
+ INET_ECN_ECT_1, /* 0b001 (Undefined) */
+ INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */
+ INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */
+ INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */
+ INET_ECN_ECT_1, /* 0b101 (Reserved) */
+ INET_ECN_CE, /* 0b110 (CE is received) */
+ INET_ECN_ECT_1 /* 0b111 (Undefined) */
+ };
+
+ return ace_to_ecn[ace & 0x7];
+}
+
+/* Check ECN field transition to detect invalid transitions */
+static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
+{
+ if (rcv == snt)
+ return true;
+
+ /* Non-ECT altered to something or something became non-ECT */
+ if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
+ return false;
+ /* CE -> ECT(0/1)? */
+ if (snt == INET_ECN_CE)
+ return false;
+ return true;
+}
+
+static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
+ u8 sent_ect)
+{
+ u8 ect = tcp_accecn_extract_syn_ect(ace);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
+ return true;
+
+ if (!tcp_ect_transition_valid(sent_ect, ect)) {
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
+ return false;
+ }
+
+ return true;
+}
+
+static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
+ u8 saw_opt)
+{
+ tp->saw_accecn_opt = saw_opt;
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+}
+
+/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
+static inline void tcp_accecn_third_ack(struct sock *sk,
+ const struct sk_buff *skb, u8 sent_ect)
+{
+ u8 ace = tcp_accecn_ace(tcp_hdr(skb));
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ switch (ace) {
+ case 0x0:
+ /* Invalid value */
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
+ break;
+ case 0x7:
+ case 0x5:
+ case 0x1:
+ /* Unused but legal values */
+ break;
+ default:
+ /* Validation only applies to first non-data packet */
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
+ !TCP_SKB_CB(skb)->sacked &&
+ tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
+ if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
+ !tp->delivered_ce)
+ tp->delivered_ce++;
+ }
+ break;
+ }
+}
+
+/* Demand the minimum # to send AccECN optnio */
+static inline void tcp_accecn_opt_demand_min(struct sock *sk,
+ u8 opt_demand_min)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 opt_demand;
+
+ opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
+ tp->accecn_opt_demand = opt_demand;
+}
+
+/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
+ * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
+ */
+static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
+{
+ switch (ecnfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ return 0; /* AccECN does not send counts of NOT_ECT */
+ case INET_ECN_ECT_1:
+ return 1;
+ case INET_ECN_CE:
+ return 2;
+ case INET_ECN_ECT_0:
+ return 3;
+ }
+ return 0;
+}
+
+/* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
+ * Some fields do not start from zero, to detect zeroing by middleboxes.
+ */
+static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
+{
+ switch (ecnfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ return 0; /* AccECN does not send counts of NOT_ECT */
+ case INET_ECN_ECT_1:
+ return TCP_ACCECN_E1B_INIT_OFFSET;
+ case INET_ECN_CE:
+ return TCP_ACCECN_CEB_INIT_OFFSET;
+ case INET_ECN_ECT_0:
+ return TCP_ACCECN_E0B_INIT_OFFSET;
+ }
+ return 0;
+}
+
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
+static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
+ bool order)
+{
+ /* Based on Table 5 of the AccECN spec to map (option, order) to
+ * the corresponding ECN conuters (ECT-1, ECT-0, or CE).
+ */
+ static const u8 optfield_lookup[2][3] = {
+ /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
+ { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
+ /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
+ { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
+ };
+
+ return optfield_lookup[order][option % 3];
+}
+
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
+ * the u32 value in tcp_sock. As we're processing TCP options, it is
+ * safe to access from - 1.
+ */
+static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
+ u32 init_offset)
+{
+ u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
+ 0xFFFFFFU;
+ u32 delta = (truncated - *cnt) & 0xFFFFFFU;
+
+ /* If delta has the highest bit set (24th bit) indicating
+ * negative, sign extend to correct an estimation using
+ * sign_extend32(delta, 24 - 1)
+ */
+ delta = sign_extend32(delta, 23);
+ *cnt += delta;
+ return (s32)delta;
+}
+
+/* Updates Accurate ECN received counters from the received IP ECN field */
+static inline void tcp_ecn_received_counters(struct sock *sk,
+ const struct sk_buff *skb, u32 len)
+{
+ u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
+ u8 is_ce = INET_ECN_is_ce(ecnfield);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool ecn_edge;
+
+ if (!INET_ECN_is_not_ect(ecnfield)) {
+ u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+ /* As for accurate ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_ecn_received_counters() when the ECN codepoint of
+ * received TCP data or ACK contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ tp->ecn_flags |= TCP_ECN_SEEN;
+
+ /* ACE counter tracks *all* segments including pure ACKs */
+ tp->received_ce += pcount;
+ tp->received_ce_pending = min(tp->received_ce_pending + pcount,
+ 0xfU);
+
+ if (len > 0) {
+ u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+ u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+ u32 bytes_mask = GENMASK_U32(31, 22);
+
+ tp->received_ecn_bytes[ecnfield - 1] += len;
+ tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
+ minlen);
+
+ /* Send AccECN option at least once per 2^22-byte
+ * increase in any ECN byte counter.
+ */
+ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+ bytes_mask) {
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
+ }
+
+ ecn_edge = tp->prev_ecnfield != ecnfield;
+ if (ecn_edge || is_ce) {
+ tp->prev_ecnfield = ecnfield;
+ /* Demand Accurate ECN change-triggered ACKs. Two ACK are
+ * demanded to indicate unambiguously the ecnfield value
+ * in the latter ACK.
+ */
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (ecn_edge)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ tp->accecn_opt_demand = 2;
+ }
+ }
+}
+
+/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
+ * initialized at the start of the half-connection. [...] These byte counters
+ * reflect only the TCP payload length, excluding TCP header and TCP options.
+ */
+static inline void tcp_ecn_received_counters_payload(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
+
+ tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
+}
+
+/* AccECN specification, 5.1: [...] a server can determine that it
+ * negotiated AccECN as [...] if the ACK contains an ACE field with
+ * the value 0b010 to 0b111 (decimal 2 to 7).
+ */
+static inline bool cookie_accecn_ok(const struct tcphdr *th)
+{
+ return tcp_accecn_ace(th) > 0x1;
+}
+
+/* Used to form the ACE flags for SYN/ACK */
+static inline u16 tcp_accecn_reflector_flags(u8 ect)
+{
+ /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
+ * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
+ * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
+ */
+ static const u8 ecn_to_ace_flags[4] = {
+ 0b010, /* Not-ECT is received */
+ 0b011, /* ECT(1) is received */
+ 0b100, /* ECT(0) is received */
+ 0b110 /* CE is received */
+ };
+
+ return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
+}
+
+/* AccECN specification, 3.1.2: If a TCP server that implements AccECN
+ * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
+ * to any combination other than 000, 011 or 111, it MUST negotiate the
+ * use of AccECN as if they had been set to 111.
+ */
+static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
+ u8 ace = tcp_accecn_ace(th);
+
+ return ace && ace != 0x3;
+}
+
+static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
+{
+ BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
+ BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
+ BUILD_BUG_ON(INET_ECN_CE != 0x3);
+
+ counter_array[INET_ECN_ECT_1 - 1] = 0;
+ counter_array[INET_ECN_ECT_0 - 1] = 0;
+ counter_array[INET_ECN_CE - 1] = 0;
+}
+
+static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
+{
+ tp->received_ce = 0;
+ tp->received_ce_pending = 0;
+ __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
+ __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_demand = 0;
+ tp->est_ecnfield = 0;
+}
+
+/* Used for make_synack to form the ACE flags */
+static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
+{
+ /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
+ * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
+ * +====================+====================================+
+ * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK |
+ * | received on SYN | AE CWR ECE |
+ * +====================+====================================+
+ * | Not-ECT | 0 1 0 |
+ * | ECT(1) | 0 1 1 |
+ * | ECT(0) | 1 0 0 |
+ * | CE | 1 1 0 |
+ * +====================+====================================+
+ */
+ th->ae = !!(ect & INET_ECN_ECT_0);
+ th->cwr = ect != INET_ECN_ECT_0;
+ th->ece = ect == INET_ECN_ECT_1;
+}
+
+static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ u32 wire_ace;
+
+ /* The final packet of the 3WHS or anything like it must reflect
+ * the SYN/ACK ECT instead of putting CEP into ACE field, such
+ * case show up in tcp_flags.
+ */
+ if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
+ wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
+ th->ece = !!(wire_ace & 0x1);
+ th->cwr = !!(wire_ace & 0x2);
+ th->ae = !!(wire_ace & 0x4);
+ tp->received_ce_pending = 0;
+ }
+}
+
+static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
+ u8 opt_offset)
+{
+ u8 *ptr = skb_transport_header(skb) + opt_offset;
+ unsigned int optlen = ptr[1] - 2;
+
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+ ptr += 2;
+
+ /* Detect option zeroing: an AccECN connection "MAY check that the
+ * initial value of the EE0B field or the EE1B field is non-zero"
+ */
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ return TCP_ACCECN_OPT_EMPTY_SEEN;
+ if (get_unaligned_be24(ptr) == 0)
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+ if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
+ return TCP_ACCECN_OPT_COUNTER_SEEN;
+ ptr += TCPOLEN_ACCECN_PERFIELD * 2;
+ if (get_unaligned_be24(ptr) == 0)
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+
+ return TCP_ACCECN_OPT_COUNTER_SEEN;
+}
+
+/* See Table 2 of the AccECN draft */
+static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
+ const struct tcphdr *th, u8 ip_dsfield)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 ace = tcp_accecn_ace(th);
+
+ switch (ace) {
+ case 0x0:
+ case 0x7:
+ /* +========+========+============+=============+
+ * | A | B | SYN/ACK | Feedback |
+ * | | | B->A | Mode of A |
+ * | | | AE CWR ECE | |
+ * +========+========+============+=============+
+ * | AccECN | No ECN | 0 0 0 | Not ECN |
+ * | AccECN | Broken | 1 1 1 | Not ECN |
+ * +========+========+============+=============+
+ */
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
+ break;
+ case 0x1:
+ case 0x5:
+ /* +========+========+============+=============+
+ * | A | B | SYN/ACK | Feedback |
+ * | | | B->A | Mode of A |
+ * | | | AE CWR ECE | |
+ * +========+========+============+=============+
+ * | AccECN | Nonce | 1 0 1 | (Reserved) |
+ * | AccECN | ECN | 0 0 1 | Classic ECN |
+ * | Nonce | AccECN | 0 0 1 | Classic ECN |
+ * | ECN | AccECN | 0 0 1 | Classic ECN |
+ * +========+========+============+=============+
+ */
+ if (tcp_ecn_mode_pending(tp))
+ /* Downgrade from AccECN, or requested initially */
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ break;
+ default:
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tp->accecn_opt_demand = 2;
+ }
+ if (INET_ECN_is_ce(ip_dsfield) &&
+ tcp_accecn_validate_syn_feedback(sk, ace,
+ tp->syn_ect_snt)) {
+ tp->received_ce++;
+ tp->received_ce_pending++;
+ }
+ break;
+ }
}
-static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp,
- const struct tcphdr *th)
+static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
+ const struct sk_buff *skb)
{
+ if (tcp_ecn_mode_pending(tp)) {
+ if (!tcp_accecn_syn_requested(th)) {
+ /* Downgrade to classic ECN feedback */
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ } else {
+ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ tp->prev_ecnfield = tp->syn_ect_rcv;
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ }
+ }
if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
@@ -61,7 +554,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
/* Packet ECN state for a SYN-ACK */
static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
- const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (tcp_ecn_disabled(tp))
@@ -69,6 +562,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
else if (tcp_ca_needs_ecn(sk) ||
tcp_bpf_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
+
+ if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
+ TCP_SKB_CB(skb)->tcp_flags |=
+ tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
+ }
}
/* Packet ECN state for a SYN. */
@@ -76,8 +576,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
- tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+ bool use_ecn, use_accecn;
+ u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
+
+ use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN;
+ use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
+ tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
+ tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
@@ -93,24 +598,45 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
INET_ECN_xmit(sk);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
- tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ if (use_accecn) {
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
+ tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
+ } else {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ }
}
}
static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
/* tp->ecn_flags are cleared at a later point in time when
* SYN ACK is ultimatively being received.
*/
- TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
+ }
}
static inline void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
- if (inet_rsk(req)->ecn_ok)
+ if (tcp_rsk(req)->accecn_ok)
+ tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
+ else if (inet_rsk(req)->ecn_ok)
th->ece = 1;
}
+static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+ u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!ecn_beacon)
+ return false;
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
+ (tp->srtt_us >> 3);
+}
+
#endif /* _LINUX_TCP_ECN_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index bdac8c42fa82..dce3113787a7 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -316,6 +316,15 @@ struct tcp_info {
* in milliseconds, including any
* unfinished recovery.
*/
+ __u32 tcpi_received_ce; /* # of CE marks received */
+ __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
+ __u32 tcpi_delivered_e0_bytes;
+ __u32 tcpi_delivered_ce_bytes;
+ __u32 tcpi_received_e1_bytes;
+ __u32 tcpi_received_e0_bytes;
+ __u32 tcpi_received_ce_bytes;
+ __u16 tcpi_accecn_fail_mode;
+ __u16 tcpi_accecn_opt_seen;
};
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */