From 73ed8e03388d16c12fc577e5c700b58a29045a15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:37 +0000 Subject: tcp: fix cookie_init_timestamp() overflows cookie_init_timestamp() is supposed to return a 64bit timestamp suitable for both TSval determination and setting of skb->tstamp. Unfortunately it uses 32bit fields and overflows after 2^32 * 10^6 nsec (~49 days) of uptime. Generated TSval are still correct, but skb->tstamp might be set far away in the past, potentially confusing other layers. tcp_ns_to_ts() is changed to return a full 64bit value, ts and ts_now variables are changed to u64 type, and TSMASK is removed in favor of shifts operations. While we are at it, change this sequence: ts >>= TSBITS; ts--; ts <<= TSBITS; ts |= options; to: ts -= (1UL << TSBITS); Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index bad304d173a5..d47a57a47b50 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -805,7 +805,7 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) } /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u32 tcp_ns_to_ts(u64 ns) +static inline u64 tcp_ns_to_ts(u64 ns) { return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } -- cgit v1.2.3 From 99d679556d737a14391c68e562d94076c2983252 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:38 +0000 Subject: tcp: add tcp_time_stamp_ms() helper In preparation of adding usec TCP TS values, add tcp_time_stamp_ms() for contexts needing ms based values. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d47a57a47b50..9fc6dc4ba9e2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -804,6 +804,11 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) +{ + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); +} + /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ static inline u64 tcp_ns_to_ts(u64 ns) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab87f0285b72..ffce17545b62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2856,7 +2856,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) static void tcp_update_rto_time(struct tcp_sock *tp) { if (tp->rto_stamp) { - tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp; + tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp; tp->rto_stamp = 0; } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0862b73dd3b5..63247c78dc13 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -422,7 +422,7 @@ static void tcp_update_rto_stats(struct sock *sk) if (!icsk->icsk_retransmits) { tp->total_rto_recoveries++; - tp->rto_stamp = tcp_time_stamp(tp); + tp->rto_stamp = tcp_time_stamp_ms(tp); } icsk->icsk_retransmits++; tp->total_rto++; -- cgit v1.2.3 From 2a7c8d291ffeba69a47d8528987156f625cc05b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:39 +0000 Subject: tcp: introduce tcp_clock_ms() It delivers current TCP time stamp in ms unit, and is used in place of confusing tcp_time_stamp_raw(). It is the same family as tcp_clock_ns() and tcp_clock_us(). tcp_time_stamp_raw() will be replaced later for TSval contexts with a more descriptive name. 
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_minisocks.c | 4 ++-- net/netfilter/nf_synproxy_core.c | 2 +- tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c | 4 ++-- 5 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9fc6dc4ba9e2..3bdf1141f5a2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -798,6 +798,11 @@ static inline u64 tcp_clock_us(void) return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } +static inline u64 tcp_clock_ms(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); +} + /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 56a8d936000f..5b034b0356ec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3817,10 +3817,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; - if (tp->rto_stamp) { - info->tcpi_total_rto_time += tcp_time_stamp_raw() - - tp->rto_stamp; - } + if (tp->rto_stamp) + info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; unlock_sock_fast(sk, slow); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f87611077ef..a9fdba897a28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -567,8 +567,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto = req->num_timeout; newtp->total_rto_recoveries = 1; - newtp->total_rto_time = tcp_time_stamp_raw() - - newtp->retrans_stamp; + newtp->total_rto_time = tcp_clock_ms() - + newtp->retrans_stamp; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c 
index 16915f8eef2b..467671f2d42f 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp_raw() & ~0x3f; + opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index 07d786329105..e959336c7a73 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -177,7 +177,7 @@ static __always_inline __u32 tcp_ns_to_ts(__u64 ns) return ns / (NSEC_PER_SEC / TCP_TS_HZ); } -static __always_inline __u32 tcp_time_stamp_raw(void) +static __always_inline __u32 tcp_clock_ms(void) { return tcp_ns_to_ts(tcp_clock_ns()); } @@ -274,7 +274,7 @@ static __always_inline bool tscookie_init(struct tcphdr *tcp_header, if (!loop_ctx.option_timestamp) return false; - cookie = tcp_time_stamp_raw() & ~TSMASK; + cookie = tcp_clock_ms() & ~TSMASK; cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; if (loop_ctx.option_sack) cookie |= TS_OPT_SACK; -- cgit v1.2.3 From 16cf6477741bdaa287d5e4531a1a503618a41a22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:40 +0000 Subject: tcp: replace tcp_time_stamp_raw() In preparation of usec TCP TS support, remove tcp_time_stamp_raw() in favor of tcp_clock_ts() helper. This helper will return a suitable 32bit result to feed TS values, depending on a socket field. Also add tcp_tw_tsval() and tcp_rsk_tsval() helpers to factorize the details. We do not yet support usec timestamps. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 25 +++++++++++++++++++------ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3bdf1141f5a2..0534526a535d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -803,6 +803,16 @@ static inline u64 tcp_clock_ms(void) return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); } +/* TCP Timestamp included in TS option (RFC 1323) can either use ms + * or usec resolution. Each socket carries a flag to select one or other + * resolution, as the route attribute could change anytime. + * Each flow must stick to initial resolution. + */ +static inline u32 tcp_clock_ts(bool usec_ts) +{ + return usec_ts ? tcp_clock_us() : tcp_clock_ms(); +} + /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { @@ -820,12 +830,6 @@ static inline u64 tcp_ns_to_ts(u64 ns) return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } -/* Could use tcp_clock_us() / 1000, but this version uses a single divide */ -static inline u32 tcp_time_stamp_raw(void) -{ - return tcp_ns_to_ts(tcp_clock_ns()); -} - void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) @@ -844,6 +848,15 @@ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) +{ + return tcp_clock_ts(false) + tcptw->tw_ts_offset; +} + +static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) +{ + return tcp_clock_ts(false) + treq->ts_off; +} #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b034b0356ec..805f8341064f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3632,7 +3632,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if 
(!tp->repair) err = -EPERM; else - WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw()); + WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -4143,7 +4143,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); + val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a67a5de86253..cdd65cc594bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -954,7 +954,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -988,7 +988,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d410703bb5a1..1ee6517e9b2f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1096,7 +1096,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp_raw() + tcptw->tw_ts_offset, + tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); @@ -1123,7 +1123,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, + tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), ipv6_get_dsfield(ipv6_hdr(skb)), 0, -- cgit v1.2.3 From d1a02ed66fe62aa2edd77bd54e270ebc33bd12ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:41 +0000 Subject: tcp: rename tcp_skb_timestamp() This helper returns a 32bit TCP TSval from skb->tstamp. As we are going to support usec or ms units soon, rename it to tcp_skb_timestamp_ts() and add a boolean to select the unit. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 14 +++++++++----- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 8 ++++---- net/ipv4/tcp_timer.c | 4 ++-- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0534526a535d..493f8550055b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -837,17 +837,21 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } -static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) -{ - return tcp_ns_to_ts(skb->skb_mstamp_ns); -} - /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } +/* Provide skb TSval in usec or ms unit */ +static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) +{ + if (usec_ts) + return tcp_skb_timestamp_us(skb); + + return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); +} + static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { return tcp_clock_ts(false) + tcptw->tw_ts_offset; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ffce17545b62..de68cad82d19 100644 --- a/net/ipv4/tcp_input.c +++ 
b/net/ipv4/tcp_input.c @@ -2442,7 +2442,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb)); } /* Nothing was retransmitted or returned timestamp is less diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 909f85aefd74..03a2a9fc0dc1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; + opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -884,7 +884,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; + opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -943,7 +943,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -3379,7 +3379,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) /* Save stamp of the first (attempted) retransmit. 
*/ if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); + tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 63247c78dc13..8764a9a2dc21 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -479,7 +479,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, return false; rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - - (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); + (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); return rtx_delta > timeout; } @@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); + rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), -- cgit v1.2.3 From 003e07a1e48e9423647d2fef1c86b4caab3a94be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:42 +0000 Subject: tcp: move tcp_ns_to_ts() to net/ipv4/syncookies.c tcp_ns_to_ts() is only used once from cookie_init_timestamp(). Also add the 'bool usec_ts' parameter to enable usec TS later. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 6 ------ net/ipv4/syncookies.c | 10 +++++++++- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 493f8550055b..b86abf1fbe46 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -824,12 +824,6 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ -static inline u64 tcp_ns_to_ts(u64 ns) -{ - return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); -} - void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3b4dafefb4b0..62395fdb0ca5 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -51,6 +51,14 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, count, &syncookie_secret[c]); } +/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ +static u64 tcp_ns_to_ts(bool usec_ts, u64 val) +{ + if (usec_ts) + return div_u64(val, NSEC_PER_USEC); + + return div_u64(val, NSEC_PER_MSEC); +} /* * when syncookies are in effect and tcp timestamps are enabled we encode @@ -62,7 +70,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); - u64 ts, ts_now = tcp_ns_to_ts(now); + u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; -- cgit v1.2.3 From 9d0c00f5ca05be9e89649c156f9d5b9421fc534e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:43 +0000 Subject: tcp: rename tcp_time_stamp() to tcp_time_stamp_ts() This helper returns a TSval from a TCP socket. 
It currently calls tcp_time_stamp_ms() but will soon be able to return a usec based TSval, depending on an upcoming tp->tcp_usec_ts field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 9 ++++----- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_lp.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 10 +++++----- 5 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b86abf1fbe46..af72c1dc37f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -813,15 +813,14 @@ static inline u32 tcp_clock_ts(bool usec_ts) return usec_ts ? tcp_clock_us() : tcp_clock_ms(); } -/* This should only be used in contexts where tp->tcp_mstamp is up to date */ -static inline u32 tcp_time_stamp(const struct tcp_sock *tp) +static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { - return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); + return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } -static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) +static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { - return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); + return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index de68cad82d19..e7e38fc1d62f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -704,7 +704,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; + u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { @@ -3148,7 +3148,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { - u32 delta = tcp_time_stamp(tp) - 
tp->rx_opt.rcv_tsecr; + u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) @@ -6293,7 +6293,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp(tp))) { + tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ae36780977d2..52fe17167460 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -272,7 +272,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - u32 now = tcp_time_stamp(tp); + u32 now = tcp_time_stamp_ts(tp); u32 delta; if (sample->rtt_us > 0) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03a2a9fc0dc1..a1fec8be9ac3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3961,7 +3961,7 @@ int tcp_connect(struct sock *sk) tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tcp_mstamp_refresh(tp); - tp->retrans_stamp = tcp_time_stamp(tp); + tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8764a9a2dc21..bfcf3fe44c72 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -33,7 +33,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; - elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; + elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts; remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -226,7 +226,7 @@ static bool retransmits_timed_out(struct sock *sk, 
timeout = tcp_model_timeout(sk, boundary, rto_base); } - return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; + return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ @@ -462,7 +462,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_time_stamp(tp); + tp->retrans_stamp = tcp_time_stamp_ts(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, req->timeout << req->num_timeout, TCP_RTO_MAX); } @@ -478,7 +478,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, if (rcv_delta <= timeout) return false; - rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); return rtx_delta > timeout; @@ -534,7 +534,7 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); + rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), -- cgit v1.2.3 From 3d44de9a10ea2b1658dfaed8ea6d3d7b6e0defbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:45 +0000 Subject: tcp: add RTAX_FEATURE_TCP_USEC_TS This new dst feature flag will be used to allow TCP to use usec based timestamps instead of msec ones. ip route .... feature tcp_usec_ts Also document that RTAX_FEATURE_SACK and RTAX_FEATURE_TIMESTAMP are unused. RTAX_FEATURE_ALLFRAG is also going away soon. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 5 +++++ include/uapi/linux/rtnetlink.h | 18 +++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e15452df9804..04a0e647ef74 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -576,4 +576,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); +static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) +{ + return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); +} + #endif /* _LINUX_TCP_H */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 51c13cf9c5ae..aa2482a0614a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -502,13 +502,17 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN (1 << 0) -#define RTAX_FEATURE_SACK (1 << 1) -#define RTAX_FEATURE_TIMESTAMP (1 << 2) -#define RTAX_FEATURE_ALLFRAG (1 << 3) - -#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ - RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) /* unused */ +#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ +#define RTAX_FEATURE_ALLFRAG (1 << 3) +#define RTAX_FEATURE_TCP_USEC_TS (1 << 4) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ + RTAX_FEATURE_TCP_USEC_TS) struct rta_session { __u8 proto; -- cgit v1.2.3 From af7721448a609d1912b57c825194ef6e17fc71a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:46 +0000 Subject: tcp: introduce TCP_PAWS_WRAP tcp_paws_check() uses TCP_PAWS_24DAYS constant to detect if TCP TS values might have wrapped after a long idle period. 
This mechanism is described in RFC 7323 5.5 (Outdated Timestamps) TCP_PAWS_24DAYS value was based on the assumption of a clock of 1 Khz. As we want to adopt a 1 Mhz clock in the future, we reduce this constant. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index af72c1dc37f3..0ab577869d7a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 -#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) +/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds + * to avoid overflows. This assumes a clock smaller than 1 Mhz. + * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. + */ +#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) + #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN @@ -1619,7 +1624,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), - rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) + rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, -- cgit v1.2.3 From 614e8316aa4cafba3e204cb8ee48bd12b92f3d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:47 +0000 Subject: tcp: add support for usec resolution in TCP TS values Back in 2015, Van Jacobson suggested to use usec resolution in TCP TS values. This has been implemented in our private kernels. Goals were : 1) better observability of delays in networking stacks. 2) better disambiguation of events based on TSval/ecr values. 3) building block for congestion control modules needing usec resolution. 
Back then we implemented a scheme based on private SYN options to negotiate the feature. For upstream submission, we chose to use a route attribute, because this feature is probably going to be used in private networks [1] [2]. ip route add 10/8 ... features tcp_usec_ts Note that RFC 7323 recommends a "timestamp clock frequency in the range 1 ms to 1 sec per tick.", but also mentions "the maximum acceptable clock frequency is one tick every 59 ns." [1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests to invalidate TS.Recent values after a flow was idle for more than 24 days. This is the part making usec_ts a problem for peers following this recommendation for long living idle flows. [2] Attempts to standardize usec ts went nowhere: https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 +++- include/net/inet_timewait_sock.h | 3 ++- include/net/tcp.h | 6 ++++-- net/ipv4/syncookies.c | 6 +++++- net/ipv4/tcp.c | 18 ++++++++++++++---- net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 19 ++++++++++++++----- net/ipv4/tcp_output.c | 12 ++++++++---- net/ipv4/tcp_timer.c | 40 ++++++++++++++++++++++++++-------------- net/ipv6/tcp_ipv6.c | 1 + 11 files changed, 82 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 04a0e647ef74..6df715b6e51d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -152,6 +152,7 @@ struct tcp_request_sock { u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; + s8 req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif @@ -257,7 +258,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + tcp_usec_ts:1, /* TSval values in usec */ + unused:4; u32 
chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 4a8e578405cb..b14999ff55db 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -67,7 +67,8 @@ struct inet_timewait_sock { /* And these are ours. */ unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 3, /* 3 bits hole */ + tw_usec_ts : 1, + tw_pad : 2, /* 2 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ab577869d7a..39b731c900dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { + if (tp->tcp_usec_ts) + return tp->tcp_mstamp; return tcp_time_stamp_ms(tp); } @@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { - return tcp_clock_ts(false) + tcptw->tw_ts_offset; + return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; } static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) { - return tcp_clock_ts(false) + treq->ts_off; + return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 62395fdb0ca5..c64334363230 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now) if (ts > ts_now) ts -= (1UL << TSBITS); - return ts * (NSEC_PER_SEC / TCP_TS_HZ); + if (tcp_rsk(req)->req_usec_ts) + return ts * NSEC_PER_USEC; + return ts * NSEC_PER_MSEC; } @@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct 
request_sock_ops *ops, treq->af_specific = af_ops; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; + treq->req_usec_ts = -1; + #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 805f8341064f..b961364b4961 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: - if (!tp->repair) + if (!tp->repair) { err = -EPERM; - else - WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false)); + break; + } + /* val is an opaque field, + * and low order bit contains usec_ts enable bit. + * Its a best effort, and we do not care if user makes an error. + */ + tp->tcp_usec_ts = val & 1; + WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset); + val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); + if (tp->tcp_usec_ts) + val |= 1; + else + val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5666f6137167..18b858597af4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -698,6 +698,8 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) u32 delta, delta_us; delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; + if (tp->tcp_usec_ts) + return delta; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) @@ -2452,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); } /* 
Nothing was retransmitted or returned timestamp is less @@ -7045,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; + tcp_rsk(req)->req_usec_ts = -1; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cdd65cc594bc..7583d4e34c8c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = NULL; goto failure; } + tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst); /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a9fdba897a28..ace806c5bd0c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; + tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; @@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { + newtp->tcp_usec_ts = treq->req_usec_ts; newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { + newtp->tcp_usec_ts = 0; newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { - newtp->undo_marker = treq->snt_isn; - newtp->retrans_stamp = div_u64(treq->snt_synack, - USEC_PER_SEC / TCP_TS_HZ); newtp->total_rto = req->num_timeout; + newtp->undo_marker = treq->snt_isn; + if (newtp->tcp_usec_ts) { + 
newtp->retrans_stamp = treq->snt_synack; + newtp->total_rto_time = (u32)(tcp_clock_us() - + newtp->retrans_stamp) / USEC_PER_MSEC; + } else { + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + newtp->total_rto_time = tcp_clock_ms() - + newtp->retrans_stamp; + } newtp->total_rto_recoveries = 1; - newtp->total_rto_time = tcp_clock_ms() - - newtp->retrans_stamp; } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a1fec8be9ac3..2866ccbccde0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset; + opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off; + opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) /* Save stamp of the first (attempted) retransmit. 
*/ if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb); + tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; @@ -3665,6 +3667,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); + if (tcp_rsk(req)->req_usec_ts < 0) + tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index bfcf3fe44c72..1f9f6c1c196b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -26,14 +26,18 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 elapsed, start_ts, user_timeout; + const struct tcp_sock *tp = tcp_sk(sk); + u32 elapsed, user_timeout; s32 remaining; - start_ts = tcp_sk(sk)->retrans_stamp; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; - elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts; + + elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp; + if (tp->tcp_usec_ts) + elapsed /= USEC_PER_MSEC; + remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - unsigned int start_ts; + struct tcp_sock *tp = tcp_sk(sk); + unsigned int start_ts, delta; if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tp->retrans_stamp; if (likely(timeout == 0)) { unsigned int rto_base = TCP_RTO_MIN; @@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk, timeout = tcp_model_timeout(sk, boundary, rto_base); } - return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0; 
+ if (tp->tcp_usec_ts) { + /* delta maybe off up to a jiffy due to timer granularity. */ + delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1); + return (s32)(delta - timeout * USEC_PER_MSEC) >= 0; + } + return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ @@ -468,20 +478,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + u32 rtx_delta) { const struct tcp_sock *tp = tcp_sk(sk); const int timeout = TCP_RTO_MAX * 2; - u32 rcv_delta, rtx_delta; + u32 rcv_delta; rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; - rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) - - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb))); - - return rtx_delta > timeout; + return msecs_to_jiffies(rtx_delta) > timeout; } /** @@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; - rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)); + rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; + if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), @@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk) rtx_delta); } #endif - if (tcp_rtx_probe0_timed_out(sk, skb)) { + if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) { tcp_write_err(sk); goto out; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1ee6517e9b2f..0c8a14ba104f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -286,6 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
goto failure; } + tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { -- cgit v1.2.3 From a77a0f5c7f23a8a4981a2a3ff47baa91ceaf1f53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2023 12:57:48 +0000 Subject: tcp: add TCPI_OPT_USEC_TS Add the ability to report in tcp_info.tcpi_options if a flow is using usec resolution in TCP TS val. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index d1d08da6331a..8aa3916e14f6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +#define TCPI_OPT_USEC_TS 64 /* usec timestamps */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b961364b4961..a86d8200a1e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3760,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) + info->tcpi_options |= TCPI_OPT_USEC_TS; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, -- cgit v1.2.3