From 200ecef67b8d09d16ec55f91c92751dcc7a38d40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:51:18 -0800 Subject: tcp: Remove one extra ktime_get_ns() from cookie_init_timestamp tcp_make_synack() already uses tcp_clock_ns(), and can pass the value to cookie_init_timestamp() to avoid another call to ktime_get_ns() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index ab4eb5eb5d07..36f195fb576a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -537,7 +537,7 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); -u64 cookie_init_timestamp(struct request_sock *req); +u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, @@ -757,10 +757,16 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } +/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ +static inline u32 tcp_ns_to_ts(u64 ns) +{ + return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); +} + /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { - return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); @@ -772,7 +778,7 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { - return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ -- cgit v1.2.3 From 04d26e7b159a396372646a480f4caa166d1b6720 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:36 +0100 Subject: tcp: fix rejected syncookies due to stale timestamps If no synflood happens for a long enough period of time, then the synflood timestamp isn't refreshed and jiffies can advance so much that time_after32() can't accurately compare them any more. Therefore, we can end up in a situation where time_after32(now, last_overflow + HZ) returns false, just because these two values are too far apart. In that case, the synflood timestamp isn't updated as it should be, which can trick tcp_synq_no_recent_overflow() into rejecting valid syncookies. For example, let's consider the following scenario on a system with HZ=1000: * The synflood timestamp is 0, either because that's the timestamp of the last synflood or, more commonly, because we're working with a freshly created socket. * We receive a new SYN, which triggers synflood protection. Let's say that this happens when jiffies == 2147484649 (that is, 'synflood timestamp' + HZ + 2^31 + 1). * Then tcp_synq_overflow() doesn't update the synflood timestamp, because time_after32(2147484649, 1000) returns false. With: - 2147484649: the value of jiffies, aka. 'now'. - 1000: the value of 'last_overflow' + HZ. * A bit later, we receive the ACK completing the 3WHS. But cookie_v[46]_check() rejects it because tcp_synq_no_recent_overflow() says that we're not under synflood. That's because time_after32(2147484649, 120000) returns false. With: - 2147484649: the value of jiffies, aka. 'now'. - 120000: the value of 'last_overflow' + TCP_SYNCOOKIE_VALID. Of course, in reality jiffies would have increased a bit, but this condition will last for the next 119 seconds, which is far enough to accommodate for jiffie's growth. Fix this by updating the overflow timestamp whenever jiffies isn't within the [last_overflow, last_overflow + HZ] range. That shouldn't have any performance impact since the update still happens at most once per second. Now we're guaranteed to have fresh timestamps while under synflood, so tcp_synq_no_recent_overflow() can safely use it with time_after32() in such situations. Stale timestamps can still make tcp_synq_no_recent_overflow() return the wrong verdict when not under synflood. This will be handled in the next patch. For 64 bits architectures, the problem was introduced with the conversion of ->tw_ts_recent_stamp to 32 bits integer by commit cca9bab1b72c ("tcp: use monotonic timestamps for PAWS"). The problem has always been there on 32 bits architectures. Fixes: cca9bab1b72c ("tcp: use monotonic timestamps for PAWS") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/time.h | 13 +++++++++++++ include/net/tcp.h | 5 +++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/linux/time.h b/include/linux/time.h index 0760a4f5a15c..8e10b9dbd8c2 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -97,4 +97,17 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) */ #define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) #define time_before32(b, a) time_after32(a, b) + +/** + * time_between32 - check if a 32-bit timestamp is within a given time range + * @t: the time which may be within [l,h] + * @l: the lower bound of the range + * @h: the higher bound of the range + * + * time_before32(t, l, h) returns true if @l <= @t <= @h. All operands are + * treated as 32-bit integers. + * + * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). + */ +#define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 36f195fb576a..7d734ba391fc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -494,14 +494,15 @@ static inline void tcp_synq_overflow(const struct sock *sk) reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); - if (time_after32(now, last_overflow + HZ)) + if (!time_between32(now, last_overflow, + last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - if (time_after32(now, last_overflow + HZ)) + if (!time_between32(now, last_overflow, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } -- cgit v1.2.3 From cb44a08f8647fd2e8db5cc9ac27cd8355fa392d8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:43 +0100 Subject: tcp: tighten acceptance of ACKs not matching a child socket When no synflood occurs, the synflood timestamp isn't updated. Therefore it can be so old that time_after32() can consider it to be in the future. That's a problem for tcp_synq_no_recent_overflow() as it may report that a recent overflow occurred while, in fact, it's just that jiffies has grown past 'last_overflow' + TCP_SYNCOOKIE_VALID + 2^31. Spurious detection of recent overflows lead to extra syncookie verification in cookie_v[46]_check(). At that point, the verification should fail and the packet dropped. But we should have dropped the packet earlier as we didn't even send a syncookie. Let's refine tcp_synq_no_recent_overflow() to report a recent overflow only if jiffies is within the [last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval. This way, no spurious recent overflow is reported when jiffies wraps and 'last_overflow' becomes in the future from the point of view of time_after32(). However, if jiffies wraps and enters the [last_overflow, last_overflow + TCP_SYNCOOKIE_VALID] interval (with 'last_overflow' being a stale synflood timestamp), then tcp_synq_no_recent_overflow() still erroneously reports an overflow. In such cases, we have to rely on syncookie verification to drop the packet. We unfortunately have no way to differentiate between a fresh and a stale syncookie timestamp. In practice, using last_overflow as lower bound is problematic. If the synflood timestamp is concurrently updated between the time we read jiffies and the moment we store the timestamp in 'last_overflow', then 'now' becomes smaller than 'last_overflow' and tcp_synq_no_recent_overflow() returns true, potentially dropping a valid syncookie. Reading jiffies after loading the timestamp could fix the problem, but that'd require a memory barrier. Let's just accommodate for potential timestamp growth instead and extend the interval using 'last_overflow - HZ' as lower bound. Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7d734ba391fc..43e04e14c41e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -518,13 +518,23 @@ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); - return time_after32(now, last_overflow + - TCP_SYNCOOKIE_VALID); + return !time_between32(now, last_overflow - HZ, + last_overflow + + TCP_SYNCOOKIE_VALID); } } last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); + + /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, + * then we're under synflood. However, we have to use + * 'last_overflow - HZ' as lower bound. That's because a concurrent + * tcp_synq_overflow() could update .ts_recent_stamp after we read + * jiffies but before we store .ts_recent_stamp into last_overflow, + * which could lead to rejecting a valid syncookie. + */ + return !time_between32(now, last_overflow - HZ, + last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) -- cgit v1.2.3 From 721c8dafad26ccfa90ff659ee19755e3377b829d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 6 Dec 2019 12:38:49 +0100 Subject: tcp: Protect accesses to .ts_recent_stamp with {READ,WRITE}_ONCE() Syncookies borrow the ->rx_opt.ts_recent_stamp field to store the timestamp of the last synflood. Protect them with READ_ONCE() and WRITE_ONCE() since reads and writes aren't serialised. Use of .rx_opt.ts_recent_stamp for storing the synflood timestamp was introduced by a0f82f64e269 ("syncookies: remove last_synq_overflow from struct tcp_sock"). But unprotected accesses were already there when timestamp was stored in .last_synq_overflow. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 43e04e14c41e..86b9a8766648 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -501,9 +501,9 @@ static inline void tcp_synq_overflow(const struct sock *sk) } } - last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) - tcp_sk(sk)->rx_opt.ts_recent_stamp = now; + WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ @@ -524,7 +524,7 @@ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) } } - last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use -- cgit v1.2.3 From ee2aabd3fc2eef4c1a0ebdadccc76fbff74b94fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Dec 2019 12:55:30 -0800 Subject: tcp: refine tcp_write_queue_empty() implementation Due to how tcp_sendmsg() is implemented, we can have an empty skb at the tail of the write queue. Most [1] tcp_write_queue_empty() callers want to know if there is anything to send (payload and/or FIN) Instead of checking if the sk_write_queue is empty, we need to test if tp->write_seq == tp->snd_nxt [1] tcp_send_fin() was the only caller that expected to see if an skb was in the write queue, I have changed the code to reuse the tcp_write_queue_tail() result. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 11 ++++++++++- net/ipv4/tcp_output.c | 5 +++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/net/tcp.h') diff --git a/include/net/tcp.h b/include/net/tcp.h index 86b9a8766648..e460ea7f767b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1766,9 +1766,18 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } +/** + * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue + * @sk: socket + * + * Since the write queue can have a temporary empty skb in it, + * we must not use "return skb_queue_empty(&sk->sk_write_queue)" + */ static inline bool tcp_write_queue_empty(const struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 57f434a8e41f..36902d08473e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3129,7 +3129,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) */ void tcp_send_fin(struct sock *sk) { - struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and @@ -3137,6 +3137,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ + tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); @@ -3144,7 +3145,7 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (tcp_write_queue_empty(sk)) { + if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have -- cgit v1.2.3