From df19a6267705456f463871ae2aabc44299909d2a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 28 Aug 2009 23:48:54 -0700
Subject: tcp: keepalive cleanups

Introduce keepalive_probes(tp) helper, and use it, like
keepalive_time_when(tp) and keepalive_intvl_when(tp)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b144a26359bc..c520fb6e06d9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -499,8 +499,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
-		     (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
+		if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
-- 
cgit v1.2.3


From f1ecd5d9e7366609d640ff4040304ea197fbc618 Mon Sep 17 00:00:00 2001
From: Damian Lukowski <damian@tvk.rwth-aachen.de>
Date: Wed, 26 Aug 2009 00:16:31 +0000
Subject: Revert Backoff [v3]: Revert RTO on ICMP destination unreachable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, an ICMP host/network unreachable message, whose payload fits to
TCP's SND.UNA, is taken as an indication that the RTO retransmission has
not been lost due to congestion, but because of a route failure
somewhere along the path.
With true congestion, a router won't trigger such a message and the
patched TCP will operate as standard TCP.

This patch reverts one RTO backoff, if an ICMP host/network unreachable
message, whose payload fits to TCP's SND.UNA, arrives.
Based on the new RTO, the retransmission timer is reset to reflect the
remaining time, or - if the revert clocked out the timer - a retransmission
is sent out immediately.
Backoffs are only reverted, if TCP is in RTO loss recovery, i.e. if
there have been retransmissions and reversible backoffs, already.

Changes from v2:
1) Renaming of skb in tcp_v4_err() moved to another patch.
2) Reintroduced tcp_bound_rto() and __tcp_set_rto().
3) Fixed code comments.

Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 12 ++++++++++++
 net/ipv4/tcp_input.c |  5 ++---
 net/ipv4/tcp_ipv4.c  | 37 +++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_timer.c |  2 +-
 4 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cbb2a4889fc9..54f212ce8aaf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -469,6 +469,7 @@ extern void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 				      int nonagle);
 extern int tcp_may_send_now(struct sock *sk);
 extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+extern void tcp_retransmit_timer(struct sock *sk);
 extern void tcp_xmit_retransmit_queue(struct sock *);
 extern void tcp_simple_retransmit(struct sock *);
 extern int tcp_trim_head(struct sock *, struct sk_buff *, u32);
@@ -521,6 +522,17 @@ extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
 
+static inline void tcp_bound_rto(const struct sock *sk)
+{
+	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
+}
+
+static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
+{
+	return (tp->srtt >> 3) + tp->rttvar;
+}
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2bdb0da237e6..af6d6fa00db1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -685,7 +685,7 @@ static inline void tcp_set_rto(struct sock *sk)
 	 *    is invisible. Actually, Linux-2.4 also generates erratic
 	 *    ACKs in some circumstances.
 	 */
-	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
+	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
 
 	/* 2. Fixups made earlier cannot be right.
 	 *    If we do not estimate RTO correctly without them,
@@ -696,8 +696,7 @@ static inline void tcp_set_rto(struct sock *sk)
 	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
 	 * guarantees that rto is higher.
 	 */
-	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
-		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
+	tcp_bound_rto(sk);
 }
 
 /* Save metrics learned by this TCP session.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6ca1bc8c3025..6755e29a6dd3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -332,12 +332,15 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 {
 	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
+	struct inet_connection_sock *icsk;
 	struct tcp_sock *tp;
 	struct inet_sock *inet;
 	const int type = icmp_hdr(icmp_skb)->type;
 	const int code = icmp_hdr(icmp_skb)->code;
 	struct sock *sk;
+	struct sk_buff *skb;
 	__u32 seq;
+	__u32 remaining;
 	int err;
 	struct net *net = dev_net(icmp_skb->dev);
 
@@ -367,6 +370,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	if (sk->sk_state == TCP_CLOSE)
 		goto out;
 
+	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
 	seq = ntohl(th->seq);
 	if (sk->sk_state != TCP_LISTEN &&
@@ -393,6 +397,39 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		}
 
 		err = icmp_err_convert[code].errno;
+		/* check if icmp_skb allows revert of backoff
+		 * (see draft-zimmermann-tcp-lcd) */
+		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
+			break;
+		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
+		    !icsk->icsk_backoff)
+			break;
+
+		icsk->icsk_backoff--;
+		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
+					 icsk->icsk_backoff;
+		tcp_bound_rto(sk);
+
+		skb = tcp_write_queue_head(sk);
+		BUG_ON(!skb);
+
+		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
+				tcp_time_stamp - TCP_SKB_CB(skb)->when);
+
+		if (remaining) {
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  remaining, TCP_RTO_MAX);
+		} else if (sock_owned_by_user(sk)) {
+			/* RTO revert clocked out retransmission,
+			 * but socket is locked. Will defer. */
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  HZ/20, TCP_RTO_MAX);
+		} else {
+			/* RTO revert clocked out retransmission.
+			 * Will retransmit now */
+			tcp_retransmit_timer(sk);
+		}
+
 		break;
 	case ICMP_TIME_EXCEEDED:
 		err = EHOSTUNREACH;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c520fb6e06d9..408fa4b7b9ba 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -279,7 +279,7 @@ static void tcp_probe_timer(struct sock *sk)
  *	The TCP retransmit timer.
  */
 
-static void tcp_retransmit_timer(struct sock *sk)
+void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-- 
cgit v1.2.3


From 6fa12c85031485dff38ce550c24f10da23b0adaa Mon Sep 17 00:00:00 2001
From: Damian Lukowski <damian@tvk.rwth-aachen.de>
Date: Wed, 26 Aug 2009 00:16:34 +0000
Subject: Revert Backoff [v3]: Calculate TCP's connection close threshold as a
 time value.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RFC 1122 specifies two threshold values R1 and R2 for connection timeouts,
which may represent a number of allowed retransmissions or a timeout value.
Currently linux uses sysctl_tcp_retries{1,2} to specify the thresholds
in number of allowed retransmissions.

For any desired threshold R2 (by means of time) one can specify tcp_retries2
(by means of number of retransmissions) such that TCP will not time out
earlier than R2. This is the case, because the RTO schedule follows a fixed
pattern, namely exponential backoff.

However, the RTO behaviour is not predictable any more if RTO backoffs can be
reverted, as it is the case in the draft
"Make TCP more Robust to Long Connectivity Disruptions"
(http://tools.ietf.org/html/draft-zimmermann-tcp-lcd).

In the worst case TCP would time out a connection after 3.2 seconds, if the
initial RTO equaled MIN_RTO and each backoff has been reverted.

This patch introduces a function retransmits_timed_out(N),
which calculates the timeout of a TCP connection, assuming an initial
RTO of MIN_RTO and N unsuccessful, exponentially backed-off retransmissions.

Whenever timeout decisions are made by comparing the retransmission counter
to some value N, this function can be used, instead.

The meaning of tcp_retries2 will be changed, as many more RTO retransmissions
can occur than the value indicates. However, it yields a timeout which is
similar to the one of an unpatched, exponentially backing off TCP in the same
scenario. As no application could rely on an RTO greater than MIN_RTO, there
should be no risk of a regression.

Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 18 ++++++++++++++++++
 net/ipv4/tcp_timer.c | 11 +++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'net/ipv4/tcp_timer.c')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 54f212ce8aaf..e5319495f15e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1252,6 +1252,24 @@ static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_bu
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
 	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
+static inline bool retransmits_timed_out(const struct sock *sk,
+					 unsigned int boundary)
+{
+	int limit, K;
+	if (!inet_csk(sk)->icsk_retransmits)
+		return false;
+
+	K = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
+
+	if (boundary <= K)
+		limit = ((2 << boundary) - 1) * TCP_RTO_MIN;
+	else
+		limit = ((2 << K) - 1) * TCP_RTO_MIN +
+			(boundary - K) * TCP_RTO_MAX;
+
+	return (tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >= limit;
+}
+
 static inline struct sk_buff *tcp_send_head(struct sock *sk)
 {
 	return sk->sk_send_head;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 408fa4b7b9ba..cdb2ca7684d4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -137,13 +137,14 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
+	bool do_reset;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
 			dst_negative_advice(&sk->sk_dst_cache);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -155,13 +156,15 @@ static int tcp_write_timeout(struct sock *sk)
 			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
 
 			retry_until = tcp_orphan_retries(sk, alive);
+			do_reset = alive ||
+				   !retransmits_timed_out(sk, retry_until);
 
-			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
+			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
 	}
 
-	if (icsk->icsk_retransmits >= retry_until) {
+	if (retransmits_timed_out(sk, retry_until)) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -385,7 +388,7 @@ void tcp_retransmit_timer(struct sock *sk)
 out_reset_timer:
 	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
 		__sk_dst_reset(sk);
 
 out:;
-- 
cgit v1.2.3