summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig52
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/arp.c11
-rw-r--r--net/ipv4/cipso_ipv4.c9
-rw-r--r--net/ipv4/devinet.c3
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_semantics.c5
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/ip_fragment.c3
-rw-r--r--net/ipv4/ip_gre.c5
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/tcp.c62
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cong.c21
-rw-r--r--net/ipv4/tcp_cubic.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_input.c207
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c95
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_scalable.c12
-rw-r--r--net/ipv4/tcp_timer.c23
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_yeah.c9
-rw-r--r--net/ipv4/udp.c2
27 files changed, 289 insertions, 301 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a359..b2cf91e4ccaa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
+ and
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
If unsure, say N here.
-choice
+choice
prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
depends on IP_ADVANCED_ROUTER
default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
config ASK_IP_FIB_HASH
bool "FIB_HASH"
---help---
- Current FIB is very proven and good enough for most users.
+ Current FIB is very proven and good enough for most users.
config IP_FIB_TRIE
bool "FIB_TRIE"
---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
+ Use new experimental LC-trie as FIB lookup algorithm.
+ This improves lookup performance if you have a large
+ number of routes.
+
+ LC-trie is a longest matching prefix lookup algorithm which
+ performs better than FIB_HASH for large routing tables.
+ But, it consumes more memory and is more complex.
+
+ LC-trie is described in:
+
+ IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
+ June 1999
+
+ An experimental study of compression methods for dynamic tries
+ Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+
endchoice
config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
<file:Documentation/filesystems/nfsroot.txt> for details.
# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
---help---
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
-
+
If unsure, say Y.
config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 627be4dc7fb0..d5aaabbb7cb3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1500,7 +1500,7 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
+static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 3f6b7354699b..f11931c18381 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
* cache.
*/
- /* Special case: IPv4 duplicate address detection packet (RFC2131) */
- if (sip == 0) {
+ /*
+ * Special case: IPv4 duplicate address detection packet (RFC2131)
+ * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
+ */
+ if (sip == 0 || tip == sip) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type(net, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
out:
if (in_dev)
in_dev_put(in_dev);
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -1225,7 +1228,7 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
+static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 6bb2635b5ded..7bc992976d29 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,11 +3,16 @@
*
* This is an implementation of the CIPSO 2.2 protocol as specified in
* draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory. While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
* have chosen to adopt the protocol and over the years it has become a
* de-facto standard for labeled networking.
*
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
* Author: Paul Moore <paul.moore@hp.com>
*
*/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d519a6a66726..126bb911880f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e474..cafcc49d0993 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
+ if (rpf == 1)
goto e_inval;
fl.oif = dev->ifindex;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc73..f831df500907 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
+ rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 382800a62b31..3f50807237e0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1207,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
int __init icmp_init(void)
{
- return register_pernet_device(&icmp_sk_ops);
+ return register_pernet_subsys(&icmp_sk_ops);
}
EXPORT_SYMBOL(icmp_err_convert);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786e..eaf3e2c8646a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
+ __releases(&f->lock)
{
struct inet_frag_queue *q;
struct hlist_node *n;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000eeb..7985346653bd 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph = ip_hdr(head);
iph->frag_off = 0;
iph->tot_len = htons(len);
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 07a188afb3ac..e62510d5ea5a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -491,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -803,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
#endif
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5079dfbc6f38..9054139795af 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
@@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = {
.priority = 1,
};
-static char banner[] __initdata =
+static const char banner[] __initconst =
KERN_INFO "IPv4 over IPv4 tunneling driver\n";
static void ipip_destroy_tunnels(struct ipip_net *ipn)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90b2f3c192ff..2451aeb5ac23 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
return NULL;
}
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 xmit_size_goal, old_size_goal;
+
+ xmit_size_goal = mss_now;
+
+ if (large_allowed && sk_can_gso(sk)) {
+ xmit_size_goal = ((sk->sk_gso_max_size - 1) -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
+
+ xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
+
+ /* We try hard to avoid divides here */
+ old_size_goal = tp->xmit_size_goal_segs * mss_now;
+
+ if (likely(old_size_goal <= xmit_size_goal &&
+ old_size_goal + mss_now > xmit_size_goal)) {
+ xmit_size_goal = old_size_goal;
+ } else {
+ tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
+ }
+ }
+
+ return max(xmit_size_goal, mss_now);
+}
+
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
+
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+
+ return mss_now;
+}
+
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
size_t psize, int flags)
{
@@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
@@ -761,8 +801,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
@@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (--iovlen >= 0) {
int seglen = iov->iov_len;
@@ -1007,8 +1045,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
@@ -1045,8 +1082,7 @@ out_err:
*/
static int tcp_recv_urg(struct sock *sk, long timeo,
- struct msghdr *msg, int len, int flags,
- int *addr_len)
+ struct msghdr *msg, int len, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1661,7 +1697,7 @@ out:
return err;
recv_urg:
- err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ err = tcp_recv_urg(sk, timeo, msg, len, flags);
goto out;
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 7eb7636db0d0..3b53fd1af23f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4ec5b4e97c4e..e92beb9e55e0 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp)
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+{
+ if (tp->snd_cwnd_cnt >= w) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else {
+ tp->snd_cwnd_cnt++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
@@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tp->snd_cwnd++;
}
} else {
- /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ee467ec40c4f..71d5f2f29fa6 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
} else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 937549b8a921..26d5c7fc7de5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
return;
/* achieved throughput calculations */
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- icsk->icsk_ca_state != TCP_CA_Disorder) {
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
ca->packetcount = 0;
ca->lasttime = now;
return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d75c7ea..2bc8e27a163d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -64,6 +64,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/kernel.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
@@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk)
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (after(received_upto, ack_seq) &&
- (tcp_is_fack(tp) ||
- !before(received_upto,
- ack_seq + tp->reordering * tp->mss_cache))) {
+ /* TODO: We would like to get rid of tcp_is_fack(tp) only
+ * constraint here (see above) but figuring out that at
+ * least tp->reordering SACK blocks reside between ack_seq
+ * and received_upto is not easy task to do cheaply with
+ * the available datastructures.
+ *
+ * Whether FACK should check here for tp->reordering segs
+ * in-between one could argue for either way (it would be
+ * rather simple to implement as we could count fack_count
+ * during the walk and do tp->fackets_out - fack_count).
+ */
+ if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1374,7 +1383,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss)
+ unsigned int pcount, int shifted, int mss,
+ int dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1410,7 +1420,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
/* We discard results */
- tcp_sacktag_one(skb, sk, state, 0, pcount);
+ tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1561,7 +1571,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!skb_shift(prev, skb, len))
goto fallback;
- if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
@@ -1580,7 +1590,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
len = skb->len;
if (skb_shift(prev, skb, len)) {
pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
+ tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
}
out:
@@ -1793,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
for (i = used_sacks - 1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
- struct tcp_sack_block tmp;
-
- tmp = sp[j];
- sp[j] = sp[j + 1];
- sp[j + 1] = tmp;
+ swap(sp[j], sp[j + 1]);
/* Track where the first SACK block goes to */
if (j == first_sack_index)
@@ -2452,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk)
return 0;
}
+/* New heuristics: it is possible only after we switched to restart timer
+ * each time when something is ACKed. Hence, we can detect timed out packets
+ * during fast retransmit without falling to slow start.
+ *
+ * Usefulness of this as is very questionable, since we should know which of
+ * the segments is the next to timeout which is relatively expensive to find
+ * in general case unless we add some data structure just for that. The
+ * current approach certainly won't find the right one too often and when it
+ * finally does find _something_ it usually marks large part of the window
+ * right away (because a retransmission with a larger timestamp blocks the
+ * loop from advancing). -ij
+ */
+static void tcp_timeout_skbs(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
+ return;
+
+ skb = tp->scoreboard_skb_hint;
+ if (tp->scoreboard_skb_hint == NULL)
+ skb = tcp_write_queue_head(sk);
+
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ tcp_skb_mark_lost(tp, skb);
+ }
+
+ tp->scoreboard_skb_hint = skb;
+
+ tcp_verify_left_out(tp);
+}
+
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
@@ -2524,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
tcp_mark_head_lost(sk, sacked_upto);
}
- /* New heuristics: it is possible only after we switched
- * to restart timer each time when something is ACKed.
- * Hence, we can detect timed out packets during fast
- * retransmit without falling to slow start.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
- struct sk_buff *skb;
-
- skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
- : tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
-
- tp->scoreboard_skb_hint = skb;
-
- tcp_verify_left_out(tp);
- }
+ tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2812,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2840,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
u32 prior_lost = tp->lost_out;
tcp_for_write_queue(skb, sk) {
@@ -3177,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u32 end_seq;
u32 acked_pcount;
u8 sacked = scb->sacked;
@@ -3192,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
break;
fully_acked = 0;
- end_seq = tp->snd_una;
} else {
acked_pcount = tcp_skb_pcount(skb);
- end_seq = scb->end_seq;
- }
-
- /* MTU probing checks */
- if (fully_acked && icsk->icsk_mtup.probe_size &&
- !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
- tcp_mtup_probe_success(sk, skb);
}
if (sacked & TCPCB_RETRANS) {
@@ -3266,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
+ }
+
tcp_ack_update_rtt(sk, flag, seq_rtt);
tcp_rearm_rto(sk);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
+ int delta;
+
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
- /* No need to care for underflows here because
- * the lost_skb_hint gets NULLed if we're past it
- * (or something non-trivial happened)
- */
- if (tcp_is_fack(tp))
- tp->lost_cnt_hint -= pkts_acked;
- else
- tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
+ delta = tcp_is_fack(tp) ? pkts_acked :
+ prior_sacked - tp->sacked_out;
+ tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3395,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
@@ -3571,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
int prior_packets;
int frto_cwnd = 0;
- /* If the ack is newer than sent or older than previous acks
+ /* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (after(ack, tp->snd_nxt))
- goto uninteresting_ack;
-
if (before(ack, prior_snd_una))
goto old_ack;
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
+ if (after(ack, tp->snd_nxt))
+ goto invalid_ack;
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3600,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
@@ -3669,6 +3686,10 @@ no_queue:
tcp_ack_probe(sk);
return 1;
+invalid_ack:
+ SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return -1;
+
old_ack:
if (TCP_SKB_CB(skb)->sacked) {
tcp_sacktag_write_queue(sk, skb, prior_snd_una);
@@ -3676,8 +3697,7 @@ old_ack:
tcp_try_keep_open(sk);
}
-uninteresting_ack:
- SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
@@ -3865,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* Not only, also it occurs for expired timestamps.
*/
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ if (tcp_paws_check(&tp->rx_opt, 0))
tcp_store_ts_recent(tp);
}
}
@@ -3918,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(sk, skb));
+
+ return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
+ !tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
@@ -4078,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
}
}
@@ -4133,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
sp[i] = sp[i + 1];
continue;
@@ -4143,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
}
}
-static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
- struct tcp_sack_block *sack2)
-{
- __u32 tmp;
-
- tmp = sack1->start_seq;
- sack1->start_seq = sack2->start_seq;
- sack2->start_seq = tmp;
-
- tmp = sack1->end_seq;
- sack1->end_seq = sack2->end_seq;
- sack2->end_seq = tmp;
-}
-
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4171,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
if (tcp_sack_extend(sp, seq, end_seq)) {
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
- tcp_sack_swap(sp, sp - 1);
+ swap(*sp, *(sp - 1));
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
@@ -4197,7 +4199,6 @@ new_sack:
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
}
/* RCV.NXT advances, some SACKs should be eaten. */
@@ -4211,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
}
@@ -4232,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
this_sack++;
sp++;
}
- if (num_sacks != tp->rx_opt.num_sacks) {
- tp->rx_opt.num_sacks = num_sacks;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
- }
+ tp->rx_opt.num_sacks = num_sacks;
}
/* This one checks to see if we can put data from the
@@ -4312,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
TCP_ECN_accept_cwr(tp, skb);
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
@@ -4434,8 +4427,6 @@ drop:
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = 1;
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
@@ -5156,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
@@ -5309,8 +5301,8 @@ slow_path:
return -res;
step5:
- if (th->ack)
- tcp_ack(sk, skb, FLAG_SLOWPATH);
+ if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5408,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5509,7 +5501,7 @@ discard:
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
- tcp_paws_check(&tp->rx_opt, 0))
+ tcp_paws_reject(&tp->rx_opt, 0))
goto discard_and_undo;
if (th->syn) {
@@ -5647,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* step 5: check the ACK field */
if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -5669,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
- TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6b962f56ab4..d0a314879d81 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
- if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
- /* Some OSes (unknown ones, but I see them on web server, which
- * contains information interesting only for windows'
- * users) do not send their stamp in SYN. It is easy case.
- * We simply do not advertise TS support.
- */
- tmp_opt.saw_tstamp = 0;
- tmp_opt.tstamp_ok = 0;
- }
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
@@ -2443,7 +2434,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
inet_hashinfo_init(&tcp_hashinfo);
- if (register_pernet_device(&tcp_sk_ops))
+ if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f67effbb102b..43bbba7926ee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_prequeue_init(newtp);
- tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
+ tcp_init_wl(newtp, treq->rcv_isn);
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
@@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.saw_tstamp = 0;
newtp->rx_opt.dsack = 0;
- newtp->rx_opt.eff_sacks = 0;
-
newtp->rx_opt.num_sacks = 0;
+
newtp->urg_data = 0;
if (sock_flag(newsk, SOCK_KEEPOPEN))
@@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* from another data.
*/
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dda42f0bd7a3..c1f259d2d33b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(sp[this_sack].end_seq);
}
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
}
}
@@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned size = 0;
+ unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
size += TCPOLEN_TSTAMP_ALIGNED;
}
- if (unlikely(tp->rx_opt.eff_sacks)) {
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, tp->rx_opt.eff_sacks,
+ min_t(unsigned, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
- if (unlikely(tcp_urg_mode(tp) &&
- between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
- th->urg_ptr = htons(tp->snd_up - tcb->seq);
- th->urg = 1;
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = 0xFFFF;
+ th->urg = 1;
+ }
}
tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
@@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
struct sk_buff *buff;
int nsize, old_factor;
int nlen;
- u16 flags;
+ u8 flags;
BUG_ON(len > skb->len);
- tcp_clear_retrans_hints_partial(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_verify_left_out(tp);
}
tcp_adjust_fackets_out(sk, skb, diff);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
+ tp->lost_cnt_hint -= diff;
}
/* Link BUFF into the send queue. */
@@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
return 0;
}
@@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-/* Bound MSS / TSO packet size with the half of the window */
-static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
-{
- if (tp->max_window && pktsize > (tp->max_window >> 1))
- return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
- else
- return pktsize;
-}
-
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
+unsigned int tcp_current_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- u16 xmit_size_goal;
- int doing_tso = 0;
unsigned header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
mss_now = tp->mss_cache;
- if (large_allowed && sk_can_gso(sk))
- doing_tso = 1;
-
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
@@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
mss_now -= delta;
}
- xmit_size_goal = mss_now;
-
- if (doing_tso) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
-
- xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
- xmit_size_goal -= (xmit_size_goal % mss_now);
- }
- tp->xmit_size_goal = xmit_size_goal;
-
return mss_now;
}
@@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk);
return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
tp->nonagle : TCP_NAGLE_PUSH)));
}
@@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
{
struct sk_buff *buff;
int nlen = skb->len - len;
- u16 flags;
+ u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
@@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
if (limit >= sk->sk_gso_max_size)
goto send_now;
+ /* Middle in queue won't get any more data, full sendable already? */
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+ goto send_now;
+
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk)
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
- tp->rx_opt.eff_sacks)
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
/* Very simple search strategy: just double the MSS. */
- mss_now = tcp_current_mss(sk, 0);
+ mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache;
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
@@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
int skb_size, next_skb_size;
- u16 flags;
skb_size = skb->len;
next_skb_size = next_skb->len;
- flags = TCP_SKB_CB(skb)->flags;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
@@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
- cur_mss = tcp_current_mss(sk, 0);
+ cur_mss = tcp_current_mss(sk);
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
@@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (skb->len > cur_mss) {
if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
+ } else {
+ tcp_init_tso_segs(sk, skb, cur_mss);
}
tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -2023,7 +2007,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- /* First pass: retransmit lost packets. */
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
@@ -2062,7 +2045,7 @@ begin_fwd:
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+ if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
@@ -2101,7 +2084,7 @@ void tcp_send_fin(struct sock *sk)
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- mss_now = tcp_current_mss(sk, 1);
+ mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -2326,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk)
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
- tcp_init_wl(tp, tp->write_seq, 0);
+ tcp_init_wl(tp, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
@@ -2513,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk)
if ((skb = tcp_send_head(sk)) != NULL &&
before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 25524d4e372a..59f5b5e7c566 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
static ssize_t tcpprobe_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- int error = 0, cnt = 0;
+ int error = 0;
+ size_t cnt = 0;
- if (!buf || len < 0)
+ if (!buf)
return -EINVAL;
while (cnt < len) {
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 2747ec7bfb63..a76513779e2b 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,6 @@
/* Tom Kelly's Scalable TCP
*
- * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ * See http://www.deneholme.net/tom/scalable/
*
* John Heffner <jheffner@sc.edu>
*/
@@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
- else {
- tp->snd_cwnd_cnt++;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- }
+ else
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0170e914f1b0..b144a26359bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder ||
- icsk->icsk_ca_state == TCP_CA_Recovery) {
- if (tcp_is_sack(tp)) {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- } else {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- }
+ if (icsk->icsk_ca_state == TCP_CA_Disorder) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
+ } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
} else {
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index d08b2e855c22..e9bbff746488 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9ec843a9bbb2..66b6821b984e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else {
/* Reno */
-
- if (tp->snd_cwnd_cnt < tp->snd_cwnd)
- tp->snd_cwnd_cnt++;
-
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4bd178a111d5..05b7abb99f69 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
sk = sknext;
} while (sknext);
} else
- kfree_skb(skb);
+ consume_skb(skb);
spin_unlock(&hslot->lock);
return 0;
}