summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig1
-rw-r--r--net/ipv4/af_inet.c42
-rw-r--r--net/ipv4/ah4.c3
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/esp4.c332
-rw-r--r--net/ipv4/fib_semantics.c62
-rw-r--r--net/ipv4/fib_trie.c42
-rw-r--r--net/ipv4/icmp.c125
-rw-r--r--net/ipv4/inet_connection_sock.c278
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/inet_timewait_sock.c3
-rw-r--r--net/ipv4/ip_output.c11
-rw-r--r--net/ipv4/ip_sockglue.c9
-rw-r--r--net/ipv4/ip_tunnel_core.c10
-rw-r--r--net/ipv4/ipmr.c266
-rw-r--r--net/ipv4/netfilter/arp_tables.c15
-rw-r--r--net/ipv4/netfilter/ip_tables.c21
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c1
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c11
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c15
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c2
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/ping.c9
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c44
-rw-r--r--net/ipv4/syncookies.c21
-rw-r--r--net/ipv4/sysctl_net_ipv4.c110
-rw-r--r--net/ipv4/tcp.c85
-rw-r--r--net/ipv4/tcp_fastopen.c54
-rw-r--r--net/ipv4/tcp_input.c271
-rw-r--r--net/ipv4/tcp_ipv4.c38
-rw-r--r--net/ipv4/tcp_metrics.c8
-rw-r--r--net/ipv4/tcp_minisocks.c22
-rw-r--r--net/ipv4/tcp_output.c132
-rw-r--r--net/ipv4/tcp_recovery.c148
-rw-r--r--net/ipv4/tcp_timer.c5
-rw-r--r--net/ipv4/udp.c117
-rw-r--r--net/ipv4/xfrm4_state.c8
42 files changed, 1423 insertions, 945 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6e7baaf814c6..e30f9caddae8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -187,6 +187,7 @@ config NET_IPGRE_DEMUX
config NET_IP_TUNNEL
tristate
select DST_CACHE
+ select GRO_CELLS
default n
config NET_IPGRE
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f75069883f2b..685ba53df2d1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK &&
+ if (snum && snum < inet_prot_sock(net) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -570,19 +570,30 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* TCP 'magic' in here.
*/
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+ int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
int err;
long timeo;
- if (addr_len < sizeof(uaddr->sa_family))
- return -EINVAL;
+ /*
+ * uaddr can be NULL and addr_len can be 0 if:
+ * sk is a TCP fastopen active socket and
+ * TCP_FASTOPEN_CONNECT sockopt is set and
+ * we already have a valid cookie for this socket.
+ * In this case, user can call write() after connect().
+ * write() will invoke tcp_sendmsg_fastopen() which calls
+ * __inet_stream_connect().
+ */
+ if (uaddr) {
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
- if (uaddr->sa_family == AF_UNSPEC) {
- err = sk->sk_prot->disconnect(sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto out;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
}
switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- err = -EALREADY;
+ if (inet_sk(sk)->defer_connect)
+ err = is_sendmsg ? -EINPROGRESS : -EISCONN;
+ else
+ err = -EALREADY;
/* Fall out of switch with err, set for this state */
break;
case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
+ if (!err && inet_sk(sk)->defer_connect)
+ goto out;
+
/* Just entered SS_CONNECTING state; the only
* difference is that return value in non-blocking
* case is EINPROGRESS, rather than EALREADY.
@@ -662,7 +679,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
lock_sock(sock->sk);
- err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
return err;
}
@@ -1700,6 +1717,9 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
+#ifdef CONFIG_SYSCTL
+ net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
+#endif
return 0;
}
@@ -1831,8 +1851,6 @@ static int __init inet_init(void)
ip_init();
- tcp_v4_init();
-
/* Setup TCP slab cache for open requests. */
tcp_init();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f2a71025a770..22377c8ff14b 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -270,6 +270,9 @@ static void ah_input_done(struct crypto_async_request *base, int err)
int ihl = ip_hdrlen(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ if (err)
+ goto out;
+
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, ihl);
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 4cd2ee8857d2..5d367b7ff542 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -65,8 +65,6 @@
#include <net/net_namespace.h>
#include <net/addrconf.h>
-#include "fib_lookup.h"
-
static struct ipv4_devconf ipv4_devconf = {
.data = {
[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 20fb25e3027b..b1e24446e297 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
#include <net/protocol.h>
#include <net/udp.h>
+#include <linux/highmem.h>
+
struct esp_skb_cb {
struct xfrm_skb_cb xfrm;
void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+ struct crypto_aead *aead = x->data;
+ int extralen = 0;
+ u8 *iv;
+ struct aead_request *req;
+ struct scatterlist *sg;
+
+ if (x->props.flags & XFRM_STATE_ESN)
+ extralen += sizeof(*extra);
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+
+ /* Unref skb_frag_pages in the src scatterlist if necessary.
+ * Skip the first sg which comes from skb->data.
+ */
+ if (req->src != req->dst)
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+ put_page(sg_page(sg));
+}
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
+ void *tmp;
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
- kfree(ESP_SKB_CB(skb)->tmp);
+ tmp = ESP_SKB_CB(skb)->tmp;
+ esp_ssg_unref(x, tmp);
+ kfree(tmp);
xfrm_output_resume(skb, err);
}
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
sizeof(__be32));
}
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+ struct ip_esp_hdr *esph,
+ struct esp_output_extra *extra)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * encryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ }
+
+ esph->spi = x->id.spi;
+
+ return esph;
+}
+
static void esp_output_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -128,18 +182,36 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
+static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+{
+ /* Fill padding... */
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = proto;
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
- int err;
struct esp_output_extra *extra;
+ int err = -ENOMEM;
struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct aead_request *req;
- struct scatterlist *sg;
+ struct scatterlist *sg, *dsg;
struct sk_buff *trailer;
+ struct page *page;
void *tmp;
u8 *iv;
u8 *tail;
+ u8 *vaddr;
int blksize;
int clen;
int alen;
@@ -149,7 +221,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int nfrags;
int assoclen;
int extralen;
+ int tailen;
__be64 seqno;
+ __u8 proto = *skb_mac_header(skb);
/* skb is pure payload to encrypt */
@@ -169,12 +243,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
clen = ALIGN(skb->len + 2 + tfclen, blksize);
plen = clen - skb->len - tfclen;
-
- err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
- if (err < 0)
- goto error;
- nfrags = err;
-
+ tailen = tfclen + plen + alen;
assoclen = sizeof(*esph);
extralen = 0;
@@ -183,35 +252,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
assoclen += sizeof(__be32);
}
- tmp = esp_alloc_tmp(aead, nfrags, extralen);
- if (!tmp) {
- err = -ENOMEM;
- goto error;
- }
-
- extra = esp_tmp_extra(tmp);
- iv = esp_tmp_iv(aead, tmp, extralen);
- req = esp_tmp_req(aead, iv);
- sg = esp_req_sg(aead, req);
-
- /* Fill padding... */
- tail = skb_tail_pointer(trailer);
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = *skb_mac_header(skb);
- pskb_put(skb, trailer, clen - skb->len + alen);
-
- skb_push(skb, -skb_network_offset(skb));
- esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
+ esph = ip_esp_hdr(skb);
/* this is non-NULL only with UDP Encapsulation */
if (x->encap) {
@@ -230,7 +272,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
uh = (struct udphdr *)esph;
uh->source = sport;
uh->dest = dport;
- uh->len = htons(skb->len - skb_transport_offset(skb));
+ uh->len = htons(skb->len + tailen
+ - skb_transport_offset(skb));
uh->check = 0;
switch (encap_type) {
@@ -248,31 +291,148 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
*skb_mac_header(skb) = IPPROTO_UDP;
}
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ if (!skb_cloned(skb)) {
+ if (tailen <= skb_availroom(skb)) {
+ nfrags = 1;
+ trailer = skb;
+ tail = skb_tail_pointer(trailer);
- aead_request_set_callback(req, 0, esp_output_done, skb);
+ goto skip_cow;
+ } else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+ && !skb_has_frag_list(skb)) {
+ int allocsize;
+ struct sock *sk = skb->sk;
+ struct page_frag *pfrag = &x->xfrag;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * encryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- extra->esphoff = (unsigned char *)esph -
- skb_transport_header(skb);
- esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
- extra->seqhi = esph->spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
- aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ goto cow;
+ }
+
+ page = pfrag->page;
+ get_page(page);
+
+ vaddr = kmap_atomic(page);
+
+ tail = vaddr + pfrag->offset;
+
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ kunmap_atomic(vaddr);
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+
+ __skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+ tailen);
+ skb_shinfo(skb)->nr_frags = ++nfrags;
+
+ pfrag->offset = pfrag->offset + allocsize;
+ nfrags++;
+
+ skb->len += tailen;
+ skb->data_len += tailen;
+ skb->truesize += tailen;
+ if (sk)
+ atomic_add(tailen, &sk->sk_wmem_alloc);
+
+ skb_push(skb, -skb_network_offset(skb));
+
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+ esph->spi = x->id.spi;
+
+ tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+ if (!tmp) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = &sg[nfrags];
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+ if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+ spin_unlock_bh(&x->lock);
+ err = -ENOMEM;
+ goto error;
+ }
+
+ skb_shinfo(skb)->nr_frags = 1;
+
+ page = pfrag->page;
+ get_page(page);
+ /* replace page frags in skb with new page */
+ __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+ pfrag->offset = pfrag->offset + allocsize;
+
+ sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+ skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + clen + alen);
+
+ spin_unlock_bh(&x->lock);
+
+ goto skip_cow2;
+ }
}
+cow:
+ err = skb_cow_data(skb, tailen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+ tail = skb_tail_pointer(trailer);
+ esph = ip_esp_hdr(skb);
+
+skip_cow:
+ esp_output_fill_trailer(tail, tfclen, plen, proto);
+
+ pskb_put(skb, trailer, clen - skb->len + alen);
+ skb_push(skb, -skb_network_offset(skb));
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
esph->spi = x->id.spi;
+ tmp = esp_alloc_tmp(aead, nfrags, extralen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
+ req = esp_tmp_req(aead, iv);
+ sg = esp_req_sg(aead, req);
+ dsg = sg;
+
+ esph = esp_output_set_extra(skb, esph, extra);
+
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
(unsigned char *)esph - skb->data,
assoclen + ivlen + clen + alen);
- aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+ if ((x->props.flags & XFRM_STATE_ESN))
+ aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+ else
+ aead_request_set_callback(req, 0, esp_output_done, skb);
+
+ aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
aead_request_set_ad(req, assoclen);
seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +458,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
esp_output_restore_header(skb);
}
+ if (sg != dsg)
+ esp_ssg_unref(x, tmp);
kfree(tmp);
error:
@@ -401,6 +563,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
__skb_pull(skb, 4);
}
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+ /* For ESN we move the header forward by 4 bytes to
+ * accomodate the high bits. We will move it back after
+ * decryption.
+ */
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ esph = (void *)skb_push(skb, 4);
+ *seqhi = esph->spi;
+ esph->spi = esph->seq_no;
+ esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ }
+}
+
static void esp_input_done_esn(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -437,12 +616,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0)
goto out;
- err = skb_cow_data(skb, 0, &trailer);
- if (err < 0)
- goto out;
-
- nfrags = err;
-
assoclen = sizeof(*esph);
seqhilen = 0;
@@ -451,6 +624,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
assoclen += seqhilen;
}
+ if (!skb_cloned(skb)) {
+ if (!skb_is_nonlinear(skb)) {
+ nfrags = 1;
+
+ goto skip_cow;
+ } else if (!skb_has_frag_list(skb)) {
+ nfrags = skb_shinfo(skb)->nr_frags;
+ nfrags++;
+
+ goto skip_cow;
+ }
+ }
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+skip_cow:
err = -ENOMEM;
tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
if (!tmp)
@@ -462,26 +655,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
- skb->ip_summed = CHECKSUM_NONE;
+ esp_input_set_header(skb, seqhi);
- esph = (struct ip_esp_hdr *)skb->data;
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
- aead_request_set_callback(req, 0, esp_input_done, skb);
+ skb->ip_summed = CHECKSUM_NONE;
- /* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
- * decryption.
- */
- if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)skb_push(skb, 4);
- *seqhi = esph->spi;
- esph->spi = esph->seq_no;
- esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+ if ((x->props.flags & XFRM_STATE_ESN))
aead_request_set_callback(req, 0, esp_input_done_esn, skb);
- }
-
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ else
+ aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
aead_request_set_ad(req, assoclen);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9a375b908d01..317026a39cfa 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -471,7 +471,6 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
int ret;
change_nexthops(fi) {
@@ -503,16 +502,14 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_ENCAP);
if (nla) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
struct nlattr *nla_entype;
nla_entype = nla_find(attrs, attrlen,
RTA_ENCAP_TYPE);
if (!nla_entype)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- ret = lwtunnel_build_state(dev, nla_get_u16(
+
+ ret = lwtunnel_build_state(nla_get_u16(
nla_entype),
nla, AF_INET, cfg,
&lwtstate);
@@ -597,21 +594,18 @@ static inline void fib_add_weight(struct fib_info *fi,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(struct net *net, u16 encap_type,
+static int fib_encap_match(u16 encap_type,
struct nlattr *encap,
- int oif, const struct fib_nh *nh,
+ const struct fib_nh *nh,
const struct fib_config *cfg)
{
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
int ret, result = 0;
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- if (oif)
- dev = __dev_get_by_index(net, oif);
- ret = lwtunnel_build_state(dev, encap_type, encap,
+ ret = lwtunnel_build_state(encap_type, encap,
AF_INET, cfg, &lwtstate);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -623,7 +617,6 @@ static int fib_encap_match(struct net *net, u16 encap_type,
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
int remaining;
@@ -634,9 +627,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
if (cfg->fc_oif || cfg->fc_gw) {
if (cfg->fc_encap) {
- if (fib_encap_match(net, cfg->fc_encap_type,
- cfg->fc_encap, cfg->fc_oif,
- fi->fib_nh, cfg))
+ if (fib_encap_match(cfg->fc_encap_type,
+ cfg->fc_encap, fi->fib_nh, cfg))
return 1;
}
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
@@ -1093,13 +1085,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_encap) {
struct lwtunnel_state *lwtstate;
- struct net_device *dev = NULL;
if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
goto err_inval;
- if (cfg->fc_oif)
- dev = __dev_get_by_index(net, cfg->fc_oif);
- err = lwtunnel_build_state(dev, cfg->fc_encap_type,
+ err = lwtunnel_build_state(cfg->fc_encap_type,
cfg->fc_encap, AF_INET, cfg,
&lwtstate);
if (err)
@@ -1366,6 +1355,36 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
return ret;
}
+static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+ enum fib_event_type event_type)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+ struct fib_nh_notifier_info info = {
+ .fib_nh = fib_nh,
+ };
+
+ switch (event_type) {
+ case FIB_EVENT_NH_ADD:
+ if (fib_nh->nh_flags & RTNH_F_DEAD)
+ break;
+ if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN)
+ break;
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ &info.info);
+ case FIB_EVENT_NH_DEL:
+ if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
+ (fib_nh->nh_flags & RTNH_F_DEAD))
+ return call_fib_notifiers(dev_net(fib_nh->nh_dev),
+ event_type, &info.info);
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
@@ -1407,6 +1426,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
+ call_fib_nh_notifiers(nexthop_nh,
+ FIB_EVENT_NH_DEL);
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1437,7 +1458,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
}
/* Must be invoked inside of an RCU protected region. */
-void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
+static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
struct hlist_head *fa_head = res->fa_head;
@@ -1561,6 +1582,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
continue;
alive++;
nexthop_nh->nh_flags &= ~nh_flags;
+ call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
} endfor_nexthops(fi)
if (alive > 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2919d1a10cfd..d8cea210af0e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -124,7 +124,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb,
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id, u32 nlflags)
+ u8 tos, u8 type, u32 tb_id)
{
struct fib_entry_notifier_info info = {
.dst = dst,
@@ -133,7 +133,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
.tos = tos,
.type = type,
.tb_id = tb_id,
- .nlflags = nlflags,
};
return call_fib_notifier(nb, net, event_type, &info.info);
}
@@ -197,7 +196,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id, u32 nlflags)
+ u8 tos, u8 type, u32 tb_id)
{
struct fib_entry_notifier_info info = {
.dst = dst,
@@ -206,7 +205,6 @@ static int call_fib_entry_notifiers(struct net *net,
.tos = tos,
.type = type,
.tb_id = tb_id,
- .nlflags = nlflags,
};
return call_fib_notifiers(net, event_type, &info.info);
}
@@ -1198,6 +1196,7 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg)
{
+ enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1295,6 +1294,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
+
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1303,13 +1309,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
- key, plen, fi,
- new_fa->fa_tos, cfg->fc_type,
- tb->tb_id, cfg->fc_nlflags);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, nlflags);
-
goto succeeded;
}
/* Error if we find a perfect match which
@@ -1319,10 +1318,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND)
+ if (cfg->fc_nlflags & NLM_F_APPEND) {
+ event = FIB_EVENT_ENTRY_APPEND;
nlflags |= NLM_F_APPEND;
- else
+ } else {
fa = fa_first;
+ }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1351,8 +1352,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
- cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
+ call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type,
+ tb->tb_id);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
@@ -1653,8 +1654,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
return -ESRCH;
call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete->fa_info, tos, cfg->fc_type,
- tb->tb_id, 0);
+ fa_to_delete->fa_info, tos,
+ fa_to_delete->fa_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1963,7 +1964,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
+ tb->tb_id != fa->tb_id) {
slen = fa->fa_slen;
continue;
}
@@ -1972,7 +1974,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
n->key,
KEYLENGTH - fa->fa_slen,
fi, fa->fa_tos, fa->fa_type,
- tb->tb_id, 0);
+ tb->tb_id);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -2012,7 +2014,7 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
call_fib_entry_notifier(nb, net, event_type, l->key,
KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
- fa->fa_type, fa->tb_id, 0);
+ fa->fa_type, fa->tb_id);
}
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..fc310db2708b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net)
return *this_cpu_ptr(net->ipv4.icmp_sk);
}
+/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- local_bh_disable();
-
sk = icmp_sk(net);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet.
*/
- local_bh_enable();
return NULL;
}
return sk;
@@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
static inline void icmp_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&sk->sk_lock.slock);
+ spin_unlock(&sk->sk_lock.slock);
}
int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
}
EXPORT_SYMBOL(icmp_global_allow);
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+ if (type > NR_ICMP_TYPES)
+ return true;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ return true;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ return true;
+
+ return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+ if (icmpv4_mask_allow(net, type, code))
+ return true;
+
+ if (icmp_global_allow())
+ return true;
+
+ return false;
+}
+
/*
* Send an ICMP frame.
*/
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
+ struct inet_peer *peer;
bool rc = true;
+ int vif;
- if (type > NR_ICMP_TYPES)
- goto out;
-
- /* Don't limit PMTU discovery. */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ if (icmpv4_mask_allow(net, type, code))
goto out;
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
goto out;
- /* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
- goto out;
-
- rc = false;
- if (icmp_global_allow()) {
- int vif = l3mdev_master_ifindex(dst->dev);
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer,
- net->ipv4.sysctl_icmp_ratelimit);
- if (peer)
- inet_putpeer(peer);
- }
+ vif = l3mdev_master_ifindex(dst->dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+ rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
out:
return rc;
}
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct inet_sock *inet;
__be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
+ int type = icmp_param->data.icmph.type;
+ int code = icmp_param->data.icmph.code;
if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* global icmp_msgs_per_sec */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
+
sk = icmp_xmit_lock(net);
if (!sk)
- return;
+ goto out_bh_enable;
inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
- if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
- icmp_param->data.icmph.code))
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
+out_bh_enable:
+ local_bh_enable();
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm *icmp_param;
+ struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
- if (!icmp_param)
- return;
+ /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ local_bh_disable();
+
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit */
+ if (!icmpv4_global_allow(net, type, code))
+ goto out_bh_enable;
sk = icmp_xmit_lock(net);
if (!sk)
- goto out_free;
+ goto out_bh_enable;
/*
* Construct source address and options.
@@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+ if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param->data.icmph.type = type;
- icmp_param->data.icmph.code = code;
- icmp_param->data.icmph.un.gateway = info;
- icmp_param->data.icmph.checksum = 0;
- icmp_param->skb = skb_in;
- icmp_param->offset = skb_network_offset(skb_in);
+ icmp_param.data.icmph.type = type;
+ icmp_param.data.icmph.code = code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum = 0;
+ icmp_param.skb = skb_in;
+ icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
sk->sk_mark = mark;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param->replyopts.opt;
+ ipc.opt = &icmp_param.replyopts.opt;
ipc.tx_flags = 0;
ipc.ttl = 0;
ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
- type, code, icmp_param);
+ type, code, &icmp_param);
if (IS_ERR(rt))
goto out_unlock;
+ /* peer icmp_ratelimit */
if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende;
@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param->data_len = skb_in->len - icmp_param->offset;
- if (icmp_param->data_len > room)
- icmp_param->data_len = room;
- icmp_param->head_len = sizeof(struct icmphdr);
+ icmp_param.data_len = skb_in->len - icmp_param.offset;
+ if (icmp_param.data_len > room)
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
-out_free:
- kfree(icmp_param);
+out_bh_enable:
+ local_bh_enable();
out:;
}
EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 19ea045c50ed..b4d5980ade3b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -31,6 +31,86 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
+#if IS_ENABLED(CONFIG_IPV6)
+/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ * only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
+ */
+static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+ const struct in6_addr *sk2_rcv_saddr6,
+ __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk1_ipv6only, bool sk2_ipv6only,
+ bool match_wildcard)
+{
+ int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+ }
+
+ if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+ return 1;
+
+ if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
+ return 1;
+
+ return 0;
+}
+#endif
+
+/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+ bool sk2_ipv6only, bool match_wildcard)
+{
+ if (!sk2_ipv6only) {
+ if (sk1_rcv_saddr == sk2_rcv_saddr)
+ return 1;
+ if (!sk1_rcv_saddr || !sk2_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+}
+
+int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
+ inet6_rcv_saddr(sk2),
+ sk->sk_rcv_saddr,
+ sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk),
+ ipv6_only_sock(sk2),
+ match_wildcard);
+#endif
+ return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
+ ipv6_only_sock(sk2), match_wildcard);
+}
+EXPORT_SYMBOL(inet_rcv_saddr_equal);
+
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
@@ -44,9 +124,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
-int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb, bool relax,
- bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb,
+ bool relax, bool reuseport_ok)
{
struct sock *sk2;
bool reuse = sk->sk_reuse;
@@ -62,7 +142,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
- !inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -72,54 +151,34 @@ int inet_csk_bind_conflict(const struct sock *sk,
rcu_access_pointer(sk->sk_reuseport_cb) ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
-
- if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
- sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
return sk2 != NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket. Returns with the
+ * inet_bind_hashbucket lock held.
*/
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
- bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, attempts = 5, port = snum;
- int smallest_size = -1, smallest_port;
+ int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
- kuid_t uid = sock_i_uid(sk);
u32 remaining, offset;
- bool reuseport_ok = !!snum;
- if (port) {
-have_port:
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
- goto tb_found;
-
- goto tb_not_found;
- }
-again:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -143,8 +202,6 @@ other_half_scan:
* We do the opposite to not pollute connect() users.
*/
offset |= 1U;
- smallest_size = -1;
- smallest_port = low; /* avoid compiler warning */
other_parity_scan:
port = low + offset;
@@ -158,30 +215,17 @@ other_parity_scan:
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == port) {
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_port = port;
- }
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
- reuseport_ok))
- goto tb_found;
+ if (!inet_csk_bind_conflict(sk, tb, false, false))
+ goto success;
goto next_port;
}
- goto tb_not_found;
+ tb = NULL;
+ goto success;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
}
- if (smallest_size != -1) {
- port = smallest_port;
- goto have_port;
- }
offset--;
if (!(offset & 1))
goto other_parity_scan;
@@ -191,8 +235,74 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
- return ret;
+ return NULL;
+success:
+ *port_ret = port;
+ *tb_ret = tb;
+ return head;
+}
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+ struct sock *sk)
+{
+ kuid_t uid = sock_i_uid(sk);
+
+ if (tb->fastreuseport <= 0)
+ return 0;
+ if (!sk->sk_reuseport)
+ return 0;
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ return 0;
+ if (!uid_eq(tb->fastuid, uid))
+ return 0;
+ /* We only need to check the rcv_saddr if this tb was once marked
+ * without fastreuseport and then was reset, as we can only know that
+ * the fast_*rcv_saddr doesn't have any conflicts with the socks on the
+ * owners list.
+ */
+ if (tb->fastreuseport == FASTREUSEPORT_ANY)
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->fast_sk_family == AF_INET6)
+ return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
+ &sk->sk_v6_rcv_saddr,
+ tb->fast_rcv_saddr,
+ sk->sk_rcv_saddr,
+ tb->fast_ipv6_only,
+ ipv6_only_sock(sk), true);
+#endif
+ return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
+ ipv6_only_sock(sk), true);
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, port = snum;
+ struct inet_bind_hashbucket *head;
+ struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb = NULL;
+ kuid_t uid = sock_i_uid(sk);
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &port);
+ if (!head)
+ return ret;
+ if (!tb)
+ goto tb_not_found;
+ goto success;
+ }
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
@@ -203,39 +313,54 @@ tb_found:
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (((tb->fastreuse > 0 && reuse) ||
- (tb->fastreuseport > 0 &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size == -1)
+ if ((tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
goto success;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
- reuseport_ok)) {
- if ((reuse ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- !rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(tb->fastuid, uid))) &&
- !snum && smallest_size != -1 && --attempts >= 0) {
- spin_unlock_bh(&head->lock);
- goto again;
- }
+ if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
+ }
+success:
+ if (!hlist_empty(&tb->owners)) {
+ tb->fastreuse = reuse;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = FASTREUSEPORT_ANY;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ } else {
+ tb->fastreuseport = 0;
}
+ } else {
if (!reuse)
tb->fastreuse = 0;
- if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
- tb->fastreuseport = 0;
- } else {
- tb->fastreuse = reuse;
if (sk->sk_reuseport) {
- tb->fastreuseport = 1;
- tb->fastuid = uid;
+ /* We didn't match or we don't have fastreuseport set on
+ * the tb, but we have sk_reuseport set on this socket
+ * and we know that there are no bind conflicts with
+ * this socket in this tb, so reset our tb's reuseport
+ * settings so that any subsequent sockets that match
+ * our current socket will be put on the fast path.
+ *
+ * If we reset we need to set FASTREUSEPORT_STRICT so we
+ * do extra checking for all subsequent sk_reuseport
+ * socks.
+ */
+ if (!sk_reuseport_match(tb, sk)) {
+ tb->fastreuseport = FASTREUSEPORT_STRICT;
+ tb->fastuid = uid;
+ tb->fast_rcv_saddr = sk->sk_rcv_saddr;
+ tb->fast_ipv6_only = ipv6_only_sock(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+#endif
+ }
} else {
tb->fastreuseport = 0;
}
}
-success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
@@ -711,9 +836,8 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- local_bh_disable();
percpu_counter_dec(sk->sk_prot->orphan_count);
- local_bh_enable();
+
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4dea33e5f295..3828b3a805cd 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ca97835bfec4..8bea74298173 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
- tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
{
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
- tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
@@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
- tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -435,10 +432,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
- struct inet_listen_hashbucket *ilb,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
struct sock *sk2;
@@ -451,7 +445,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- saddr_same(sk, sk2, false))
+ inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2);
}
@@ -461,10 +455,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
return 0;
}
-int __inet_hash(struct sock *sk, struct sock *osk,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
@@ -479,7 +470,7 @@ int __inet_hash(struct sock *sk, struct sock *osk,
spin_lock(&ilb->lock);
if (sk->sk_reuseport) {
- err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ err = inet_reuseport_add_sock(sk, ilb);
if (err)
goto unlock;
}
@@ -503,7 +494,7 @@ int inet_hash(struct sock *sk)
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
+ err = __inet_hash(sk, NULL);
local_bh_enable();
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index ddcd56c08d14..f8aff2c71cde 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-void inet_twsk_purge(struct inet_hashinfo *hashinfo,
- struct inet_timewait_death_row *twdr, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
{
struct inet_timewait_sock *tw;
struct sock *sk;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b67719f45953..737ce826d7ec 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
- int res = dst_neigh_output(dst, neigh, skb);
+ int res;
+
+ sock_confirm_neigh(skb, neigh);
+ res = neigh_output(neigh, skb);
rcu_read_unlock_bh();
return res;
@@ -886,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
skb->csum = 0;
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
__skb_queue_tail(queue, skb);
} else if (skb_is_gso(skb)) {
goto append;
@@ -1086,6 +1092,9 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ if ((flags & MSG_CONFIRM) && !skb_prev)
+ skb_set_dst_pending_confirm(skb, 1);
+
/*
* Put the packet on the pending queue.
*/
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 900011709e3b..ce1386a67e24 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
- err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = cmsg->cmsg_len - sizeof(struct cmsghdr);
/* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
@@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
{
struct ip_mreqn mreq;
struct net_device *dev = NULL;
+ int midx;
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EADDRNOTAVAIL;
if (!dev)
break;
+
+ midx = l3mdev_master_ifindex(dev);
+
dev_put(dev);
err = -EINVAL;
if (sk->sk_bound_dev_if &&
- mreq.imr_ifindex != sk->sk_bound_dev_if)
+ mreq.imr_ifindex != sk->sk_bound_dev_if &&
+ (!midx || midx != sk->sk_bound_dev_if))
break;
inet->mc_index = mreq.imr_ifindex;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0fd1976ab63b..a31f47ccaad9 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
/* Often modified stats are per cpu, other are shared (netdev->stats) */
-struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
+void ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
{
int i;
@@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
tot->rx_bytes += rx_bytes;
tot->tx_bytes += tx_bytes;
}
-
- return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
@@ -228,7 +226,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
};
-static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
@@ -325,7 +323,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
};
-static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
+static int ip6_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index efc1e76d4977..beacd028848c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net)
}
#endif
+static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct mfc_cache_cmp_arg *cmparg = arg->key;
+ struct mfc_cache *c = (struct mfc_cache *)ptr;
+
+ return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
+ cmparg->mfc_origin != c->mfc_origin;
+}
+
+static const struct rhashtable_params ipmr_rht_params = {
+ .head_offset = offsetof(struct mfc_cache, mnode),
+ .key_offset = offsetof(struct mfc_cache, cmparg),
+ .key_len = sizeof(struct mfc_cache_cmp_arg),
+ .nelem_hint = 3,
+ .locks_mul = 1,
+ .obj_cmpfn = ipmr_hash_cmp,
+ .automatic_shrinking = true,
+};
+
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
struct mr_table *mrt;
- unsigned int i;
/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
if (id != RT_TABLE_DEFAULT && id >= 1000000000)
@@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
write_pnet(&mrt->net, net);
mrt->id = id;
- /* Forwarding cache */
- for (i = 0; i < MFC_LINES; i++)
- INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
-
+ rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
+ INIT_LIST_HEAD(&mrt->mfc_cache_list);
INIT_LIST_HEAD(&mrt->mfc_unres_queue);
setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
@@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, true);
+ rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
@@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
{
- int line = MFC_HASH(mcastgrp, origin);
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ return c;
+
return NULL;
}
@@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
int vifi)
{
- int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = htonl(INADDR_ANY),
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == htonl(INADDR_ANY) &&
- c->mfc_un.res.ttls[vifi] < 255)
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (c->mfc_un.res.ttls[vifi] < 255)
return c;
return NULL;
@@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
__be32 mcastgrp, int vifi)
{
- int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = htonl(INADDR_ANY)
+ };
+ struct rhlist_head *tmp, *list;
struct mfc_cache *c, *proxy;
if (mcastgrp == htonl(INADDR_ANY))
goto skip;
- list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
- if (c->mfc_origin == htonl(INADDR_ANY) &&
- c->mfc_mcastgrp == mcastgrp) {
- if (c->mfc_un.res.ttls[vifi] < 255)
- return c;
-
- /* It's ok if the vifi is part of the static tree */
- proxy = ipmr_cache_find_any_parent(mrt,
- c->mfc_parent);
- if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
- return c;
- }
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
skip:
return ipmr_cache_find_any_parent(mrt, vifi);
}
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
+ __be32 origin, __be32 mcastgrp,
+ int parent)
+{
+ struct mfc_cache_cmp_arg arg = {
+ .mfc_mcastgrp = mcastgrp,
+ .mfc_origin = origin,
+ };
+ struct rhlist_head *tmp, *list;
+ struct mfc_cache *c;
+
+ list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
+ rhl_for_each_entry_rcu(c, tmp, list, mnode)
+ if (parent == -1 || parent == c->mfc_parent)
+ return c;
+
+ return NULL;
+}
+
/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
@@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb)
{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct mfc_cache *c;
bool found = false;
int err;
- struct mfc_cache *c;
- const struct iphdr *iph = ip_hdr(skb);
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
@@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
- int line;
- struct mfc_cache *c, *next;
+ struct mfc_cache *c;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (!c)
+ return -ENOENT;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- return 0;
- }
- }
- return -ENOENT;
+ return 0;
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
struct mfcctl *mfc, int mrtsock, int parent)
{
- bool found = false;
- int line;
struct mfc_cache *uc, *c;
+ bool found;
+ int ret;
if (mfc->mfcc_parent >= MAXVIFS)
return -ENFILE;
- line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
-
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
- if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
- (parent == -1 || parent == c->mfc_parent)) {
- found = true;
- break;
- }
- }
-
- if (found) {
+ /* The entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
+ mfc->mfcc_mcastgrp.s_addr, parent);
+ rcu_read_unlock();
+ if (c) {
write_lock_bh(&mrt_lock);
c->mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
@@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
-
+ ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+ ipmr_rht_params);
+ if (ret) {
+ pr_err("ipmr: rhtable insert error %d\n", ret);
+ ipmr_cache_free(c);
+ return ret;
+ }
+ list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
/* Check to see if we resolved a queued list. If so we
* need to send on the frames and tidy up.
*/
@@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
- int i;
+ struct mfc_cache *c, *tmp;
LIST_HEAD(list);
- struct mfc_cache *c, *next;
+ int i;
/* Shut down all active vif entries */
for (i = 0; i < mrt->maxvif; i++) {
@@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
unregister_netdevice_many(&list);
/* Wipe the cache */
- for (i = 0; i < MFC_LINES; i++) {
- list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- list_del_rcu(&c->list);
- mroute_netlink_event(mrt, c, RTM_DELROUTE);
- ipmr_cache_free(c);
- }
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (!all && (c->mfc_flags & MFC_STATIC))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
}
if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
list_del(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
@@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *cache,
int local)
{
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
int psend = -1;
int vif, ct;
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
@@ -2091,8 +2136,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ct;
/* If cache is unresolved, don't try to parse IIF and OIF */
- if (c->mfc_parent >= MAXVIFS)
+ if (c->mfc_parent >= MAXVIFS) {
+ rtm->rtm_flags |= RTNH_F_UNRESOLVED;
return -ENOENT;
+ }
if (VIF_EXISTS(mrt, c->mfc_parent) &&
nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
@@ -2134,7 +2181,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
int ipmr_get_route(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr,
- struct rtmsg *rtm, int nowait, u32 portid)
+ struct rtmsg *rtm, u32 portid)
{
struct mfc_cache *cache;
struct mr_table *mrt;
@@ -2158,11 +2205,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
struct net_device *dev;
int vif = -1;
- if (nowait) {
- rcu_read_unlock();
- return -EAGAIN;
- }
-
dev = skb->dev;
read_lock(&mrt_lock);
if (dev)
@@ -2296,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
struct mr_table *mrt;
struct mfc_cache *mfc;
unsigned int t = 0, s_t;
- unsigned int h = 0, s_h;
unsigned int e = 0, s_e;
s_t = cb->args[0];
- s_h = cb->args[1];
- s_e = cb->args[2];
+ s_e = cb->args[1];
rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
- if (t > s_t)
- s_h = 0;
- for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
- if (e < s_e)
- goto next_entry;
- if (ipmr_fill_mroute(mrt, skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- mfc, RTM_NEWROUTE,
- NLM_F_MULTI) < 0)
- goto done;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
next_entry:
- e++;
- }
- e = s_e = 0;
+ e++;
}
+ e = 0;
+ s_e = 0;
+
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
if (e < s_e)
@@ -2340,16 +2378,15 @@ next_entry2:
e++;
}
spin_unlock_bh(&mfc_unres_lock);
- e = s_e = 0;
- s_h = 0;
+ e = 0;
+ s_e = 0;
next_table:
t++;
}
done:
rcu_read_unlock();
- cb->args[2] = e;
- cb->args[1] = h;
+ cb->args[1] = e;
cb->args[0] = t;
return skb->len;
@@ -2593,10 +2630,8 @@ struct ipmr_mfc_iter {
struct seq_net_private p;
struct mr_table *mrt;
struct list_head *cache;
- int ct;
};
-
static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct ipmr_mfc_iter *it, loff_t pos)
{
@@ -2604,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mfc_cache *mfc;
rcu_read_lock();
- for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry_rcu(mfc, it->cache, list)
- if (pos-- == 0)
- return mfc;
- }
+ it->cache = &mrt->mfc_cache_list;
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+ if (pos-- == 0)
+ return mfc;
rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
@@ -2636,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
it->mrt = mrt;
it->cache = NULL;
- it->ct = 0;
return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
: SEQ_START_TOKEN;
}
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct mfc_cache *mfc = v;
struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt = it->mrt;
+ struct mfc_cache *mfc = v;
++*pos;
@@ -2659,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (it->cache == &mrt->mfc_unres_queue)
goto end_of_list;
- BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
-
- while (++it->ct < MFC_LINES) {
- it->cache = &mrt->mfc_cache_array[it->ct];
- if (list_empty(it->cache))
- continue;
- return list_first_entry(it->cache, struct mfc_cache, list);
- }
-
/* exhausted cache_array, show unresolved */
rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
- it->ct = 0;
spin_lock_bh(&mfc_unres_lock);
if (!list_empty(it->cache))
@@ -2691,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
- else if (it->cache == &mrt->mfc_cache_array[it->ct])
+ else if (it->cache == &mrt->mfc_cache_list)
rcu_read_unlock();
}
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a467e1236c43..6241a81fd7f5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -677,11 +677,6 @@ static int copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- /* ... then copy entire thing ... */
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -689,6 +684,10 @@ static int copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct arpt_entry, counters),
&counters[num],
@@ -698,11 +697,7 @@ static int copy_entries_to_user(unsigned int total_size,
}
t = arpt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 91656a1d8fbd..384b85713e06 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -826,10 +826,6 @@ copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
loc_cpu_entry = private->entries;
- if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
- ret = -EFAULT;
- goto free_counters;
- }
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
@@ -839,6 +835,10 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_entry_target *t;
e = (struct ipt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off, e, sizeof(*e))) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -852,23 +852,14 @@ copy_entries_to_user(unsigned int total_size,
i += m->u.match_size) {
m = (void *)e + i;
- if (copy_to_user(userptr + off + i
- + offsetof(struct xt_entry_match,
- u.user.name),
- m->u.kernel.match->name,
- strlen(m->u.kernel.match->name)+1)
- != 0) {
+ if (xt_match_to_user(m, userptr + off + i)) {
ret = -EFAULT;
goto free_counters;
}
}
t = ipt_get_target_c(e);
- if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct xt_entry_target,
- u.user.name),
- t->u.kernel.target->name,
- strlen(t->u.kernel.target->name)+1) != 0) {
+ if (xt_target_to_user(t, userptr + off + e->target_offset)) {
ret = -EFAULT;
goto free_counters;
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0a783cd73faf..52f26459efc3 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -485,6 +485,7 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
.checkentry = clusterip_tg_check,
.destroy = clusterip_tg_destroy,
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
+ .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
#endif /* CONFIG_COMPAT */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 30c0de53e254..3240a2614e82 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -57,8 +57,7 @@ synproxy_send_tcp(struct net *net,
goto free_nskb;
if (nfct) {
- nskb->nfct = nfct;
- nskb->nfctinfo = ctinfo;
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
nf_conntrack_get(nfct);
}
@@ -107,8 +106,8 @@ synproxy_send_client_synack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static void
@@ -230,8 +229,8 @@ synproxy_send_client_ack(struct net *net,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
}
static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d075b3cf2400..73c591d8a9a8 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -128,16 +128,16 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
+ enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
- NF_CT_ASSERT(skb->nfct == NULL);
+ NF_CT_ASSERT(!skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return -NF_ACCEPT;
}
- *ctinfo = IP_CT_RELATED;
+ ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(net, zone, &innertuple);
if (!h) {
@@ -169,11 +169,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
}
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- *ctinfo += IP_CT_IS_REPLY;
+ ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
- skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
- skb->nfctinfo = *ctinfo;
+ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
return NF_ACCEPT;
}
@@ -181,7 +180,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
- enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+ u8 pf, unsigned int hooknum)
{
const struct icmphdr *icmph;
struct icmphdr _ih;
@@ -225,7 +224,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+ return icmp_error_message(net, tmpl, skb, hooknum);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 49bd6a54404f..346bf7ccac08 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
{
u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (skb->nfct) {
+ if (skb_nfct(skb)) {
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
@@ -75,7 +75,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
#if !IS_ENABLED(CONFIG_NF_NAT)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
- if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
return NF_ACCEPT;
#endif
#endif
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index cf986e1c7bbd..f0dbff05fc28 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -68,10 +68,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_ct_untracked_get()->ct_general;
- skb->nfctinfo = IP_CT_NEW;
- nf_conntrack_get(skb->nfct);
+ nf_reset(skb);
+ nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW);
+ nf_conntrack_get(skb_nfct(skb));
#endif
/*
* If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index b24795e2ee6d..f6f713376e6e 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -87,7 +87,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 856648966f4c..c83a9963269b 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -319,7 +319,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
struct nf_log_buf *m;
/* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net))
+ if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
return;
m = nf_log_buf_open();
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 68d77b1f1495..2af6244b83e2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -433,9 +433,9 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
}
- pr_debug("after bind(): num = %d, dif = %d\n",
- (int)isk->inet_num,
- (int)sk->sk_bound_dev_if);
+ pr_debug("after bind(): num = %hu, dif = %d\n",
+ isk->inet_num,
+ sk->sk_bound_dev_if);
err = 0;
if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
@@ -850,7 +850,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7143ca1a6af9..69cf49e8356d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -57,15 +57,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
unsigned int frag_mem;
int orphans, sockets;
- local_bh_disable();
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
- local_bh_enable();
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&tcp_death_row.tw_count), sockets,
+ atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
@@ -264,6 +262,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4e49e5cb001c..8119e1f66e03 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ if (flags & MSG_CONFIRM)
+ skb_set_dst_pending_confirm(skb, 1);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4.daddr);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 709ffe67d1de..cb494a5050f7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
@@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = {
.redirect = ip_do_redirect,
.local_out = __ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
+ .confirm_neigh = ipv4_confirm_neigh,
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
return neigh_create(&arp_tbl, pkey, dev);
}
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+
+ rt = (const struct rtable *)dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *)&rt->rt_gateway;
+ else if (!daddr ||
+ (rt->rt_flags &
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ return;
+
+ __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
#define IP_IDENTS_SZ 2048u
static atomic_t *ip_idents __read_mostly;
@@ -1758,7 +1777,6 @@ standard_hash:
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
@@ -1883,7 +1901,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
@@ -2454,7 +2472,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq, int event, int nowait, unsigned int flags)
+ u32 seq, int event)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
@@ -2463,7 +2481,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
if (!nlh)
return -EMSGSIZE;
@@ -2541,18 +2559,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr,
- r, nowait, portid);
+ r, portid);
if (err <= 0) {
- if (!nowait) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- } else {
- if (err == -EMSGSIZE)
- goto nla_put_failure;
- error = err;
- }
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
}
} else
#endif
@@ -2638,9 +2650,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
- local_bh_enable();
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
@@ -2665,7 +2675,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- RTM_NEWROUTE, 0, 0);
+ RTM_NEWROUTE);
if (err < 0)
goto errout_free;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3e88467d70ee..496b97e17aaf 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <net/tcp.h>
#include <net/route.h>
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp;
-
net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
- tmp = this_cpu_ptr(ipv4_cookie_scratch);
- memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
- tmp[0] = (__force u32)saddr;
- tmp[1] = (__force u32)daddr;
- tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
- tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
- return tmp[17];
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u32)sport << 16 | (__force u32)dport,
+ count, &syncookie_secret[c]);
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2fa498b15d1..d6880a6149ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int ip_privileged_port_min;
+static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
@@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (range[1] < range[0])
+ /* Ensure that the upper limit is not smaller than the lower,
+ * and that the lower does not encroach upon the privileged
+ * port limit.
+ */
+ if ((range[1] < range[0]) ||
+ (range[0] < net->ipv4.sysctl_ip_prot_sock))
ret = -EINVAL;
else
set_local_port_range(net, range);
@@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
return ret;
}
+/* Validate changes from /proc interface. */
+static int ipv4_privileged_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_prot_sock);
+ int ret;
+ int pports;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &pports,
+ .maxlen = sizeof(pports),
+ .mode = table->mode,
+ .extra1 = &ip_privileged_port_min,
+ .extra2 = &ip_privileged_port_max,
+ };
+
+ pports = net->ipv4.sysctl_ip_prot_sock;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ inet_get_local_port_range(net, &range[0], &range[1]);
+ /* Ensure that the local port range doesn't overlap with the
+ * privileged port range.
+ */
+ if (range[0] < pports)
+ ret = -EINVAL;
+ else
+ net->ipv4.sysctl_ip_prot_sock = pports;
+ }
+
+ return ret;
+}
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
@@ -290,13 +331,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
.maxlen = sizeof(int),
@@ -310,13 +344,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_fastopen_key,
},
{
- .procname = "tcp_tw_recycle",
- .data = &tcp_death_row.sysctl_tw_recycle,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -338,13 +365,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_max_syn_backlog",
- .data = &sysctl_max_syn_backlog,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -558,13 +578,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_thin_dupack",
- .data = &sysctl_tcp_thin_dupack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_early_retrans",
.data = &sysctl_tcp_early_retrans,
.maxlen = sizeof(int),
@@ -960,6 +973,27 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_max_tw_buckets",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_tw_recycle",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_syn_backlog",
+ .data = &init_net.ipv4.sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
.procname = "fib_multipath_use_neigh",
@@ -971,6 +1005,24 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "ip_unprivileged_port_start",
+ .maxlen = sizeof(int),
+ .data = &init_net.ipv4.sysctl_ip_prot_sock,
+ .mode = 0644,
+ .proc_handler = ipv4_privileged_ports,
+ },
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "udp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0efb4c7f6704..d44a6989e76d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
- tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -421,15 +420,13 @@ void tcp_init_sock(struct sock *sk)
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
- local_bh_disable();
sk_sockets_allocated_inc(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_init_sock);
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (tsflags) {
+ if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -536,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
+ } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* Active TCP fastopen socket with defer_connect
+ * Return POLLOUT so application can call write()
+ * in order for kernel to generate SYN+data
+ */
+ mask |= POLLOUT | POLLWRNORM;
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
@@ -964,10 +967,8 @@ new_segment:
copied += copy;
offset += copy;
size -= copy;
- if (!size) {
- tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+ if (!size)
goto out;
- }
if (skb->len < size_goal || (flags & MSG_OOB))
continue;
@@ -993,8 +994,11 @@ wait_for_memory:
}
out:
- if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ if (copied) {
+ tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+ if (!(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
return copied;
do_error:
@@ -1079,6 +1083,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int *copied, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
int err, flags;
if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1093,9 +1098,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ if (inet->defer_connect) {
+ err = tcp_connect(sk);
+ /* Same failure procedure as in tcp_v4/6_connect */
+ if (err) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ }
+ }
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
- msg->msg_namelen, flags);
+ msg->msg_namelen, flags, 1);
+ inet->defer_connect = 0;
*copied = tp->fastopen_req->copied;
tcp_free_fastopen_req(tp);
return err;
@@ -1115,7 +1130,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
flags = msg->msg_flags;
- if (flags & MSG_FASTOPEN) {
+ if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
@@ -1287,7 +1302,6 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, sockc.tsflags, skb);
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out;
@@ -1318,8 +1332,10 @@ wait_for_memory:
}
out:
- if (copied)
+ if (copied) {
+ tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ }
out_nopush:
release_sock(sk);
return copied + copied_syn;
@@ -2479,11 +2495,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
- else {
- tp->thin_dupack = val;
- if (tp->thin_dupack)
- tcp_disable_early_retrans(tp);
- }
break;
case TCP_REPAIR:
@@ -2668,6 +2679,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EINVAL;
}
break;
+ case TCP_FASTOPEN_CONNECT:
+ if (val > 1 || val < 0) {
+ err = -EINVAL;
+ } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ if (sk->sk_state == TCP_CLOSE)
+ tp->fastopen_connect = val;
+ else
+ err = -EINVAL;
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ break;
case TCP_TIMESTAMP:
if (!tp->repair)
err = -EPERM;
@@ -2770,6 +2793,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_sacked = sk->sk_max_ack_backlog;
return;
}
+
+ slow = lock_sock_fast(sk);
+
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2820,15 +2846,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
- slow = lock_sock_fast(sk);
-
info->tcpi_bytes_acked = tp->bytes_acked;
info->tcpi_bytes_received = tp->bytes_received;
info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
tcp_get_info_chrono_stats(tp, info);
- unlock_sock_fast(sk, slow);
-
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
@@ -2844,6 +2866,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
do_div(rate64, intv);
info->tcpi_delivery_rate = rate64;
}
+ unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2853,7 +2876,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
struct sk_buff *stats;
struct tcp_info info;
- stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+ stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -2864,6 +2887,10 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
info.tcpi_rwnd_limited, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
+ tp->data_segs_out, TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
+ tp->total_retrans, TCP_NLA_PAD);
return stats;
}
@@ -2973,8 +3000,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
+
case TCP_THIN_DUPACK:
- val = tp->thin_dupack;
+ val = 0;
break;
case TCP_REPAIR:
@@ -3027,6 +3055,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
+ case TCP_FASTOPEN_CONNECT:
+ val = tp->fastopen_connect;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
@@ -3340,6 +3372,7 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+ inet_hashinfo_init(&tcp_hashinfo);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
@@ -3383,10 +3416,7 @@ void __init tcp_init(void)
cnt = tcp_hashinfo.ehash_mask + 1;
-
- tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
sysctl_tcp_max_orphans = cnt / 2;
- sysctl_max_syn_backlog = max(128, cnt / 256);
tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
@@ -3405,6 +3435,7 @@ void __init tcp_init(void)
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_v4_init();
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index dd2560c83a85..8ea4e9787f82 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -326,3 +326,57 @@ fastopen:
*foc = valid_foc;
return NULL;
}
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie)
+{
+ unsigned long last_syn_loss = 0;
+ int syn_loss = 0;
+
+ tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+
+ /* Recurring FO SYN losses: no cookie or data in SYN */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ cookie->len = -1;
+ return false;
+ }
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+ cookie->len = -1;
+ return true;
+ }
+ return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write(). We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ * return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+ struct tcp_fastopen_cookie cookie = { .len = 0 };
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (tp->fastopen_connect && !tp->fastopen_req) {
+ if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+ inet_sk(sk)->defer_connect = 1;
+ return true;
+ }
+
+ /* Alloc fastopen_req in order for FO option to be included
+ * in SYN
+ */
+ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+ sk->sk_allocation);
+ if (tp->fastopen_req)
+ tp->fastopen_req->cookie = cookie;
+ else
+ *err = -ENOBUFS;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 41dcbd568cbe..2c0ff327b6df 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,7 +79,7 @@
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
-int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
@@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
-
-int sysctl_tcp_thin_dupack __read_mostly;
-
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
@@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
tcp_disable_fack(tp);
}
- if (metric > 0)
- tcp_disable_early_retrans(tp);
tp->rack.reord = 1;
}
@@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
tp->retransmit_skb_hint = skb;
-
- if (!tp->lost_out ||
- after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
/* Sum the number of packets on the wire we have marked as lost.
@@ -1135,6 +1126,7 @@ struct tcp_sacktag_state {
*/
struct skb_mstamp first_sackt;
struct skb_mstamp last_sackt;
+ struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */
struct rate_sample *rate;
int flag;
};
@@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
- tcp_rack_advance(tp, xmit_time, sacked);
+ tcp_rack_advance(tp, sacked, end_seq,
+ xmit_time, &state->ack_time);
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
@@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
- bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
bool mark_lost;
@@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}
}
tcp_verify_left_out(tp);
@@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
- /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
- * loss recovery is underway except recurring timeout(s) on
- * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+ * if a previous recovery is underway, otherwise it may incorrectly
+ * call a timeout spurious if some previously retransmitted packets
+ * are s/acked (sec 3.2). We do not apply that retriction since
+ * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+ * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+ * on PTMU discovery to avoid sending new data.
*/
- tp->frto = sysctl_tcp_frto &&
- (new_recovery || icsk->icsk_retransmits) &&
- !inet_csk(sk)->icsk_mtup.probe_size;
+ tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
}
/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
-static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- unsigned long delay;
-
- /* Delay early retransmit and entering fast recovery for
- * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
- * available, or RTO is scheduled to fire first.
- */
- if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
- (flag & FLAG_ECE) || !tp->srtt_us)
- return false;
-
- delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
- msecs_to_jiffies(2));
-
- if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
- return false;
-
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
- TCP_RTO_MAX);
- return true;
-}
-
/* Linux NewReno/SACK/FACK/ECN state machine.
* --------------------------------------
*
@@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
* lost packets.
*
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal the DUPACK thoreshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK (Disabled by default. Subsumbed by RACK):
+ * It is the simplest heuristics. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
@@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- __u32 packets_out;
- int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
if (tcp_dupack_heuristics(tp) > tp->reordering)
return true;
- /* Trick#4: It is still not OK... But will it be useful to delay
- * recovery more?
- */
- packets_out = tp->packets_out;
- if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
- !tcp_may_send_now(sk)) {
- /* We have nothing to send. This connection is limited
- * either by receiver window or by application.
- */
- return true;
- }
-
- /* If a thin stream is detected, retransmit after first
- * received dupack. Employ only if SACK is supported in order
- * to avoid possible corner-case series of spurious retransmissions
- * Use only if there are no unsent data.
- */
- if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
- tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
- tcp_is_sack(tp) && !tcp_send_head(sk))
- return true;
-
- /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
- * retransmissions due to small network reorderings, we implement
- * Mitigation A.3 in the RFC and delay the retransmission for a short
- * interval if appropriate.
- */
- if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
- (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
- !tcp_may_send_now(sk))
- return !tcp_pause_early_retransmit(sk, flag);
-
return false;
}
@@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
- int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
EXPORT_SYMBOL(tcp_simple_retransmit);
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
@@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
tcp_try_undo_loss(sk, false))
return;
- if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
- /* Step 3.b. A timeout is spurious if not all data are
- * lost, i.e., never-retransmitted data are (s)acked.
- */
- if ((flag & FLAG_ORIG_SACK_ACKED) &&
- tcp_try_undo_loss(sk, true))
- return;
+ /* The ACK (s)acks some never-retransmitted data meaning not all
+ * the data packets before the timeout were lost. Therefore we
+ * undo the congestion window and state. This is essentially
+ * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+ * a retransmitted skb is permantly marked, we can apply such an
+ * operation even if F-RTO was not used.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, tp->undo_marker))
+ return;
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
@@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
return false;
}
+static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
+ const struct skb_mstamp *ack_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+ u32 prior_retrans = tp->retrans_out;
+
+ tcp_rack_mark_lost(sk, ack_time);
+ if (prior_retrans > tp->retrans_out)
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- bool is_dupack, int *ack_flag, int *rexmit)
+ bool is_dupack, int *ack_flag, int *rexmit,
+ const struct skb_mstamp *ack_time)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
- /* Use RACK to detect loss */
- if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk)) {
- flag |= FLAG_LOST_RETRANS;
- *ack_flag |= FLAG_LOST_RETRANS;
- }
-
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_keep_open(sk);
return;
}
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, is_dupack, rexmit);
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- !(flag & FLAG_LOST_RETRANS))
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
+ if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+ (*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
default:
@@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
+ tcp_rack_identify_loss(sk, ack_flag, ack_time);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
@@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk)
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp =
@@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk)
}
}
-/* This function is called when the delayed ER timer fires. TCP enters
- * fast recovery and performs fast-retransmit.
- */
-void tcp_resume_early_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_rearm_rto(sk);
-
- /* Stop if ER is disabled after the delayed ER timer is scheduled */
- if (!tp->do_early_retrans)
- return;
-
- tcp_enter_recovery(sk, false);
- tcp_update_scoreboard(sk, 1);
- tcp_xmit_retransmit_queue(sk);
-}
-
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, int *acked,
- struct tcp_sacktag_state *sack,
- struct skb_mstamp *now)
+ struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct skb_mstamp first_ackt, last_ackt;
+ struct skb_mstamp *now = &sack->ack_time;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->packets_out;
@@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ tcp_rack_advance(tp, sacked, scb->end_seq,
+ &skb->skb_mstamp,
+ &sack->ack_time);
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
- struct skb_mstamp now;
sack_state.first_sackt.v64 = 0;
sack_state.rate = &rs;
@@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
- skb_mstamp_get(&now);
+ skb_mstamp_get(&sack_state.ack_time);
- if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
if (after(ack, prior_snd_una)) {
@@ -3689,34 +3634,34 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
- &sack_state, &now);
+ &sack_state);
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
- struct dst_entry *dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
- }
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+ sk_dst_confirm(sk);
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
lost = tp->lost - lost; /* freshly marked lost */
- tcp_rate_gen(sk, delivered, lost, &now, &rs);
- tcp_cong_control(sk, ack, delivered, flag, &rs);
+ tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time,
+ sack_state.rate);
+ tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3737,9 +3682,11 @@ old_ack:
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
+ skb_mstamp_get(&sack_state.ack_time);
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit,
+ &sack_state.ack_time);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4557,6 +4504,7 @@ add_sack:
end:
if (skb) {
tcp_grow_window(sk, skb);
+ skb_condense(skb);
skb_set_owner_r(skb, sk);
}
}
@@ -5249,6 +5197,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
return err;
}
+/* Accept RST for rcv_nxt - 1 after a FIN.
+ * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
+ * FIN is sent followed by a RST packet. The RST is sent with the same
+ * sequence number as the FIN, and thus according to RFC 5961 a challenge
+ * ACK should be sent. However, Mac OSX rate limits replies to challenge
+ * ACKs on the closed socket. In addition middleboxes can drop either the
+ * challenge ACK or a subsequent RST.
+ */
+static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
+ (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
+ TCPF_CLOSING));
+}
+
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5287,20 +5252,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
+ } else if (tcp_reset_check(sk, skb)) {
+ tcp_reset(sk);
}
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- /* RFC 5961 3.2 (extend to match against SACK too if available):
- * If seq num matches RCV.NXT or the right-most SACK block,
+ /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
+ * FIN and SACK too if available):
+ * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
+ * the right-most SACK block,
* then
* RESET the connection
* else
* Send a challenge ACK
*/
- if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
+ tcp_reset_check(sk, skb)) {
rst_seq_match = true;
} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
@@ -6022,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
case TCP_FIN_WAIT1: {
- struct dst_entry *dst;
int tmo;
/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst = __sk_dst_get(sk);
- if (dst)
- dst_confirm(dst);
+ sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
@@ -6363,7 +6330,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
- if (tcp_death_row.sysctl_tw_recycle) {
+ if (net->ipv4.tcp_death_row.sysctl_tw_recycle) {
bool strict;
dst = af_ops->route_req(sk, &fl, req, &strict);
@@ -6377,8 +6344,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
/* Kill the following clause, if you dislike this way. */
else if (!net->ipv4.sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
+ (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
tmp_opt.saw_tstamp)) {
/* Without syncookies last quarter of
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe9da4fb96bf..8c124d4ef4b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -146,6 +146,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -196,7 +197,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
tcp_fetch_timewait_stamp(sk, &rt->dst);
@@ -215,7 +216,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
- err = inet_hash_connect(&tcp_death_row, sk);
+ err = inet_hash_connect(tcp_death_row, sk);
if (err)
goto failure;
@@ -231,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
+ rt = NULL;
if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -241,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
+ if (tcp_fastopen_defer_connect(sk, &err))
+ return err;
+ if (err)
+ goto failure;
+
err = tcp_connect(sk);
- rt = NULL;
if (err)
goto failure;
@@ -1318,10 +1324,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric_advmss(dst);
- if (tcp_sk(sk)->rx_opt.user_mss &&
- tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
- newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+ newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -1555,8 +1558,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
* It has been noticed pure SACK packets were sometimes dropped
* (if cooked by drivers without copybreak feature).
*/
- if (!skb->data_len)
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ skb_condense(skb);
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -1816,7 +1818,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
- .bind_conflict = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
@@ -1887,9 +1888,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_free_fastopen_req(tp);
tcp_saved_syn_free(tp);
- local_bh_disable();
sk_sockets_allocated_dec(sk);
- local_bh_enable();
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -2228,7 +2227,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
@@ -2375,6 +2374,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
@@ -2418,7 +2418,7 @@ static void __net_exit tcp_sk_exit(struct net *net)
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu;
+ int res, cpu, cnt;
net->ipv4.tcp_sk = alloc_percpu(struct sock *);
if (!net->ipv4.tcp_sk)
@@ -2457,6 +2457,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 0;
+ cnt = tcp_hashinfo.ehash_mask + 1;
+ net->ipv4.tcp_death_row.sysctl_tw_recycle = 0;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+
return 0;
fail:
tcp_sk_exit(net);
@@ -2466,7 +2473,7 @@ fail:
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
- inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+ inet_twsk_purge(&tcp_hashinfo, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2477,7 +2484,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
- inet_hashinfo_init(&tcp_hashinfo);
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ba8f02d0f283..0f46e5fe31ad 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
u32 val;
int m;
+ sk_dst_confirm(sk);
if (sysctl_tcp_nometrics_save || !dst)
return;
- if (dst->flags & DST_HOST)
- dst_confirm(dst);
-
rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
+ sk_dst_confirm(sk);
if (!dst)
goto reset;
- dst_confirm(dst);
-
rcu_read_lock();
tm = tcp_get_metrics(sk, dst, true);
if (!tm) {
@@ -522,7 +519,6 @@ void tcp_init_metrics(struct sock *sk)
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val) {
tcp_disable_fack(tp);
- tcp_disable_early_retrans(tp);
tp->reordering = val;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 28ce5ee831f5..dff7d2aaf861 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -29,12 +29,6 @@
int sysctl_tcp_abort_on_overflow __read_mostly;
-struct inet_timewait_death_row tcp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .hashinfo = &tcp_hashinfo,
-};
-EXPORT_SYMBOL_GPL(tcp_death_row);
-
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
@@ -100,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
bool paws_reject = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
@@ -153,7 +148,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
- if (tcp_death_row.sysctl_tw_recycle &&
+ if (tcp_death_row->sysctl_tw_recycle &&
tcptw->tw_ts_recent_stamp &&
tcp_tw_remember_stamp(tw))
inet_twsk_reschedule(tw, tw->tw_timeout);
@@ -264,11 +259,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
bool recycle_ok = false;
+ struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
- tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+ tw = inet_twsk_alloc(sk, tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -364,15 +360,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk_listener);
- u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
int full_space = tcp_full_space(sk_listener);
- int mss = dst_metric_advmss(dst);
u32 window_clamp;
__u8 rcv_wscale;
+ int mss;
- if (user_mss && user_mss < mss)
- mss = user_mss;
-
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
@@ -472,7 +465,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack.stamp_jiffies;
newsk->sk_txhash = treq->txhash;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ce50dc3ab8c..61f272a99a49 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- }
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
@@ -966,6 +964,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ /* If we had to use memory reserve to allocate this skb,
+ * this might cause drops if packet is looped back :
+ * Other socket might not have SOCK_MEMALLOC.
+ * Packets not looped back do not care about pfmemalloc.
+ */
+ skb->pfmemalloc = 0;
+
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -975,6 +980,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
@@ -2289,8 +2296,6 @@ bool tcp_schedule_loss_probe(struct sock *sk)
u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
- if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
- return false;
/* No consecutive loss probes. */
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
tcp_rearm_rto(sk);
@@ -2309,8 +2314,9 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !tp->packets_out ||
- !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
+ !tp->packets_out || !tcp_is_sack(tp) ||
+ icsk->icsk_ca_state != TCP_CA_Open)
return false;
if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
@@ -2776,6 +2782,13 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
+ /* Update global and local TCP statistics. */
+ segs = tcp_skb_pcount(skb);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
+
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2793,14 +2806,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
}
if (likely(!err)) {
- segs = tcp_skb_pcount(skb);
-
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- /* Update global TCP statistics. */
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans += segs;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
return err;
}
@@ -2823,8 +2831,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- } else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2833,36 +2839,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
return err;
}
-/* Check if we forward retransmits are possible in the current
- * window/congestion state.
- */
-static bool tcp_can_forward_retransmit(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
-
- /* Forward retransmissions are possible only during Recovery. */
- if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return false;
-
- /* No forward retransmissions in Reno are possible. */
- if (tcp_is_reno(tp))
- return false;
-
- /* Yeah, we have to make difficult choice between forward transmission
- * and retransmission... Both ways have their merits...
- *
- * For now we do not retransmit anything, while we have some new
- * segments to send. In the other cases, follow rule 3 for
- * NextSeg() specified in RFC3517.
- */
-
- if (tcp_may_send_now(sk))
- return false;
-
- return true;
-}
-
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
@@ -2877,24 +2853,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 max_segs, last_lost;
+ u32 max_segs;
int mib_idx;
- int fwd_rexmitting = 0;
if (!tp->packets_out)
return;
- if (!tp->lost_out)
- tp->retransmit_high = tp->snd_una;
-
if (tp->retransmit_skb_hint) {
skb = tp->retransmit_skb_hint;
- last_lost = TCP_SKB_CB(skb)->end_seq;
- if (after(last_lost, tp->retransmit_high))
- last_lost = tp->retransmit_high;
} else {
skb = tcp_write_queue_head(sk);
- last_lost = tp->snd_una;
}
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
@@ -2917,31 +2885,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
segs = min_t(int, segs, max_segs);
- if (fwd_rexmitting) {
-begin_fwd:
- if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
- break;
- mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
-
- } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
- tp->retransmit_high = last_lost;
- if (!tcp_can_forward_retransmit(sk))
- break;
- /* Backtrack if necessary to non-L'ed skb */
- if (hole) {
- skb = hole;
- hole = NULL;
- }
- fwd_rexmitting = 1;
- goto begin_fwd;
-
+ if (tp->retrans_out >= tp->lost_out) {
+ break;
} else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
} else {
- last_lost = TCP_SKB_CB(skb)->end_seq;
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
@@ -2962,7 +2913,8 @@ begin_fwd:
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk))
+ if (skb == tcp_write_queue_head(sk) &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
@@ -3119,7 +3071,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
- u16 user_mss;
int mss;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
@@ -3149,10 +3100,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
}
skb_dst_set(skb, dst);
- mss = dst_metric_advmss(dst);
- user_mss = READ_ONCE(tp->rx_opt.user_mss);
- if (user_mss && user_mss < mss)
- mss = user_mss;
+ mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
@@ -3258,9 +3206,7 @@ static void tcp_connect_init(struct sock *sk)
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric_advmss(dst);
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
- tp->advmss = tp->rx_opt.user_mss;
+ tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3326,31 +3272,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0;
- unsigned long last_syn_loss = 0;
+ int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
- tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
- &syn_loss, &last_syn_loss);
- /* Recurring FO SYN losses: revert to regular handshake temporarily */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- fo->cookie.len = -1;
- goto fallback;
- }
-
- if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
- fo->cookie.len = -1;
- else if (fo->cookie.len <= 0)
+ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
- if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e36df4fcfeba..4ecb38ae8504 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,9 +1,32 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+}
+
+static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
+ const struct skb_mstamp *t2,
+ u32 seq1, u32 seq2)
+{
+ return skb_mstamp_after(t1, t2) ||
+ (t1->v64 == t2->v64 && after(seq1, seq2));
+}
+
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
@@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
*/
-int tcp_rack_mark_lost(struct sock *sk)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+ u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- u32 reo_wnd, prior_retrans = tp->retrans_out;
-
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
- return 0;
-
- /* Reset the advanced flag to avoid unnecessary queue scanning */
- tp->rack.advanced = 0;
+ u32 reo_wnd;
+ *reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
- *
- * TODO: measure and adapt to the observed reordering delay, and
- * use a timer to retransmit like the delayed early retransmit.
*/
reo_wnd = 1000;
- if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
@@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk)
scb->sacked & TCPCB_SACKED_ACKED)
continue;
- if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+ if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp,
+ tp->rack.end_seq, scb->end_seq)) {
+ /* Step 3 in draft-cheng-tcpm-rack-00.txt:
+ * A packet is lost if its elapsed time is beyond
+ * the recent RTT plus the reordering window.
+ */
+ u32 elapsed = skb_mstamp_us_delta(now,
+ &skb->skb_mstamp);
+ s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
- if (skb_mstamp_us_delta(&tp->rack.mstamp,
- &skb->skb_mstamp) <= reo_wnd)
+ if (remaining < 0) {
+ tcp_rack_mark_skb_lost(sk, skb);
continue;
-
- /* skb is lost if packet sent later is sacked */
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- if (scb->sacked & TCPCB_SACKED_RETRANS) {
- scb->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
}
+
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* Record maximum wait time (+1 to avoid 0) */
+ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
@@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk)
break;
}
}
- return prior_retrans - tp->retrans_out;
}
-/* Record the most recently (re)sent time among the (s)acked packets */
-void tcp_rack_advance(struct tcp_sock *tp,
- const struct skb_mstamp *xmit_time, u8 sacked)
+void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout;
+
+ if (!tp->rack.advanced)
+ return;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+ tcp_rack_detect_loss(sk, now, &timeout);
+ if (timeout) {
+ timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+ timeout, inet_csk(sk)->icsk_rto);
+ }
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
+ const struct skb_mstamp *xmit_time,
+ const struct skb_mstamp *ack_time)
{
+ u32 rtt_us;
+
if (tp->rack.mstamp.v64 &&
- !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp,
+ end_seq, tp->rack.end_seq))
return;
+ rtt_us = skb_mstamp_us_delta(ack_time, xmit_time);
if (sacked & TCPCB_RETRANS) {
- struct skb_mstamp now;
-
/* If the sacked packet was retransmitted, it's ambiguous
* whether the retransmission or the original (or the prior
* retransmission) was sacked.
@@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp,
* so it's at least one RTT (i.e., retransmission is at least
* an RTT later).
*/
- skb_mstamp_get(&now);
- if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ if (rtt_us < tcp_min_rtt(tp))
return;
}
-
+ tp->rack.rtt_us = rtt_us;
tp->rack.mstamp = *xmit_time;
+ tp->rack.end_seq = end_seq;
tp->rack.advanced = 1;
}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ u32 timeout, prior_inflight;
+
+ skb_mstamp_get(&now);
+ prior_inflight = tcp_packets_in_flight(tp);
+ tcp_rack_detect_loss(sk, &now, &timeout);
+ if (prior_inflight != tcp_packets_in_flight(tp)) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+ tcp_enter_recovery(sk, false);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_cwnd_reduction(sk, 1, 0);
+ }
+ tcp_xmit_retransmit_queue(sk);
+ }
+ if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+ tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3705075f42c3..40d893556e67 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk)
event = icsk->icsk_pending;
switch (event) {
- case ICSK_TIME_EARLY_RETRANS:
- tcp_resume_early_retransmit(sk);
+ case ICSK_TIME_REO_TIMEOUT:
+ tcp_rack_reo_timeout(sk);
break;
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
@@ -617,6 +617,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
else if (!val)
inet_csk_delete_keepalive_timer(sk);
}
+EXPORT_SYMBOL_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer (unsigned long data)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8aab7d78d25b..ea6e4cff9faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -134,14 +134,21 @@ EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+/* IPCB reference means this can not be used from early demux */
+static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (!net->ipv4.sysctl_udp_l3mdev_accept &&
+ skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+ return true;
+#endif
+ return false;
+}
+
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
- unsigned int log)
+ struct sock *sk, unsigned int log)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -153,13 +160,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- if (!bitmap)
- return 1;
- __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ if (!bitmap)
+ return 0;
+ } else {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
+ bitmap);
+ }
}
}
return 0;
@@ -171,10 +183,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
*/
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+ struct sock *sk)
{
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
@@ -188,11 +197,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- (!sk2->sk_reuseport || !sk->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2, true)) {
- res = 1;
+ inet_rcv_saddr_equal(sk, sk2, true)) {
+ if (sk2->sk_reuseport && sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(uid, sock_i_uid(sk2))) {
+ res = 0;
+ } else {
+ res = 1;
+ }
break;
}
}
@@ -200,10 +212,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
return res;
}
-static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
- int (*saddr_same)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard))
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
struct net *net = sock_net(sk);
kuid_t uid = sock_i_uid(sk);
@@ -217,7 +226,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
- (*saddr_same)(sk, sk2, false)) {
+ inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2);
}
}
@@ -233,14 +242,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
*
* @sk: socket struct in question
* @snum: port number to look up
- * @saddr_comp: AF-dependent comparison of bound local IP addresses
* @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
* with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2,
- bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -269,7 +274,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
- saddr_comp, udptable->log);
+ udptable->log);
snum = first;
/*
@@ -285,6 +290,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
+ cond_resched();
} while (++first != last);
goto fail;
} else {
@@ -301,12 +307,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (hslot->count < hslot2->count)
goto scan_primary_hash;
- exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
if (!exist && (hash2_nulladdr != slot2)) {
hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
exist = udp_lib_lport_inuse2(net, snum, hslot2,
- sk, saddr_comp);
+ sk);
}
if (exist)
goto fail_unlock;
@@ -314,8 +319,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
goto found;
}
scan_primary_hash:
- if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
- saddr_comp, 0))
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
goto fail_unlock;
}
found:
@@ -324,7 +328,7 @@ found:
udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
if (sk->sk_reuseport &&
- udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+ udp_reuseport_add_sock(sk, hslot)) {
inet_sk(sk)->inet_num = 0;
udp_sk(sk)->udp_port_hash = 0;
udp_sk(sk)->udp_portaddr_hash ^= snum;
@@ -356,24 +360,6 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * 0.0.0.0 only equals to 0.0.0.0
- */
-int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
- bool match_wildcard)
-{
- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
-
- if (!ipv6_only_sock(sk2)) {
- if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
- return 1;
- if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
- return match_wildcard;
- }
- return 0;
-}
-
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
unsigned int port)
{
@@ -389,12 +375,13 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
- return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+ return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif)
+ __be32 daddr, unsigned short hnum, int dif,
+ bool exact_dif)
{
int score;
struct inet_sock *inet;
@@ -425,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if || exact_dif) {
if (sk->sk_bound_dev_if != dif)
return -1;
score += 4;
@@ -450,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif,
+ __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
@@ -462,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -497,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ bool exact_dif = udp_lib_exact_dif_match(net, skb);
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
@@ -509,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
@@ -524,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, skb);
+ exact_dif, hslot2, skb);
}
return result;
}
@@ -533,7 +521,7 @@ begin:
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif);
+ daddr, hnum, dif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
@@ -1113,7 +1101,8 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->dst);
+ if (msg->msg_flags & MSG_PROBE)
+ dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 542074c00c78..d6660a8c0ea5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -90,11 +90,3 @@ void __init xfrm4_state_init(void)
{
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
-
-#if 0
-void __exit xfrm4_state_fini(void)
-{
- xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
-}
-#endif /* 0 */
-