summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c17
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/esp4.c9
-rw-r--r--net/ipv4/esp4_offload.c15
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/fou.c75
-rw-r--r--net/ipv4/gre_demux.c9
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_diag.c4
-rw-r--r--net/ipv4/inet_hashtables.c117
-rw-r--r--net/ipv4/ip_forward.c8
-rw-r--r--net/ipv4/ip_fragment.c25
-rw-r--r--net/ipv4/ip_gre.c56
-rw-r--r--net/ipv4/ip_input.c77
-rw-r--r--net/ipv4/ip_output.c42
-rw-r--r--net/ipv4/ip_tunnel_core.c3
-rw-r--r--net/ipv4/ipconfig.c21
-rw-r--r--net/ipv4/ipip.c14
-rw-r--r--net/ipv4/ipmr.c19
-rw-r--r--net/ipv4/metrics.c26
-rw-r--r--net/ipv4/netfilter/Kconfig5
-rw-r--r--net/ipv4/netfilter/Makefile5
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c184
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c7
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c43
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c38
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c150
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c83
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c4
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c33
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c17
-rw-r--r--net/ipv4/tcp_bbr.c15
-rw-r--r--net/ipv4/tcp_bpf.c32
-rw-r--r--net/ipv4/tcp_input.c94
-rw-r--r--net/ipv4/tcp_ipv4.c132
-rw-r--r--net/ipv4/tcp_offload.c6
-rw-r--r--net/ipv4/tcp_output.c79
-rw-r--r--net/ipv4/tcp_timer.c20
-rw-r--r--net/ipv4/tunnel4.c18
-rw-r--r--net/ipv4/udp.c266
-rw-r--r--net/ipv4/udp_impl.h2
-rw-r--r--net/ipv4/udp_offload.c122
-rw-r--r--net/ipv4/udp_tunnel.c18
-rw-r--r--net/ipv4/udplite.c4
-rw-r--r--net/ipv4/xfrm4_protocol.c18
52 files changed, 1142 insertions, 843 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1fbe2f815474..0dfb72c46671 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
+ sk->sk_max_ack_backlog = backlog;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- sk->sk_max_ack_backlog = backlog;
err = 0;
out:
@@ -1385,6 +1385,10 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
+ struct sk_buff *));
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
const struct net_offload *ops;
@@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
@@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return -EINVAL;
}
+INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
__be16 newlen = htons(skb->len - nhoff);
@@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
* because any hdr with option will have been flushed in
* inet_gro_receive().
*/
- err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+ err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
+ tcp4_gro_complete, udp4_gro_complete,
+ skb, nhoff + sizeof(*iph));
out_unlock:
rcu_read_unlock();
@@ -1964,6 +1973,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
ping_init();
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a34602ae27de..04ba321ae5ce 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -952,17 +952,18 @@ static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ipv4_is_zeronet(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
rc = 0;
else {
__u32 haddr = ntohl(addr);
-
if (IN_CLASSA(haddr))
rc = 8;
else if (IN_CLASSB(haddr))
rc = 16;
else if (IN_CLASSC(haddr))
rc = 24;
+ else if (IN_CLASSE(haddr))
+ rc = 32;
}
return rc;
@@ -1100,7 +1101,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
inet_del_ifa(in_dev, ifap, 1);
break;
}
- ret = dev_change_flags(dev, ifr->ifr_flags);
+ ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
break;
case SIOCSIFADDR: /* Set interface address (and family) */
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 9e1c840596c5..5459f41fc26f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)
void *tmp;
struct xfrm_state *x;
- if (xo && (xo->flags & XFRM_DEV_RESUME))
- x = skb->sp->xvec[skb->sp->len - 1];
- else
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
x = skb_dst(skb)->xfrm;
+ }
tmp = ESP_SKB_CB(skb)->tmp;
esp_ssg_unref(x, tmp);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 58834a10c0be..8756e0e790d2 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -46,11 +46,12 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
xo = xfrm_offload(skb);
if (!xo || !(xo->flags & CRYPTO_DONE)) {
- err = secpath_set(skb);
- if (err)
+ struct sec_path *sp = secpath_set(skb);
+
+ if (!sp)
goto out;
- if (skb->sp->len == XFRM_MAX_DEPTH)
+ if (sp->len == XFRM_MAX_DEPTH)
goto out;
x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
@@ -59,8 +60,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
if (!x)
goto out;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
xo = xfrm_offload(skb);
if (!xo) {
@@ -114,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
struct crypto_aead *aead;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return ERR_PTR(-EINVAL);
@@ -121,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
return ERR_PTR(-EINVAL);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
if (!fi)
goto failure;
fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
- cfg->fc_mx_len);
+ cfg->fc_mx_len, extack);
if (unlikely(IS_ERR(fi->fib_metrics))) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 500a59906b87..0c9f171fb085 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -3,6 +3,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -1003,15 +1004,89 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t optlen;
+ int ret;
+
+ if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encasulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
+ .err_handler = gue_err,
};
static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
+ .err_handler = gue_err,
};
static int ip_tunnel_encap_add_fou_ops(void)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ drop:
return NET_RX_DROP;
}
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
if (ver >= GREPROTO_MAX)
- return;
+ return -EINVAL;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
if (proto && proto->err_handler)
proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
rcu_read_unlock();
+
+ return err;
}
static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ error:
goto drop;
}
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
*/
if (icmph->type != ICMP_ECHOREPLY) {
ping_err(skb, offset, info);
- return;
+ return 0;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
}
/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 15e7f7915a21..6ea523d71947 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ int l3mdev;
+ l3mdev = inet_sk_bound_l3mdev(sk);
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +221,8 @@ other_parity_scan:
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, false, false))
goto success;
goto next_port;
@@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
kuid_t uid = sock_i_uid(sk);
+ int l3mdev;
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
@@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
@@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4e5bc4b2f14e..1a4e9ff02762 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -998,7 +998,9 @@ next_chunk:
if (!inet_diag_bc_sk(bc, sk))
goto next_normal;
- sock_hold(sk);
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
num_arr[accum] = num;
sk_arr[accum] = sk;
if (++accum == SKARR_SZ)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 411dd7a90046..942265d65eb3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
+ int l3mdev;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
@@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
return -ENOENT;
}
if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket's icsk_bind_hash is the same
@@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
- tb->port == port)
+ tb->l3mdev == l3mdev && tb->port == port)
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
- sock_net(sk), head, port);
+ sock_net(sk), head, port,
+ l3mdev);
if (!tb) {
spin_unlock(&head->lock);
return -ENOMEM;
@@ -228,26 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
const int dif, const int sdif, bool exact_dif)
{
int score = -1;
- struct inet_sock *inet = inet_sk(sk);
- if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
+ if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
!ipv6_only_sock(sk)) {
- __be32 rcv_saddr = inet->inet_rcv_saddr;
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
+
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+
score = sk->sk_family == PF_INET ? 2 : 1;
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- return -1;
- score += 4;
- }
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
@@ -303,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- bool exact_dif = inet_exact_dif_match(net, skb);
struct inet_listen_hashbucket *ilb2;
- struct sock *sk, *result = NULL;
- int score, hiscore = 0;
+ struct sock *result = NULL;
unsigned int hash2;
- u32 phash = 0;
-
- if (ilb->count <= 10 || !hashinfo->lhash2)
- goto port_lookup;
-
- /* Too many sk in the ilb bucket (which is hashed by port alone).
- * Try lhash2 (which is hashed by port and addr) instead.
- */
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
@@ -331,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net,
goto done;
/* Lookup lhash2 with INADDR_ANY */
-
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
+ saddr, sport, htonl(INADDR_ANY), hnum,
dif, sdif);
- goto done;
-
-port_lookup:
- sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
- if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- goto done;
- }
- result = sk;
- hiscore = score;
- }
- }
done:
if (unlikely(IS_ERR(result)))
return NULL;
@@ -675,6 +635,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
u32 remaining, offset;
int ret, i, low, high;
static u32 hint;
+ int l3mdev;
if (port) {
head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -693,6 +654,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return ret;
}
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
@@ -719,7 +682,8 @@ other_parity_scan:
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
+ tb->port == port) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
@@ -732,7 +696,7 @@ other_parity_scan:
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
@@ -798,13 +762,22 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+static void init_hashinfo_lhash2(struct inet_hashinfo *h)
+{
+ int i;
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_HEAD(&h->lhash2[i].head);
+ h->lhash2[i].count = 0;
+ }
+}
+
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
unsigned long high_limit)
{
- unsigned int i;
-
h->lhash2 = alloc_large_system_hash(name,
sizeof(*h->lhash2),
numentries,
@@ -814,13 +787,23 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
&h->lhash2_mask,
low_limit,
high_limit);
+ init_hashinfo_lhash2(h);
+}
- for (i = 0; i <= h->lhash2_mask; i++) {
- spin_lock_init(&h->lhash2[i].lock);
- INIT_HLIST_HEAD(&h->lhash2[i].head);
- h->lhash2[i].count = 0;
- }
+int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
+{
+ h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
+ if (!h->lhash2)
+ return -ENOMEM;
+
+ h->lhash2_mask = INET_LHTABLE_SIZE - 1;
+ /* INET_LHTABLE_SIZE must be a power of 2 */
+ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
+
+ init_hashinfo_lhash2(h);
+ return 0;
}
+EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 32662e9e5d21..00ec819f949b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -69,9 +69,17 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+
if (unlikely(opt->optlen))
ip_forward_options(skb);
+ skb->tstamp = 0;
return dst_output(net, sk, skb);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d6ee343fdb86..867be8f7f1fa 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -346,10 +346,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct rb_node **rbn, *parent;
struct sk_buff *skb1, *prev_tail;
+ int ihl, end, skb1_run_end;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
- int ihl, end;
int err = -ENOENT;
u8 ecn;
@@ -419,7 +419,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
- * We do the same here for IPv4 (and increment an snmp counter).
+ * We do the same here for IPv4 (and increment an snmp counter) but
+ * we do not want to drop the whole queue in response to a duplicate
+ * fragment.
*/
err = -EINVAL;
@@ -444,13 +446,17 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
+ skb1_run_end = skb1->ip_defrag_offset +
+ FRAG_CB(skb1)->frag_run_len;
if (end <= skb1->ip_defrag_offset)
rbn = &parent->rb_left;
- else if (offset >= skb1->ip_defrag_offset +
- FRAG_CB(skb1)->frag_run_len)
+ else if (offset >= skb1_run_end)
rbn = &parent->rb_right;
- else /* Found an overlap with skb1. */
- goto overlap;
+ else if (offset >= skb1->ip_defrag_offset &&
+ end <= skb1_run_end)
+ goto err; /* No new data, potential duplicate */
+ else
+ goto overlap; /* Found an overlap */
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
@@ -515,6 +521,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct rb_node *rbn;
int len;
int ihlen;
+ int delta;
int err;
u8 ecn;
@@ -556,10 +563,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
if (len > 65535)
goto out_oversize;
+ delta = - head->truesize;
+
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(qp->q.net, delta);
+
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..c7a7bd58a23c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
-static void ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
unsigned int data_len = 0;
struct ip_tunnel *t;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return -ENOENT;
+
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return 0;
data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
break;
}
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else if (tpi->proto == htons(ETH_P_ERSPAN) ||
- tpi->proto == htons(ETH_P_ERSPAN2))
- itn = net_generic(net, erspan_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
- iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
- iph->daddr, iph->saddr, tpi->key);
-
- if (!t)
- return;
-
#if IS_ENABLED(CONFIG_IPV6)
if (tpi->proto == htons(ETH_P_IPV6) &&
!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
type, data_len))
- return;
+ return 0;
#endif
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return 0;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static void gre_err(struct sk_buff *skb, u32 info)
@@ -1339,12 +1341,6 @@ static void ipgre_tap_setup(struct net_device *dev)
ip_tunnel_setup(dev, gre_tap_net_id);
}
-bool is_gretap_dev(const struct net_device *dev)
-{
- return dev->netdev_ops == &gre_tap_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(is_gretap_dev);
-
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@@ -1601,7 +1597,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
memset(&tb, 0, sizeof(tb));
dev = rtnl_create_link(net, name, name_assign_type,
- &ipgre_tap_ops, tb);
+ &ipgre_tap_ops, tb, NULL);
if (IS_ERR(dev))
return dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 35a786c0aaa0..26921f6b3b92 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol;
- const struct net_protocol *ipprot;
- int raw;
+ const struct net_protocol *ipprot;
+ int raw, ret;
- resubmit:
- raw = raw_local_deliver(skb, protocol);
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ kfree_skb(skb);
} else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
@@ -547,7 +546,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
list_for_each_entry_safe(skb, next, head, list) {
struct dst_entry *dst;
- list_del(&skb->list);
+ skb_list_del_init(skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
@@ -594,7 +593,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- list_del(&skb->list);
+ skb_list_del_init(skb);
skb = ip_rcv_core(skb, net);
if (skb == NULL)
continue;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..c80188875f39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
@@ -867,6 +868,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
@@ -880,8 +882,8 @@ static int __ip_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
+ bool paged, extra_uref;
u32 tskey = 0;
- bool paged;
skb = skb_peek_tail(queue);
@@ -916,6 +918,20 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = true;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+
cork->length += length;
/* So, what's going on in the loop below?
@@ -939,7 +955,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
- unsigned int pagedlen = 0;
+ unsigned int pagedlen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -956,6 +972,7 @@ alloc_new_skb:
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
@@ -1000,12 +1017,6 @@ alloc_new_skb:
skb->csum = 0;
skb_reserve(skb, hh_len);
- /* only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes.
*/
@@ -1038,6 +1049,13 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1067,7 +1085,7 @@ alloc_new_skb:
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1097,6 +1115,10 @@ alloc_new_skb:
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1109,6 +1131,8 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ if (uarg)
+ sock_zerocopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c248e0dccbe1..9a0e67b52a4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
}
skb_clear_hash_if_not_l4(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb_set_queue_mapping(skb, 0);
skb_scrub_packet(skb, xnet);
@@ -151,6 +151,7 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
sizeof(struct in6_addr));
else
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ dst->key.tun_flags = src->key.tun_flags;
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
return res;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 88212615bf4c..b9a9873c25c6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -220,7 +220,7 @@ static int __init ic_open_devs(void)
for_each_netdev(&init_net, dev) {
if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
continue;
- if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
pr_err("IP-Config: Failed to open %s\n", dev->name);
}
@@ -238,7 +238,7 @@ static int __init ic_open_devs(void)
if (ic_proto_enabled && !able)
continue;
oflags = dev->flags;
- if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
pr_err("IP-Config: Failed to open %s\n",
dev->name);
continue;
@@ -315,7 +315,7 @@ static void __init ic_close_devs(void)
dev = d->dev;
if (d != ic_dev && !netdev_uses_dsa(dev)) {
pr_debug("IP-Config: Downing %s\n", dev->name);
- dev_change_flags(dev, d->flags);
+ dev_change_flags(dev, d->flags, NULL);
}
kfree(d);
}
@@ -429,6 +429,8 @@ static int __init ic_defaults(void)
ic_netmask = htonl(IN_CLASSB_NET);
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
+ else if (IN_CLASSE(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSE_NET);
else {
pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
&ic_myaddr);
@@ -1361,18 +1363,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)
}
return 0;
}
-
-static int ntp_servers_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ntp_servers_seq_show, NULL);
-}
-
-static const struct file_operations ntp_servers_seq_fops = {
- .open = ntp_servers_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq);
#endif /* CONFIG_PROC_FS */
/*
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct ip_tunnel *t;
int err = 0;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
+ }
+
switch (type) {
case ICMP_DEST_UNREACH:
switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
goto out;
}
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (!t) {
- err = -ENOENT;
- goto out;
- }
-
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a6defbec4f1b..ddbf8c9a1abb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -69,6 +69,8 @@
#include <net/nexthop.h>
#include <net/switchdev.h>
+#include <linux/nospec.h>
+
struct ipmr_rule {
struct fib_rule common;
};
@@ -506,7 +508,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
dev->flags |= IFF_MULTICAST;
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
}
@@ -589,7 +591,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -1612,6 +1614,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
@@ -1686,6 +1689,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
@@ -1802,7 +1806,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
struct vif_device *out_vif = &mrt->vif_table[out_vifi];
struct vif_device *in_vif = &mrt->vif_table[in_vifi];
- if (!skb->offload_mr_fwd_mark)
+ if (!skb->offload_l3_fwd_mark)
return false;
if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
return false;
@@ -1820,8 +1824,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- int in_vifi, struct sk_buff *skb,
- struct mfc_cache *c, int vifi)
+ int in_vifi, struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
@@ -2027,7 +2030,7 @@ forward:
if (skb2)
ipmr_queue_xmit(net, mrt, true_vifi,
- skb2, c, psend);
+ skb2, psend);
}
psend = ct;
}
@@ -2039,9 +2042,9 @@ last_forward:
if (skb2)
ipmr_queue_xmit(net, mrt, true_vifi, skb2,
- c, psend);
+ psend);
} else {
- ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
+ ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
return;
}
}
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
#include <net/tcp.h>
static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len, u32 *metrics)
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
if (!type)
continue;
- if (type > RTAX_MAX)
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
return -EINVAL;
+ }
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
+ }
} else {
- if (nla_len(nla) != sizeof(u32))
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
return -EINVAL;
+ }
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
val = 65535 - 15;
if (type == RTAX_HOPLIMIT && val > 255)
val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
return -EINVAL;
+ }
metrics[type - 1] = val;
}
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
}
struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
- int fc_mx_len)
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
{
struct dst_metrics *fib_metrics;
int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
if (unlikely(!fib_metrics))
return ERR_PTR(-ENOMEM);
- err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
if (!err) {
refcount_set(&fib_metrics->refcnt, 1);
} else {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 184bf2e0a1ed..80f72cc5ca8d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -156,15 +156,10 @@ config NF_NAT_SNMP_BASIC
To compile it as a module, choose M here. If unsure, say N.
-config NF_NAT_PROTO_GRE
- tristate
- depends on NF_CT_PROTO_GRE
-
config NF_NAT_PPTP
tristate
depends on NF_CONNTRACK
default NF_CONNTRACK_PPTP
- select NF_NAT_PROTO_GRE
config NF_NAT_H323
tristate
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 367993adf4d3..fd7122e0e2c9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,7 +3,7 @@
# Makefile for the netfilter modules on top of IPv4.
#
-nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o
nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
@@ -28,9 +28,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2c8d313ae216..b61977db9b7f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -56,18 +56,15 @@ struct clusterip_config {
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
- struct rcu_head rcu;
-
+ struct rcu_head rcu; /* for call_rcu_bh */
+ struct net *net; /* netns for pernet list */
char ifname[IFNAMSIZ]; /* device ifname */
- struct notifier_block notifier; /* refresh c->ifindex in it */
};
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
#endif
-static unsigned int clusterip_net_id __read_mostly;
-
struct clusterip_net {
struct list_head configs;
/* lock protects the configs list */
@@ -75,51 +72,66 @@ struct clusterip_net {
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *procdir;
+ /* mutex protects the config->pde*/
+ struct mutex mutex;
#endif
};
+static unsigned int clusterip_net_id __read_mostly;
+static inline struct clusterip_net *clusterip_pernet(struct net *net)
+{
+ return net_generic(net, clusterip_net_id);
+}
+
static inline void
clusterip_config_get(struct clusterip_config *c)
{
refcount_inc(&c->refcount);
}
-
static void clusterip_config_rcu_free(struct rcu_head *head)
{
- kfree(container_of(head, struct clusterip_config, rcu));
+ struct clusterip_config *config;
+ struct net_device *dev;
+
+ config = container_of(head, struct clusterip_config, rcu);
+ dev = dev_get_by_name(config->net, config->ifname);
+ if (dev) {
+ dev_mc_del(dev, config->clustermac);
+ dev_put(dev);
+ }
+ kfree(config);
}
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (refcount_dec_and_test(&c->refcount))
- call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
+ call_rcu(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
* entry(rule) is removed, remove the config from lists, but don't free it
* yet, since proc-files could still be holding references */
static inline void
-clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
+clusterip_config_entry_put(struct clusterip_config *c)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(c->net);
local_bh_disable();
if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&cn->lock);
+ local_bh_enable();
/* In case anyone still accesses the file, the open/close
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
+ mutex_lock(&cn->mutex);
if (cn->procdir)
proc_remove(c->pde);
+ mutex_unlock(&cn->mutex);
#endif
- list_del_rcu(&c->list);
- spin_unlock(&cn->lock);
- local_bh_enable();
-
- unregister_netdevice_notifier(&c->notifier);
-
return;
}
local_bh_enable();
@@ -129,7 +141,7 @@ static struct clusterip_config *
__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
@@ -181,32 +193,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct clusterip_net *cn = clusterip_pernet(net);
struct clusterip_config *c;
- c = container_of(this, struct clusterip_config, notifier);
- switch (event) {
- case NETDEV_REGISTER:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- }
- break;
- case NETDEV_UNREGISTER:
- if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
- }
- break;
- case NETDEV_CHANGENAME:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- } else if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
+ spin_lock_bh(&cn->lock);
+ list_for_each_entry_rcu(c, &cn->configs, list) {
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!strcmp(dev->name, c->ifname)) {
+ c->ifindex = dev->ifindex;
+ dev_mc_add(dev, c->clustermac);
+ }
+ break;
+ case NETDEV_UNREGISTER:
+ if (dev->ifindex == c->ifindex) {
+ dev_mc_del(dev, c->clustermac);
+ c->ifindex = -1;
+ }
+ break;
+ case NETDEV_CHANGENAME:
+ if (!strcmp(dev->name, c->ifname)) {
+ c->ifindex = dev->ifindex;
+ dev_mc_add(dev, c->clustermac);
+ } else if (dev->ifindex == c->ifindex) {
+ dev_mc_del(dev, c->clustermac);
+ c->ifindex = -1;
+ }
+ break;
}
- break;
}
+ spin_unlock_bh(&cn->lock);
return NOTIFY_DONE;
}
@@ -215,30 +232,44 @@ static struct clusterip_config *
clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
__be32 ip, const char *iniface)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
struct clusterip_config *c;
+ struct net_device *dev;
int err;
+ if (iniface[0] == '\0') {
+ pr_info("Please specify an interface name\n");
+ return ERR_PTR(-EINVAL);
+ }
+
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
return ERR_PTR(-ENOMEM);
- strcpy(c->ifname, iniface);
- c->ifindex = -1;
- c->clusterip = ip;
+ dev = dev_get_by_name(net, iniface);
+ if (!dev) {
+ pr_info("no such interface %s\n", iniface);
+ kfree(c);
+ return ERR_PTR(-ENOENT);
+ }
+ c->ifindex = dev->ifindex;
+ strcpy(c->ifname, dev->name);
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+ dev_mc_add(dev, c->clustermac);
+ dev_put(dev);
+
+ c->clusterip = ip;
c->num_total_nodes = i->num_total_nodes;
clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
+ c->net = net;
refcount_set(&c->refcount, 1);
spin_lock_bh(&cn->lock);
if (__clusterip_config_find(net, ip)) {
- spin_unlock_bh(&cn->lock);
- kfree(c);
-
- return ERR_PTR(-EBUSY);
+ err = -EBUSY;
+ goto out_config_put;
}
list_add_rcu(&c->list, &cn->configs);
@@ -250,9 +281,11 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
+ mutex_lock(&cn->mutex);
c->pde = proc_create_data(buffer, 0600,
cn->procdir,
&clusterip_proc_fops, c);
+ mutex_unlock(&cn->mutex);
if (!c->pde) {
err = -ENOMEM;
goto err;
@@ -260,22 +293,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
}
#endif
- c->notifier.notifier_call = clusterip_netdev_event;
- err = register_netdevice_notifier(&c->notifier);
- if (!err) {
- refcount_set(&c->entries, 1);
- return c;
- }
+ refcount_set(&c->entries, 1);
+ return c;
#ifdef CONFIG_PROC_FS
- proc_remove(c->pde);
err:
#endif
spin_lock_bh(&cn->lock);
list_del_rcu(&c->list);
+out_config_put:
spin_unlock_bh(&cn->lock);
clusterip_config_put(c);
-
return ERR_PTR(err);
}
@@ -475,34 +503,20 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
&e->ip.dst.s_addr);
return -EINVAL;
} else {
- struct net_device *dev;
-
- if (e->ip.iniface[0] == '\0') {
- pr_info("Please specify an interface name\n");
- return -EINVAL;
- }
-
- dev = dev_get_by_name(par->net, e->ip.iniface);
- if (!dev) {
- pr_info("no such interface %s\n",
- e->ip.iniface);
- return -ENOENT;
- }
- dev_put(dev);
-
config = clusterip_config_init(par->net, cipinfo,
e->ip.dst.s_addr,
e->ip.iniface);
if (IS_ERR(config))
return PTR_ERR(config);
}
- }
+ } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN))
+ return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
- clusterip_config_entry_put(par->net, config);
+ clusterip_config_entry_put(config);
clusterip_config_put(config);
return ret;
}
@@ -524,7 +538,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
/* if no more entries are referencing the config, remove it
* from the list and destroy the proc entry */
- clusterip_config_entry_put(par->net, cipinfo->config);
+ clusterip_config_entry_put(cipinfo->config);
clusterip_config_put(cipinfo->config);
@@ -806,7 +820,7 @@ static const struct file_operations clusterip_proc_fops = {
static int clusterip_net_init(struct net *net)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
int ret;
INIT_LIST_HEAD(&cn->configs);
@@ -824,6 +838,7 @@ static int clusterip_net_init(struct net *net)
pr_err("Unable to proc dir entry\n");
return -ENOMEM;
}
+ mutex_init(&cn->mutex);
#endif /* CONFIG_PROC_FS */
return 0;
@@ -831,13 +846,15 @@ static int clusterip_net_init(struct net *net)
static void clusterip_net_exit(struct net *net)
{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ struct clusterip_net *cn = clusterip_pernet(net);
+
#ifdef CONFIG_PROC_FS
+ mutex_lock(&cn->mutex);
proc_remove(cn->procdir);
cn->procdir = NULL;
+ mutex_unlock(&cn->mutex);
#endif
nf_unregister_net_hook(net, &cip_arp_ops);
- WARN_ON_ONCE(!list_empty(&cn->configs));
}
static struct pernet_operations clusterip_net_ops = {
@@ -847,6 +864,10 @@ static struct pernet_operations clusterip_net_ops = {
.size = sizeof(struct clusterip_net),
};
+struct notifier_block cip_netdev_notifier = {
+ .notifier_call = clusterip_netdev_event
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
@@ -859,11 +880,17 @@ static int __init clusterip_tg_init(void)
if (ret < 0)
goto cleanup_subsys;
+ ret = register_netdevice_notifier(&cip_netdev_notifier);
+ if (ret < 0)
+ goto unregister_target;
+
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
return 0;
+unregister_target:
+ xt_unregister_target(&clusterip_tg_reg);
cleanup_subsys:
unregister_pernet_subsys(&clusterip_net_ops);
return ret;
@@ -873,11 +900,12 @@ static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+ unregister_netdevice_notifier(&cip_netdev_notifier);
xt_unregister_target(&clusterip_tg_reg);
unregister_pernet_subsys(&clusterip_net_ops);
- /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
- rcu_barrier_bh();
+ /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */
+ rcu_barrier();
}
module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index ce1512b02cb2..fd3f9e8a74da 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -81,9 +81,12 @@ static int __init masquerade_tg_init(void)
int ret;
ret = xt_register_target(&masquerade_tg_reg);
+ if (ret)
+ return ret;
- if (ret == 0)
- nf_nat_masquerade_ipv4_register_notifier();
+ ret = nf_nat_masquerade_ipv4_register_notifier();
+ if (ret)
+ xt_unregister_target(&masquerade_tg_reg);
return ret;
}
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 78a67f961d86..2687db015b6f 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -62,22 +62,8 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
}
#endif /* CONFIG_XFRM */
-static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range2 *range)
-{
- return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
- ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
-}
-
-static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
- __be16 dport)
-{
- return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
-}
-
static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff,
- const struct nf_nat_l4proto *l4proto,
const struct nf_conntrack_tuple *target,
enum nf_nat_manip_type maniptype)
{
@@ -90,8 +76,8 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
iph = (void *)skb->data + iphdroff;
hdroff = iphdroff + iph->ihl * 4;
- if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
- target, maniptype))
+ if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff,
+ hdroff, target, maniptype))
return false;
iph = (void *)skb->data + iphdroff;
@@ -161,8 +147,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.l3proto = NFPROTO_IPV4,
- .in_range = nf_nat_ipv4_in_range,
- .secure_port = nf_nat_ipv4_secure_port,
.manip_pkt = nf_nat_ipv4_manip_pkt,
.csum_update = nf_nat_ipv4_csum_update,
.csum_recalc = nf_nat_ipv4_csum_recalc,
@@ -186,7 +170,6 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
unsigned int hdrlen = ip_hdrlen(skb);
- const struct nf_nat_l4proto *l4proto;
struct nf_conntrack_tuple target;
unsigned long statusbit;
@@ -217,9 +200,8 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
if (!(ct->status & statusbit))
return 1;
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
- l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ &ct->tuplehash[!dir].tuple, !manip))
return 0;
if (skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -233,8 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
/* Change outer to look like the reply to an incoming packet */
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
- if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
return 0;
return 1;
@@ -391,26 +372,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
static int __init nf_nat_l3proto_ipv4_init(void)
{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
- if (err < 0)
- goto err1;
- err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
- if (err < 0)
- goto err2;
- return err;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
-err1:
- return err;
+ return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
}
static void __exit nf_nat_l3proto_ipv4_exit(void)
{
nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
}
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index a9d5e013e555..41327bb99093 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -147,28 +147,50 @@ static struct notifier_block masq_inet_notifier = {
.notifier_call = masq_inet_event,
};
-static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+static int masq_refcnt;
+static DEFINE_MUTEX(masq_mutex);
-void nf_nat_masquerade_ipv4_register_notifier(void)
+int nf_nat_masquerade_ipv4_register_notifier(void)
{
+ int ret = 0;
+
+ mutex_lock(&masq_mutex);
/* check if the notifier was already set */
- if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
- return;
+ if (++masq_refcnt > 1)
+ goto out_unlock;
/* Register for device down reports */
- register_netdevice_notifier(&masq_dev_notifier);
+ ret = register_netdevice_notifier(&masq_dev_notifier);
+ if (ret)
+ goto err_dec;
/* Register IP address change reports */
- register_inetaddr_notifier(&masq_inet_notifier);
+ ret = register_inetaddr_notifier(&masq_inet_notifier);
+ if (ret)
+ goto err_unregister;
+
+ mutex_unlock(&masq_mutex);
+ return ret;
+
+err_unregister:
+ unregister_netdevice_notifier(&masq_dev_notifier);
+err_dec:
+ masq_refcnt--;
+out_unlock:
+ mutex_unlock(&masq_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
void nf_nat_masquerade_ipv4_unregister_notifier(void)
{
+ mutex_lock(&masq_mutex);
/* check if the notifier still has clients */
- if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
- return;
+ if (--masq_refcnt > 0)
+ goto out_unlock;
unregister_netdevice_notifier(&masq_dev_notifier);
unregister_inetaddr_notifier(&masq_inet_notifier);
+out_unlock:
+ mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 5d259a12e25f..68b4d450391b 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -299,8 +299,6 @@ pptp_inbound_pkt(struct sk_buff *skb,
static int __init nf_nat_helper_pptp_init(void)
{
- nf_nat_need_gre();
-
BUG_ON(nf_nat_pptp_hook_outbound != NULL);
RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
deleted file mode 100644
index 00fda6331ce5..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * nf_nat_proto_gre.c
- *
- * NAT protocol helper module for GRE.
- *
- * GRE is a generic encapsulation protocol, which is generally not very
- * suited for NAT, as it has no protocol-specific part as port numbers.
- *
- * It has an optional key field, which may help us distinguishing two
- * connections between the same two hosts.
- *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
- *
- * PPTP is built on top of a modified version of GRE, and has a mandatory
- * field called "CallID", which serves us for the same purpose as the key
- * field in plain GRE.
- *
- * Documentation about PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-#include <linux/netfilter/nf_conntrack_proto_gre.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
-
-/* generate unique tuple ... */
-static void
-gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t key;
- __be16 *keyptr;
- unsigned int min, i, range_size;
-
- /* If there is no master conntrack we are not PPTP,
- do not change tuples */
- if (!ct->master)
- return;
-
- if (maniptype == NF_NAT_MANIP_SRC)
- keyptr = &tuple->src.u.gre.key;
- else
- keyptr = &tuple->dst.u.gre.key;
-
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
- pr_debug("%p: NATing GRE PPTP\n", ct);
- min = 1;
- range_size = 0xffff;
- } else {
- min = ntohs(range->min_proto.gre.key);
- range_size = ntohs(range->max_proto.gre.key) - min + 1;
- }
-
- pr_debug("min = %u, range_size = %u\n", min, range_size);
-
- for (i = 0; ; ++key) {
- *keyptr = htons(min + key % range_size);
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
-
- pr_debug("%p: no NAT mapping\n", ct);
- return;
-}
-
-/* manipulate a GRE packet according to maniptype */
-static bool
-gre_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct gre_base_hdr *greh;
- struct pptp_gre_header *pgreh;
-
- /* pgreh includes two optional 32bit fields which are not required
- * to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
- return false;
-
- greh = (void *)skb->data + hdroff;
- pgreh = (struct pptp_gre_header *)greh;
-
- /* we only have destination manip of a packet, since 'source key'
- * is not present in the packet itself */
- if (maniptype != NF_NAT_MANIP_DST)
- return true;
-
- switch (greh->flags & GRE_VERSION) {
- case GRE_VERSION_0:
- /* We do not currently NAT any GREv0 packets.
- * Try to behave like "nf_nat_proto_unknown" */
- break;
- case GRE_VERSION_1:
- pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
- pgreh->call_id = tuple->dst.u.gre.key;
- break;
- default:
- pr_debug("can't nat unknown GRE version\n");
- return false;
- }
- return true;
-}
-
-static const struct nf_nat_l4proto gre = {
- .l4proto = IPPROTO_GRE,
- .manip_pkt = gre_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = gre_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_gre_init(void)
-{
- return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
-}
-
-static void __exit nf_nat_proto_gre_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
-}
-
-module_init(nf_nat_proto_gre_init);
-module_exit(nf_nat_proto_gre_fini);
-
-void nf_nat_need_gre(void)
-{
- return;
-}
-EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
deleted file mode 100644
index 6d7cf1d79baf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool
-icmp_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
- ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
-}
-
-static void
-icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
- /* If no range specified... */
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xFFFF;
-
- for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
- (id % range_size));
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
- return;
-}
-
-static bool
-icmp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct icmphdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct icmphdr *)(skb->data + hdroff);
- inet_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, false);
- hdr->un.echo.id = tuple->src.u.icmp.id;
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
- .l4proto = IPPROTO_ICMP,
- .manip_pkt = icmp_manip_pkt,
- .in_range = icmp_in_range,
- .unique_tuple = icmp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 5cd06ba3535d..aa8304c618b8 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -102,6 +102,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
+ struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
struct iphdr *niph;
const struct tcphdr *oth;
@@ -147,10 +148,11 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- if (oldskb->nf_bridge) {
+ br_indev = nf_bridge_get_physindev(oldskb);
+ if (br_indev) {
struct ethhdr *oeth = eth_hdr(oldskb);
- nskb->dev = nf_bridge_get_physindev(oldskb);
+ nskb->dev = br_indev;
niph->tot_len = htons(nskb->len);
ip_send_check(niph);
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f1193e1e928a..6847de1d1db8 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -69,7 +69,9 @@ static int __init nft_masq_ipv4_module_init(void)
if (ret < 0)
return ret;
- nf_nat_masquerade_ipv4_register_notifier();
+ ret = nf_nat_masquerade_ipv4_register_notifier();
+ if (ret)
+ nft_unregister_expr(&nft_masq_ipv4_type);
return ret;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a670..c3610b37bb4c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..92d249e053be 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,6 +29,7 @@
#include <net/protocol.h>
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..c55a5432cf37 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
!(inet->inet_daddr && inet->inet_daddr != raddr) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
@@ -391,7 +390,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc->tsflags);
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
@@ -805,7 +804,7 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
@@ -970,7 +969,7 @@ struct proto raw_prot = {
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
@@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void)
unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c0a9d26c06ce..ce92f73cf104 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev,
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
skb_mac_header(skb),
- dev->hard_header_len, true);
+ dev->hard_header_len, false);
}
}
#endif
@@ -2849,6 +2849,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
err = -rt->dst.error;
} else {
fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ skb->dev = net->loopback_dev;
rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 891ed2f91467..ba0fc4b18465 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_ecn",
.data = &init_net.ipv4.sysctl_tcp_ecn,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9e6bc4d6daa7..27e2f6837062 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1423,7 +1423,7 @@ do_error:
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg);
+ sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
@@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
@@ -2147,7 +2147,7 @@ skip_copy:
sk_eat_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
@@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state)
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
0;
}
@@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
TCP_NLA_PAD);
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
return stats;
}
@@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void)
if (unlikely(!tcp_md5sig_pool_populated)) {
mutex_lock(&tcp_md5sig_mutex);
- if (!tcp_md5sig_pool_populated)
+ if (!tcp_md5sig_pool_populated) {
__tcp_alloc_md5sig_pool();
+ if (tcp_md5sig_pool_populated)
+ static_key_slow_inc(&tcp_md5_needed);
+ }
mutex_unlock(&tcp_md5sig_mutex);
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 9277abdd822a..0f497fc49c3f 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
-/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
static const int bbr_pacing_margin_percent = 1;
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
@@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3b45fe530f91..1bb7321a256d 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -8,6 +8,7 @@
#include <linux/wait.h>
#include <net/inet_common.h>
+#include <net/tls.h>
static bool tcp_bpf_stream_read(const struct sock *sk)
{
@@ -198,7 +199,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
msg->sg.start = i;
msg->sg.size -= apply_bytes;
sk_psock_queue_msg(psock, tmp);
- sk->sk_data_ready(sk);
+ sk_psock_data_ready(sk, psock);
} else {
sk_msg_free(sk, tmp);
kfree(tmp);
@@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
u32 off;
while (1) {
+ bool has_tx_ulp;
+
sge = sk_msg_elem(msg, msg->sg.start);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
@@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
tcp_rate_check_app_limited(sk);
retry:
- ret = do_tcp_sendpages(sk, page, off, size, flags);
+ has_tx_ulp = tls_sw_has_ctx_tx(sk);
+ if (has_tx_ulp) {
+ flags |= MSG_SENDPAGE_NOPOLICY;
+ ret = kernel_sendpage_locked(sk,
+ page, off, size, flags);
+ } else {
+ ret = do_tcp_sendpages(sk, page, off, size, flags);
+ }
+
if (ret <= 0)
return ret;
if (apply)
@@ -289,12 +300,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
{
bool cork = false, enospc = msg->sg.start == msg->sg.end;
struct sock *sk_redir;
- u32 tosend;
+ u32 tosend, delta = 0;
int ret;
more_data:
- if (psock->eval == __SK_NONE)
+ if (psock->eval == __SK_NONE) {
+ /* Track delta in msg size to add/subtract it on SK_DROP from
+ * returned to user copied size. This ensures user doesn't
+ * get a positive return code with msg_cut_data and SK_DROP
+ * verdict.
+ */
+ delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (msg->sg.size < delta)
+ delta -= msg->sg.size;
+ else
+ delta = 0;
+ }
if (msg->cork_bytes &&
msg->cork_bytes > msg->sg.size && !enospc) {
@@ -350,7 +372,7 @@ more_data:
default:
sk_msg_free_partial(sk, msg, tosend);
sk_msg_apply_bytes(psock, tosend);
- *copied -= tosend;
+ *copied -= (tosend + delta);
return -EACCES;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1e37c1388189..76858b14ebe9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
u32 delta_us;
- if (!delta)
- delta = 1;
- delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- tcp_rcv_rtt_update(tp, delta_us, 0);
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ if (!delta)
+ delta = 1;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ tcp_rcv_rtt_update(tp, delta_us, 0);
+ }
}
}
@@ -1863,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 prior_sacked = tp->sacked_out;
+ if (num_dupack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ s32 delivered;
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- if (tp->sacked_out > prior_sacked)
- tp->delivered++; /* Some out-of-order packet is delivered */
- tcp_verify_left_out(tp);
+ tp->sacked_out += num_dupack;
+ tcp_check_reno_reordering(sk, 0);
+ delivered = tp->sacked_out - prior_sacked;
+ if (delivered > 0)
+ tp->delivered += delivered;
+ tcp_verify_left_out(tp);
+ }
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
@@ -2457,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
- !(flag & FLAG_LOST_RETRANS)) {
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
+ FLAG_RETRANS_DATA_ACKED) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
@@ -2634,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2653,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
return;
if (after(tp->snd_nxt, tp->high_seq)) {
- if (flag & FLAG_DATA_SACKED || is_dupack)
+ if (flag & FLAG_DATA_SACKED || num_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
@@ -2679,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
*/
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+ tcp_add_reno_sack(sk, num_dupack);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
@@ -2757,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
- bool is_dupack, int *ack_flag, int *rexmit)
+ int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+ tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
@@ -2810,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
- if (tcp_is_reno(tp) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (tcp_is_reno(tp))
+ tcp_add_reno_sack(sk, num_dupack);
} else {
if (tcp_try_undo_partial(sk, prior_snd_una))
return;
@@ -2826,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_identify_packet_loss(sk, ack_flag);
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack, rexmit);
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
@@ -2837,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
- tcp_add_reno_sack(sk);
+ tcp_add_reno_sack(sk, num_dupack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
@@ -2910,9 +2915,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- seq_rtt_us = ca_rtt_us = delta_us;
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ ca_rtt_us = seq_rtt_us;
+ }
}
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
@@ -3558,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- bool is_dupack = false;
+ int num_dupack = 0;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
@@ -3610,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
@@ -3668,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
+ num_dupack = 1;
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
+ if (!(flag & FLAG_DATA))
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
}
@@ -3687,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
}
@@ -3712,7 +3725,7 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
@@ -4602,13 +4615,12 @@ end:
}
}
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
- bool *fragstolen)
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
- __skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail,
skb, fragstolen)) ? 1 : 0;
@@ -4659,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4719,7 +4731,7 @@ queue_and_out:
goto drop;
}
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -5595,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
+ __skb_pull(skb, tcp_header_len);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..efc6fef692ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_rtx_queue_head(sk);
- BUG_ON(!skb);
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
@@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
* We need to maintain these in the sk structure.
*/
+struct static_key tcp_md5_needed __read_mostly;
+EXPORT_SYMBOL(tcp_md5_needed);
+
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
- const union tcp_md5_addr *addr,
- int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
}
return best_match;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
@@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
- /* Only socket owner can try to collapse/prune rx queues
- * to reduce memory overhead, so add a little headroom here.
- * Few sockets backlog are possibly concurrently non empty.
- */
- limit += 64*1024;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
*/
skb_condense(skb);
+ skb_dst_drop(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+ shinfo = skb_shinfo(skb);
+
+ if (!shinfo->gso_size)
+ shinfo->gso_size = skb->len - hdrlen;
+
+ if (!shinfo->gso_segs)
+ shinfo->gso_segs = 1;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+ tail->decrypted != skb->decrypted ||
+#endif
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ thtail->window = th->window;
+
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+ skb_shinfo(tail)->gso_size);
+
+ gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+ skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64*1024;
+
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
@@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* Default TSQ limit of 16 TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 870b0a335061..0fbf7d4df9da 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -10,6 +10,7 @@
* TCPv4 GSO/GRO support
*/
+#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/protocol.h>
@@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
@@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *
return tcp_gro_receive(head, skb);
}
-static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3f510cad0b3e..730bc44dbad9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- (*rcv_wscale) = 0;
+ *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
- space >>= 1;
- (*rcv_wscale)++;
- }
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_key_false(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -1904,24 +1904,27 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
- bool *is_cwnd_limited, u32 max_segs)
+ bool *is_cwnd_limited,
+ bool *is_rwnd_limited,
+ u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
-
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- goto send_now;
+ s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1968,15 +1971,33 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
+ delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
- /* Ok, it looks like it is advisable to defer. */
+ /* Ok, it looks like it is advisable to defer.
+ * Three cases are tracked :
+ * 1) We are cwnd-limited
+ * 2) We are rwnd-limited
+ * 3) We are application limited.
+ */
+ if (cong_win < send_win) {
+ if (cong_win <= skb->len) {
+ *is_cwnd_limited = true;
+ return true;
+ }
+ } else {
+ if (send_win <= skb->len) {
+ *is_rwnd_limited = true;
+ return true;
+ }
+ }
- if (cong_win < send_win && cong_win <= skb->len)
- *is_cwnd_limited = true;
+ /* If this packet won't get more data, do not wait. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+ TCP_SKB_CB(skb)->eor)
+ goto send_now;
return true;
@@ -2212,8 +2233,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(unsigned long, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ limit = min_t(unsigned long, limit,
+ sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
@@ -2356,7 +2378,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
} else {
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
- max_segs))
+ &is_rwnd_limited, max_segs))
break;
}
@@ -2494,15 +2516,18 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
}
skb = skb_rb_last(&sk->tcp_rtx_queue);
+ if (unlikely(!skb)) {
+ WARN_ONCE(tp->packets_out,
+ "invalid inflight: %u state %u cwnd %u mss %d\n",
+ tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
+ inet_csk(sk)->icsk_pending = 0;
+ return;
+ }
/* At most one outstanding TLP retransmission. */
if (tp->tlp_high_seq)
goto rearm_timer;
- /* Retransmit last segment. */
- if (WARN_ON(!skb))
- goto rearm_timer;
-
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
@@ -2920,7 +2945,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
}
return err;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5f8b6d3cd855..f87dbc78b6bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
+ s32 remaining;
start_ts = tcp_retransmit_stamp(sk);
if (!icsk->icsk_user_timeout || !start_ts)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
- if (elapsed >= icsk->icsk_user_timeout)
+ remaining = icsk->icsk_user_timeout - elapsed;
+ if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
- else
- return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}
/**
@@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk,
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
timeout = jiffies_to_msecs(timeout);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
+ return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
/* A write timeout has occurred. Process the after effects. */
@@ -376,7 +378,7 @@ static void tcp_probe_timer(struct sock *sk)
return;
}
- if (icsk->icsk_probes_out > max_probes) {
+ if (icsk->icsk_probes_out >= max_probes) {
abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -482,11 +484,12 @@ void tcp_retransmit_timer(struct sock *sk)
goto out_reset_timer;
}
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
if (tcp_write_timeout(sk))
goto out;
if (icsk->icsk_retransmits == 0) {
- int mib_idx;
+ int mib_idx = 0;
if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
@@ -501,10 +504,9 @@ void tcp_retransmit_timer(struct sock *sk)
mib_idx = LINUX_MIB_TCPSACKFAILURES;
else
mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else {
- mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ if (mib_idx)
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ drop:
}
#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1976fddb9e00..3fb0ed5e4789 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -115,6 +116,7 @@
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -371,21 +373,19 @@ static int compute_score(struct sock *sk, struct net *net,
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
ipv6_only_sock(sk))
return -1;
- score = (sk->sk_family == PF_INET) ? 2 : 1;
- inet = inet_sk(sk);
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
- if (inet->inet_rcv_saddr) {
- if (inet->inet_rcv_saddr != daddr)
- return -1;
- score += 4;
- }
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
@@ -398,15 +398,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ score += 4;
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
@@ -465,65 +461,30 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif,
int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
- struct sock *sk, *result;
+ struct sock *result;
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ unsigned int hash2, slot2;
+ struct udp_hslot *hslot2;
bool exact_dif = udp_lib_exact_dif_match(net, skb);
- int score, badness;
- u32 hash = 0;
- if (hslot->count > 10) {
- hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ exact_dif, hslot2, skb);
+ if (!result) {
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
+ htonl(INADDR_ANY), hnum, dif, sdif,
exact_dif, hslot2, skb);
- if (!result) {
- unsigned int old_slot2 = slot2;
- hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
- slot2 = hash2 & udptable->mask;
- /* avoid searching the same slot again. */
- if (unlikely(slot2 == old_slot2))
- return result;
-
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
- }
- if (unlikely(IS_ERR(result)))
- return NULL;
- return result;
- }
-begin:
- result = NULL;
- badness = 0;
- sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
- if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (unlikely(IS_ERR(result)))
- return NULL;
- if (result)
- return result;
- }
- result = sk;
- badness = score;
- }
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -585,6 +546,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+ static_branch_inc(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+
+ if (!iptun_encaps[i])
+ continue;
+ handler = rcu_dereference(iptun_encaps[i]->err_handler);
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sk_buff *skb, u32 info)
+{
+ int network_offset, transport_offset;
+ struct sock *sk;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ struct udp_sock *up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -596,13 +640,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
@@ -612,8 +657,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex,
inet_sdif(skb), udptable, NULL);
if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ /* No socket for error: try tunnels before discarding */
+ sk = ERR_PTR(-ENOENT);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+ info);
+ if (!sk)
+ return 0;
+ }
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
err = 0;
@@ -656,6 +714,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ goto out;
+ }
if (!inet->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
@@ -665,12 +727,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- return;
+ return 0;
}
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, &udp_table);
}
/*
@@ -1713,6 +1775,10 @@ try_again:
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
+
+ if (udp_sk(sk)->gro_enabled)
+ udp_cmsg_recv(msg, sk, skb);
+
if (inet->cmsg_flags)
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
@@ -1889,13 +1955,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
- static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
/* returns:
* -1: error
* 0: success
@@ -1904,7 +1963,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -2007,6 +2066,27 @@ drop:
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ for (skb = segs; skb; skb = next) {
+ next = skb->next;
+ __skb_pull(skb, skb_transport_offset(skb));
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+ }
+ return 0;
+}
+
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -2398,11 +2478,15 @@ void udp_destroy_sock(struct sock *sk)
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (up->encap_enabled)
+ static_branch_dec(&udp_encap_needed_key);
}
}
@@ -2447,7 +2531,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
- udp_encap_enable();
+ lock_sock(sk);
+ udp_tunnel_encap_enable(sk->sk_socket);
+ release_sock(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2469,6 +2555,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
up->gso_size = val;
break;
+ case UDP_GRO:
+ lock_sock(sk);
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+ up->gro_enabled = valbool;
+ release_sock(sk);
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
#include <net/inet_common.h>
int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..64f9715173ac 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -13,6 +13,7 @@
#include <linux/skbuff.h>
#include <net/udp.h>
#include <net/protocol.h>
+#include <net/inet_common.h>
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
@@ -343,6 +344,56 @@ out:
return segs;
}
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Terminate the flow on len mismatch or if it grow "too much".
+ * Under small packet flood GRO count could elsewhere grow a lot
+ * leading to execessive truesize values
+ */
+ if (!skb_gro_receive(p, skb) &&
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+ else if (uh->len != uh2->len)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
+INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, udp_lookup_t lookup)
{
@@ -353,23 +404,28 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
int flush = 1;
struct sock *sk;
+ rcu_read_lock();
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (!sk)
+ goto out_unlock;
+
+ if (udp_sk(sk)->gro_enabled) {
+ pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+ rcu_read_unlock();
+ return pp;
+ }
+
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
- !NAPI_GRO_CB(skb)->csum_valid))
- goto out;
+ !NAPI_GRO_CB(skb)->csum_valid) ||
+ !udp_sk(sk)->gro_receive)
+ goto out_unlock;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
-
- if (sk && udp_sk(sk)->gro_receive)
- goto unflush;
- goto out_unlock;
-
-unflush:
flush = 0;
list_for_each_entry(p, head, list) {
@@ -394,14 +450,13 @@ unflush:
out_unlock:
rcu_read_unlock();
-out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff *udp4_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
@@ -427,6 +482,19 @@ flush:
return NULL;
}
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+ return 0;
+}
+
int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup)
{
@@ -437,16 +505,22 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- /* Set encapsulation before calling into inner gro_complete() functions
- * to make them set up the inner offsets.
- */
- skb->encapsulation = 1;
-
rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_complete)
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_enabled) {
+ err = udp_gro_complete_segment(skb);
+ } else if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ }
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -456,18 +530,14 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
}
EXPORT_SYMBOL(udp_gro_complete);
-static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..be8b5b2157d8 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
+ if (cfg->bind_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, cfg->bind_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE,
+ dev->name, strlen(dev->name) + 1);
+ dev_put(dev);
+
+ if (err < 0)
+ goto error;
+ }
+
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
@@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udplite_table);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(esp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ah4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ipcomp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct net_protocol esp4_protocol = {