Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 2
-rw-r--r--  net/ipv4/af_inet.c | 21
-rw-r--r--  net/ipv4/arp.c | 34
-rw-r--r--  net/ipv4/bpf_tcp_ca.c | 2
-rw-r--r--  net/ipv4/datagram.c | 21
-rw-r--r--  net/ipv4/devinet.c | 86
-rw-r--r--  net/ipv4/esp4.c | 58
-rw-r--r--  net/ipv4/fib_frontend.c | 80
-rw-r--r--  net/ipv4/fib_rules.c | 65
-rw-r--r--  net/ipv4/fib_semantics.c | 264
-rw-r--r--  net/ipv4/fib_trie.c | 26
-rw-r--r--  net/ipv4/gre_demux.c | 2
-rw-r--r--  net/ipv4/icmp.c | 98
-rw-r--r--  net/ipv4/igmp.c | 90
-rw-r--r--  net/ipv4/igmp_internal.h | 17
-rw-r--r--  net/ipv4/inet_connection_sock.c | 175
-rw-r--r--  net/ipv4/inet_diag.c | 12
-rw-r--r--  net/ipv4/inet_fragment.c | 37
-rw-r--r--  net/ipv4/inet_hashtables.c | 162
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 8
-rw-r--r--  net/ipv4/inetpeer.c | 57
-rw-r--r--  net/ipv4/ip_fragment.c | 67
-rw-r--r--  net/ipv4/ip_gre.c | 82
-rw-r--r--  net/ipv4/ip_input.c | 24
-rw-r--r--  net/ipv4/ip_output.c | 61
-rw-r--r--  net/ipv4/ip_sockglue.c | 14
-rw-r--r--  net/ipv4/ip_tunnel.c | 49
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 8
-rw-r--r--  net/ipv4/ip_vti.c | 22
-rw-r--r--  net/ipv4/ipcomp.c | 2
-rw-r--r--  net/ipv4/ipconfig.c | 6
-rw-r--r--  net/ipv4/ipip.c | 18
-rw-r--r--  net/ipv4/ipmr.c | 225
-rw-r--r--  net/ipv4/ipmr_base.c | 9
-rw-r--r--  net/ipv4/netfilter.c | 4
-rw-r--r--  net/ipv4/netfilter/Kconfig | 23
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nft_fib_ipv4.c | 20
-rw-r--r--  net/ipv4/nexthop.c | 221
-rw-r--r--  net/ipv4/ping.c | 30
-rw-r--r--  net/ipv4/proc.c | 4
-rw-r--r--  net/ipv4/raw.c | 12
-rw-r--r--  net/ipv4/route.c | 125
-rw-r--r--  net/ipv4/syncookies.c | 12
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 20
-rw-r--r--  net/ipv4/tcp.c | 283
-rw-r--r--  net/ipv4/tcp_bpf.c | 61
-rw-r--r--  net/ipv4/tcp_cubic.c | 8
-rw-r--r--  net/ipv4/tcp_dctcp.c | 2
-rw-r--r--  net/ipv4/tcp_dctcp.h | 2
-rw-r--r--  net/ipv4/tcp_diag.c | 21
-rw-r--r--  net/ipv4/tcp_fastopen.c | 18
-rw-r--r--  net/ipv4/tcp_input.c | 664
-rw-r--r--  net/ipv4/tcp_ipv4.c | 437
-rw-r--r--  net/ipv4/tcp_metrics.c | 8
-rw-r--r--  net/ipv4/tcp_minisocks.c | 93
-rw-r--r--  net/ipv4/tcp_offload.c | 26
-rw-r--r--  net/ipv4/tcp_output.c | 178
-rw-r--r--  net/ipv4/tcp_recovery.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 90
-rw-r--r--  net/ipv4/udp.c | 491
-rw-r--r--  net/ipv4/udp_impl.h | 1
-rw-r--r--  net/ipv4/udp_offload.c | 243
-rw-r--r--  net/ipv4/udp_tunnel_core.c | 36
-rw-r--r--  net/ipv4/udp_tunnel_nic.c | 78
-rw-r--r--  net/ipv4/udplite.c | 2
-rw-r--r--  net/ipv4/xfrm4_input.c | 21
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
69 files changed, 3141 insertions, 2009 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d2c97f8e9ef..12850a277251 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -425,7 +425,7 @@ config INET_DIAG
tristate "INET: socket monitoring interface"
default y
help
- Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ Support for INET (TCP, UDP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8095e82de808..76e38092cd8a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
WARN_ON_ONCE(sk->sk_wmem_queued);
- WARN_ON_ONCE(sk_forward_alloc_get(sk));
+ WARN_ON_ONCE(sk->sk_forward_alloc);
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
@@ -1309,8 +1309,6 @@ int inet_sk_rebuild_header(struct sock *sk)
{
struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr;
- struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
int err;
@@ -1319,17 +1317,9 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rcu_read_unlock();
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, ip_sock_rt_tos(sk),
- sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (!IS_ERR(rt)) {
err = 0;
sk_setup_caps(sk, &rt->dst);
@@ -1338,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk)
/* Routing failed... */
sk->sk_route_caps = 0;
- /*
- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
- */
+
if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
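
The hunks above drop the open-coded source-route handling from inet_sk_rebuild_header() in favour of the inet_sk_init_flowi4() helper plus ip_route_output_flow(). As a rough stand-alone illustration only, the sketch below restates what the removed lines did: route towards the SRR first hop (faddr) when a source-route option is attached to the socket, otherwise towards the socket's destination address. The struct and function names here are simplified stand-ins invented for the sketch, not the kernel types.

#include <stdint.h>
#include <stddef.h>

struct opts_sketch { int srr; uint32_t faddr; };   /* stand-in for ip_options_rcu */
struct inet_sketch {                               /* stand-in for inet_sock      */
        uint32_t daddr, saddr;
        uint16_t dport, sport;
        const struct opts_sketch *opt;
};
struct flowi4_sketch { uint32_t daddr, saddr; uint16_t dport, sport; };

/* Pick the routing destination the way the removed code did: prefer the
 * source-route first hop when an SRR option is present on the socket. */
static void init_flowi4_sketch(const struct inet_sketch *inet,
                               struct flowi4_sketch *fl4)
{
        uint32_t daddr = inet->daddr;

        if (inet->opt && inet->opt->srr)
                daddr = inet->opt->faddr;

        fl4->daddr = daddr;
        fl4->saddr = inet->saddr;
        fl4->dport = inet->dport;
        fl4->sport = inet->sport;
}
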
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cb9a7ed8abd3..5cfc1c939673 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb
*/
void arp_xmit(struct sk_buff *skb)
{
+ rcu_read_lock();
/* Send it off, maybe filter it using firewalling first. */
NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
- dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+ dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev,
arp_xmit_finish);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(arp_xmit);
@@ -862,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
(rt->dst.dev != dev &&
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+ pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -964,6 +966,7 @@ static int arp_is_multicast(const void *pkey)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
+ enum skb_drop_reason drop_reason;
const struct arphdr *arp;
/* do not tweak dropwatch on an ARP we will ignore */
@@ -977,12 +980,15 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
- if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev));
+ if (drop_reason != SKB_NOT_DROPPED_YET)
goto freeskb;
arp = arp_hdr(skb);
- if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) {
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
goto freeskb;
+ }
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -994,7 +1000,7 @@ consumeskb:
consume_skb(skb);
return NET_RX_SUCCESS;
freeskb:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out_of_mem:
return NET_RX_DROP;
}
@@ -1062,8 +1068,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ if (__in_dev_get_rtnl_net(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on);
return 0;
}
return -ENXIO;
@@ -1075,7 +1081,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+ dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
r->arp_ha.sa_data);
if (!dev)
return -ENODEV;
@@ -1083,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
if (mask) {
__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
- return -ENOBUFS;
- return 0;
+ return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false);
}
return arp_req_set_proxy(net, dev, 1);
@@ -1293,14 +1297,14 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
- rtnl_lock();
+ rtnl_net_lock(net);
err = arp_req_delete(net, &r);
- rtnl_unlock();
+ rtnl_net_unlock(net);
break;
case SIOCSARP:
- rtnl_lock();
+ rtnl_net_lock(net);
err = arp_req_set(net, &r);
- rtnl_unlock();
+ rtnl_net_unlock(net);
break;
case SIOCGARP:
rcu_read_lock();
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 554804774628..e01492234b0b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -121,7 +121,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
{
/* bpf_tcp_ca prog cannot have NULL tp */
- __tcp_send_ack((struct sock *)tp, rcv_nxt);
+ __tcp_send_ack((struct sock *)tp, rcv_nxt, 0);
return 0;
}
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index cc6d0bd7b0a9..c2b2cda1a7e5 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -61,15 +61,17 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
err = -EACCES;
goto out;
}
+
+ /* Update addresses before rehashing */
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
if (!inet->inet_saddr)
- inet->inet_saddr = fl4->saddr; /* Update source address */
+ inet->inet_saddr = fl4->saddr;
if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
- inet->inet_daddr = fl4->daddr;
- inet->inet_dport = usin->sin_port;
reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
@@ -100,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect);
void ip4_datagram_release_cb(struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct dst_entry *dst;
struct flowi4 fl4;
struct rtable *rt;
@@ -109,18 +109,13 @@ void ip4_datagram_release_cb(struct sock *sk)
rcu_read_lock();
dst = __sk_dst_get(sk);
- if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) {
rcu_read_unlock();
return;
}
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, &fl4);
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
dst = !IS_ERR(rt) ? &rt->dst : NULL;
sk_dst_set(sk, dst);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c8b3cf5fba4c..c47d3828d4f6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -46,6 +46,7 @@
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
#include <linux/slab.h>
#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
@@ -107,15 +108,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_PROTO] = { .type = NLA_U8 },
};
-struct inet_fill_args {
- u32 portid;
- u32 seq;
- int event;
- unsigned int flags;
- int netnsid;
- int ifindex;
-};
-
#define IN4_ADDR_HSIZE_SHIFT 8
#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
@@ -289,7 +281,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
if (!in_dev->arp_parms)
goto out_kfree;
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
/* Reference in_dev->dev */
netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
/* Account for reference dev->ip_ptr (below) */
@@ -1371,10 +1363,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
__be32 addr = 0;
unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
- struct net *net = dev_net(dev);
+ struct net *net;
int master_idx;
rcu_read_lock();
+ net = dev_net_rcu(dev);
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
@@ -1799,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
struct ifaddrmsg *ifm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
return -EINVAL;
@@ -1846,9 +1839,38 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
return 0;
}
-static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
- struct netlink_callback *cb, int *s_ip_idx,
- struct inet_fill_args *fillargs)
+static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct ip_mc_list *im;
+ int ip_idx = 0;
+ int err;
+
+ for (im = rcu_dereference(in_dev->mc_list);
+ im;
+ im = rcu_dereference(im->next_rcu)) {
+ if (ip_idx < *s_ip_idx) {
+ ip_idx++;
+ continue;
+ }
+ err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
+ }
+ err = 0;
+ ip_idx = 0;
+done:
+ *s_ip_idx = ip_idx;
+ return err;
+}
+
+static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
{
struct in_ifaddr *ifa;
int ip_idx = 0;
@@ -1874,6 +1896,21 @@ done:
return err;
}
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ switch (fillargs->event) {
+ case RTM_NEWADDR:
+ return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs);
+ case RTM_GETMULTICAST:
+ return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx,
+ fillargs);
+ default:
+ return -EINVAL;
+ }
+}
+
/* Combine dev_addr_genid and dev_base_seq to detect changes.
*/
static u32 inet_base_seq(const struct net *net)
@@ -1889,13 +1926,14 @@ static u32 inet_base_seq(const struct net *net)
return res;
}
-static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
+ int event)
{
const struct nlmsghdr *nlh = cb->nlh;
struct inet_fill_args fillargs = {
.portid = NETLINK_CB(cb->skb).portid,
.seq = nlh->nlmsg_seq,
- .event = RTM_NEWADDR,
+ .event = event,
.flags = NLM_F_MULTI,
.netnsid = -1,
};
@@ -1949,6 +1987,16 @@ done:
return err;
}
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return inet_dump_addr(skb, cb, RTM_NEWADDR);
+}
+
+static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return inet_dump_addr(skb, cb, RTM_GETMULTICAST);
+}
+
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
@@ -2845,6 +2893,8 @@ static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = {
{.protocol = PF_INET, .msgtype = RTM_GETNETCONF,
.doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf,
.flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST,
+ .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED},
};
void __init devinet_init(void)
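
The inet_dump_ifmcaddr() handler registered above lets userspace dump IPv4 multicast group memberships over rtnetlink with RTM_GETMULTICAST, mirroring what IPv6 already offers. The following is a minimal, hedged sketch of such a dump request from userspace; it assumes the usual rtnetlink dump conventions (NLM_F_REQUEST | NLM_F_DUMP with an ifaddrmsg header) and omits parsing of the IFA_MULTICAST / IFA_CACHEINFO attributes in the replies.

/* Hedged sketch: request an IPv4 multicast address dump over rtnetlink. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_addr.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct ifaddrmsg ifm;
        } req = {
                .nlh = {
                        .nlmsg_len   = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
                        .nlmsg_type  = RTM_GETMULTICAST,
                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
                        .nlmsg_seq   = 1,
                },
                .ifm = { .ifa_family = AF_INET },
        };
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0 || send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
                perror("rtnetlink");
                return 1;
        }
        /* recv() the NLMSG_DONE-terminated multipart reply here. */
        close(fd);
        return 0;
}
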
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f3281312eb5e..f14a41ee4aa1 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -120,47 +120,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
}
#ifdef CONFIG_INET_ESPINTCP
-struct esp_tcp_sk {
- struct sock *sk;
- struct rcu_head rcu;
-};
-
-static void esp_free_tcp_sk(struct rcu_head *head)
-{
- struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
-
- sock_put(esk->sk);
- kfree(esk);
-}
-
static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
struct net *net = xs_net(x);
- struct esp_tcp_sk *esk;
__be16 sport, dport;
- struct sock *nsk;
struct sock *sk;
- sk = rcu_dereference(x->encap_sk);
- if (sk && sk->sk_state == TCP_ESTABLISHED)
- return sk;
-
spin_lock_bh(&x->lock);
sport = encap->encap_sport;
dport = encap->encap_dport;
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (sk && sk == nsk) {
- esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
- if (!esk) {
- spin_unlock_bh(&x->lock);
- return ERR_PTR(-ENOMEM);
- }
- RCU_INIT_POINTER(x->encap_sk, NULL);
- esk->sk = sk;
- call_rcu(&esk->rcu, esp_free_tcp_sk);
- }
spin_unlock_bh(&x->lock);
sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
@@ -173,20 +142,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
return ERR_PTR(-EINVAL);
}
- spin_lock_bh(&x->lock);
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (encap->encap_sport != sport ||
- encap->encap_dport != dport) {
- sock_put(sk);
- sk = nsk ?: ERR_PTR(-EREMCHG);
- } else if (sk == nsk) {
- sock_put(sk);
- } else {
- rcu_assign_pointer(x->encap_sk, sk);
- }
- spin_unlock_bh(&x->lock);
-
return sk;
}
@@ -199,8 +154,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
sk = esp_find_tcp_sk(x);
err = PTR_ERR_OR_ZERO(sk);
- if (err)
+ if (err) {
+ kfree_skb(skb);
goto out;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -209,6 +166,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
err = espintcp_push_skb(sk, skb);
bh_unlock_sock(sk);
+ sock_put(sk);
+
out:
rcu_read_unlock();
return err;
@@ -279,7 +238,7 @@ static void esp_output_done(void *data, int err)
x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
esp_output_tail_tcp(x, skb);
else
- xfrm_output_resume(skb->sk, skb, err);
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
}
@@ -392,6 +351,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
if (IS_ERR(sk))
return ERR_CAST(sk);
+ sock_put(sk);
+
*lenp = htons(len);
esph = (struct ip_esp_hdr *)(lenp + 1);
@@ -816,7 +777,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
}
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 272e42d81323..6e1b94796f67 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
const struct in_ifaddr *ifa;
struct in_device *in_dev;
- in_dev = __in_dev_get_rtnl(dev);
+ in_dev = __in_dev_get_rtnl_net(dev);
if (!in_dev)
return -ENODEV;
*colon = ':';
- rcu_read_lock();
- in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
}
- rcu_read_unlock();
if (!ifa)
return -ENODEV;
@@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+ rtnl_net_lock(net);
err = rtentry_to_fib_config(net, cmd, rt, &cfg);
if (err == 0) {
struct fib_table *tb;
@@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
/* allocated by rtentry_to_fib_config() */
kfree(cfg.fc_mx);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
return -EINVAL;
@@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
}
}
+ if (cfg->fc_dst_len > 32) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
if (cfg->fc_nh_id) {
if (cfg->fc_oif || cfg->fc_gw_family ||
cfg->fc_encap || cfg->fc_mp) {
NL_SET_ERR_MSG(extack,
"Nexthop specification and nexthop id are mutually exclusive");
- return -EINVAL;
+ err = -EINVAL;
+ goto errout;
}
}
if (has_gw && has_via) {
NL_SET_ERR_MSG(extack,
"Nexthop configuration can not contain both GATEWAY and VIA");
- return -EINVAL;
+ err = -EINVAL;
+ goto errout;
}
if (!cfg->fc_table)
@@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ rtnl_net_lock(net);
+
if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
err = -EINVAL;
- goto errout;
+ goto unlock;
}
tb = fib_get_table(net, cfg.fc_table);
if (!tb) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
err = -ESRCH;
- goto errout;
+ goto unlock;
}
err = fib_table_delete(net, tb, &cfg, extack);
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
@@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ rtnl_net_lock(net);
+
tb = fib_new_table(net, cfg.fc_table);
if (!tb) {
err = -ENOBUFS;
- goto errout;
+ goto unlock;
}
err = fib_table_insert(net, tb, &cfg, extack);
if (!err && cfg.fc_type == RTN_LOCAL)
net->ipv4.fib_has_custom_local_routes = true;
+
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
@@ -927,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
if (filter->rtnl_held)
ASSERT_RTNL();
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
return -EINVAL;
}
- rtm = nlmsg_data(nlh);
if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
rtm->rtm_scope) {
NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
@@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
- rt_cache_flush(dev_net(dev));
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa, NULL);
@@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
*/
fib_disable_ip(dev, event, true);
} else {
- rt_cache_flush(dev_net(dev));
+ rt_cache_flush(net);
}
break;
}
@@ -1503,7 +1524,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_disable_ip(dev, event, false);
break;
case NETDEV_CHANGE:
- flags = dev_get_flags(dev);
+ flags = netif_get_flags(dev);
if (flags & (IFF_RUNNING | IFF_LOWER_UP))
fib_sync_up(dev, RTNH_F_LINKDOWN);
else
@@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net)
error = ip_fib_net_init(net);
if (error < 0)
goto out;
+
+ error = fib4_semantics_init(net);
+ if (error)
+ goto out_semantics;
+
error = nl_fib_lookup_init(net);
if (error < 0)
goto out_nlfl;
+
error = fib_proc_init(net);
if (error < 0)
goto out_proc;
@@ -1627,9 +1654,11 @@ out:
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
- rtnl_lock();
+ fib4_semantics_exit(net);
+out_semantics:
+ rtnl_net_lock(net);
ip_fib_net_exit(net);
- rtnl_unlock();
+ rtnl_net_unlock(net);
goto out;
}
@@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list)
struct net *net;
rtnl_lock();
- list_for_each_entry(net, net_list, exit_list)
+ list_for_each_entry(net, net_list, exit_list) {
+ __rtnl_net_lock(net);
ip_fib_net_exit(net);
-
+ __rtnl_net_unlock(net);
+ }
rtnl_unlock();
+
+ list_for_each_entry(net, net_list, exit_list)
+ fib4_semantics_exit(net);
}
static struct pernet_operations fib_net_ops = {
@@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = {
static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
{.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
- .doit = inet_rtm_newroute},
+ .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET, .msgtype = RTM_DELROUTE,
- .doit = inet_rtm_delroute},
+ .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
.flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};
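
The validation added to rtm_to_fib_config() above (and removed from fib_trie.c further down) rejects prefix lengths above 32 and prefixes with host bits set beyond the prefix length: shifting the host-order destination left by the prefix length leaves exactly the bits outside the prefix, so any non-zero result means the route is malformed. A small stand-alone sketch of the same test follows; the helper and variable names are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Returns 1 when dst/dst_len is a well-formed IPv4 prefix. */
static int prefix_is_valid(uint32_t dst_host_order, uint8_t dst_len)
{
        if (dst_len > 32)
                return 0;
        /* Bits left after the shift lie outside the prefix: must be zero. */
        if (dst_len < 32 && (dst_host_order << dst_len))
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", prefix_is_valid(0x0a000000, 24));  /* 10.0.0.0/24 -> 1 */
        printf("%d\n", prefix_is_valid(0x0a000001, 24));  /* 10.0.0.1/24 -> 0 */
        return 0;
}
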
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8325224ef072..fa58d6620ed6 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -37,6 +37,7 @@ struct fib4_rule {
u8 dst_len;
u8 src_len;
dscp_t dscp;
+ dscp_t dscp_mask;
u8 dscp_full:1; /* DSCP or TOS selector */
__be32 src;
__be32 srcmask;
@@ -192,7 +193,8 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
* to mask the upper three DSCP bits prior to matching to maintain
* legacy behavior.
*/
- if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
+ if (r->dscp_full &&
+ (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask)
return 0;
else if (!r->dscp_full && r->dscp &&
!fib_dscp_masked_match(r->dscp, fl4))
@@ -201,12 +203,12 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
return 0;
- if (fib_rule_port_range_set(&rule->sport_range) &&
- !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport))
+ if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
+ fl4->fl4_sport))
return 0;
- if (fib_rule_port_range_set(&rule->dport_range) &&
- !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport))
+ if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
+ fl4->fl4_dport))
return 0;
return 1;
@@ -235,19 +237,49 @@ static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4,
}
rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
rule4->dscp_full = true;
return 0;
}
+static int fib4_nl2rule_dscp_mask(const struct nlattr *nla,
+ struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ dscp_t dscp_mask;
+
+ if (!rule4->dscp_full) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Cannot specify DSCP mask without DSCP value");
+ return -EINVAL;
+ }
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ if (rule4->dscp & ~dscp_mask) {
+ NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
+ return -EINVAL;
+ }
+
+ rule4->dscp_mask = dscp_mask;
+
+ return 0;
+}
+
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
+ struct fib4_rule *rule4 = (struct fib4_rule *)rule;
+ struct net *net = rule->fr_net;
int err = -EINVAL;
- struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) {
+ NL_SET_ERR_MSG(extack,
+ "Flow label cannot be specified for IPv4 FIB rules");
+ goto errout;
+ }
if (!inet_validate_dscp(frh->tos)) {
NL_SET_ERR_MSG(extack,
@@ -265,6 +297,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0)
goto errout;
+ if (tb[FRA_DSCP_MASK] &&
+ fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0)
+ goto errout;
+
/* split local/main if they are not already split */
err = fib_unmerge(net);
if (err)
@@ -360,6 +396,14 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
return 0;
}
+ if (tb[FRA_DSCP_MASK]) {
+ dscp_t dscp_mask;
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
+ if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask)
+ return 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
@@ -385,7 +429,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
if (rule4->dscp_full) {
frh->tos = 0;
if (nla_put_u8(skb, FRA_DSCP,
- inet_dscp_to_dsfield(rule4->dscp) >> 2))
+ inet_dscp_to_dsfield(rule4->dscp) >> 2) ||
+ nla_put_u8(skb, FRA_DSCP_MASK,
+ inet_dscp_to_dsfield(rule4->dscp_mask) >> 2))
goto nla_put_failure;
} else {
frh->tos = inet_dscp_to_dsfield(rule4->dscp);
@@ -412,7 +458,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
return nla_total_size(4) /* dst */
+ nla_total_size(4) /* src */
+ nla_total_size(4) /* flow */
- + nla_total_size(1); /* dscp */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(1); /* dscp mask */
}
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
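
The FRA_DSCP_MASK support above makes fib4_rule_match() compare only the DSCP bits selected by the mask: XORing the rule's DSCP with the flow's DSCP and ANDing with the mask is zero exactly when the two values agree on every masked bit. A tiny stand-alone sketch of that masked comparison; the helper name and values are illustrative only.

#include <assert.h>
#include <stdint.h>

/* Non-zero when rule_dscp and flow_dscp agree on all bits set in mask. */
static int dscp_masked_match(uint8_t rule_dscp, uint8_t mask, uint8_t flow_dscp)
{
        return ((rule_dscp ^ flow_dscp) & mask) == 0;
}

int main(void)
{
        /* Match on the three high DSCP bits only (mask 0x38). */
        assert(dscp_masked_match(0x28, 0x38, 0x2e));   /* same class, different drop precedence */
        assert(!dscp_masked_match(0x28, 0x38, 0x18));  /* class differs                         */
        assert(dscp_masked_match(0x28, 0x3f, 0x28));   /* exact match with a full mask          */
        return 0;
}
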
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d2cee5c314f5..a5f3c8459758 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -50,12 +50,6 @@
#include "fib_lookup.h"
-static struct hlist_head *fib_info_hash;
-static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_info_hash_size;
-static unsigned int fib_info_hash_bits;
-static unsigned int fib_info_cnt;
-
/* for_nexthops and change_nexthops only used when nexthop object
* is not set in a fib_info. The logic within can reference fib_nh.
*/
@@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi)
ASSERT_RTNL();
if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
-
- fib_info_cnt--;
+ fi->fib_net->ipv4.fib_info_cnt--;
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
@@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
static unsigned int fib_info_hashfn_result(const struct net *net,
unsigned int val)
{
- return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits);
+ return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
}
-static inline unsigned int fib_info_hashfn(struct fib_info *fi)
+static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
+ struct net *net = fi->fib_net;
unsigned int val;
val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
@@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi)
} endfor_nexthops(fi)
}
- return fib_info_hashfn_result(fi->fib_net, val);
+ return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
+}
+
+static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
+ __be32 val)
+{
+ unsigned int hash_bits = net->ipv4.fib_info_hash_bits;
+ u32 slot;
+
+ slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits);
+
+ return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot];
+}
+
+static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
+{
+ /* The second half is used for prefsrc */
+ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head),
+ GFP_KERNEL);
+}
+
+static void fib_info_hash_free(struct hlist_head *head)
+{
+ kvfree(head);
+}
+
+static void fib_info_hash_grow(struct net *net)
+{
+ unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
+ struct hlist_head *new_info_hash, *old_info_hash;
+ unsigned int i;
+
+ if (net->ipv4.fib_info_cnt < old_size)
+ return;
+
+ new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
+ if (!new_info_hash)
+ return;
+
+ old_info_hash = net->ipv4.fib_info_hash;
+ net->ipv4.fib_info_hash = new_info_hash;
+ net->ipv4.fib_info_hash_bits += 1;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &old_info_hash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, head, fib_hash)
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+ }
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &old_info_hash[old_size + i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
+ hlist_add_head(&fi->fib_lhash,
+ fib_info_laddrhash_bucket(fi->fib_net,
+ fi->fib_prefsrc));
+ }
+
+ fib_info_hash_free(old_info_hash);
}
/* no metrics, only nexthop id */
@@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net,
(__force u32)cfg->fc_prefsrc,
cfg->fc_priority);
hash = fib_info_hashfn_result(net, hash);
- head = &fib_info_hash[hash];
+ head = &net->ipv4.fib_info_hash[hash];
hlist_for_each_entry(fi, head, fib_hash) {
- if (!net_eq(fi->fib_net, net))
- continue;
if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
continue;
+
if (cfg->fc_protocol == fi->fib_protocol &&
cfg->fc_scope == fi->fib_scope &&
cfg->fc_prefsrc == fi->fib_prefsrc &&
@@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net,
static struct fib_info *fib_find_info(struct fib_info *nfi)
{
- struct hlist_head *head;
+ struct hlist_head *head = fib_info_hash_bucket(nfi);
struct fib_info *fi;
- unsigned int hash;
-
- hash = fib_info_hashfn(nfi);
- head = &fib_info_hash[hash];
hlist_for_each_entry(fi, head, fib_hash) {
- if (!net_eq(fi->fib_net, nfi->fib_net))
- continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
+
if (nfi->fib_protocol == fi->fib_protocol &&
nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
@@ -574,11 +625,6 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
if (encap) {
struct lwtunnel_state *lwtstate;
- if (encap_type == LWTUNNEL_ENCAP_NONE) {
- NL_SET_ERR_MSG(extack, "LWT encap type not specified");
- err = -EINVAL;
- goto lwt_failure;
- }
err = lwtunnel_build_state(net, encap_type, encap,
nhc->nhc_family, cfg, &lwtstate,
extack);
@@ -1239,64 +1285,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
return err;
}
-static struct hlist_head *
-fib_info_laddrhash_bucket(const struct net *net, __be32 val)
-{
- u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val,
- fib_info_hash_bits);
-
- return &fib_info_laddrhash[slot];
-}
-
-static void fib_info_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
-{
- struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_info_hash_size;
- unsigned int i;
-
- ASSERT_RTNL();
- old_info_hash = fib_info_hash;
- old_laddrhash = fib_info_laddrhash;
- fib_info_hash_size = new_size;
- fib_info_hash_bits = ilog2(new_size);
-
- for (i = 0; i < old_size; i++) {
- struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *n;
- struct fib_info *fi;
-
- hlist_for_each_entry_safe(fi, n, head, fib_hash) {
- struct hlist_head *dest;
- unsigned int new_hash;
-
- new_hash = fib_info_hashfn(fi);
- dest = &new_info_hash[new_hash];
- hlist_add_head(&fi->fib_hash, dest);
- }
- }
- fib_info_hash = new_info_hash;
-
- fib_info_laddrhash = new_laddrhash;
- for (i = 0; i < old_size; i++) {
- struct hlist_head *lhead = &old_laddrhash[i];
- struct hlist_node *n;
- struct fib_info *fi;
-
- hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
- struct hlist_head *ldest;
-
- ldest = fib_info_laddrhash_bucket(fi->fib_net,
- fi->fib_prefsrc);
- hlist_add_head(&fi->fib_lhash, ldest);
- }
- }
-
- kvfree(old_info_hash);
- kvfree(old_laddrhash);
-}
-
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
unsigned char scope)
{
@@ -1409,32 +1397,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
}
#endif
- err = -ENOBUFS;
-
- if (fib_info_cnt >= fib_info_hash_size) {
- unsigned int new_size = fib_info_hash_size << 1;
- struct hlist_head *new_info_hash;
- struct hlist_head *new_laddrhash;
- size_t bytes;
-
- if (!new_size)
- new_size = 16;
- bytes = (size_t)new_size * sizeof(struct hlist_head *);
- new_info_hash = kvzalloc(bytes, GFP_KERNEL);
- new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
- if (!new_info_hash || !new_laddrhash) {
- kvfree(new_info_hash);
- kvfree(new_laddrhash);
- } else {
- fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- }
- if (!fib_info_hash_size)
- goto failure;
- }
+ fib_info_hash_grow(net);
fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
- if (!fi)
+ if (!fi) {
+ err = -ENOBUFS;
goto failure;
+ }
+
fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
if (IS_ERR(fi->fib_metrics)) {
err = PTR_ERR(fi->fib_metrics);
@@ -1571,9 +1541,9 @@ link_it:
refcount_set(&fi->fib_treeref, 1);
refcount_set(&fi->fib_clntref, 1);
- fib_info_cnt++;
- hlist_add_head(&fi->fib_hash,
- &fib_info_hash[fib_info_hashfn(fi)]);
+ net->ipv4.fib_info_cnt++;
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+
if (fi->fib_prefsrc) {
struct hlist_head *head;
@@ -1665,8 +1635,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
goto nla_put_failure;
- if (nhc->nhc_lwtstate &&
- lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+ if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
@@ -1855,7 +1824,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
struct fib_info *fi;
int ret = 0;
- if (!fib_info_laddrhash || local == 0)
+ if (!local)
return 0;
head = fib_info_laddrhash_bucket(net, local);
@@ -2118,7 +2087,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
return 0;
if (nh_flags & RTNH_F_DEAD) {
- unsigned int flags = dev_get_flags(dev);
+ unsigned int flags = netif_get_flags(dev);
if (flags & (IFF_RUNNING | IFF_LOWER_UP))
nh_flags |= RTNH_F_LINKDOWN;
@@ -2193,34 +2162,52 @@ static bool fib_good_nh(const struct fib_nh *nh)
return !!(state & NUD_VALID);
}
-void fib_select_multipath(struct fib_result *res, int hash)
+void fib_select_multipath(struct fib_result *res, int hash,
+ const struct flowi4 *fl4)
{
struct fib_info *fi = res->fi;
struct net *net = fi->fib_net;
- bool first = false;
+ bool found = false;
+ bool use_neigh;
+ __be32 saddr;
if (unlikely(res->fi->nh)) {
nexthop_path_fib_result(res, hash);
return;
}
+ use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
+ saddr = fl4 ? fl4->saddr : 0;
+
change_nexthops(fi) {
- if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
- if (!fib_good_nh(nexthop_nh))
- continue;
- if (!first) {
- res->nh_sel = nhsel;
- res->nhc = &nexthop_nh->nh_common;
- first = true;
- }
+ int nh_upper_bound;
+
+ /* Nexthops without a carrier are assigned an upper bound of
+ * minus one when "ignore_routes_with_linkdown" is set.
+ */
+ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound);
+ if (nh_upper_bound == -1 ||
+ (use_neigh && !fib_good_nh(nexthop_nh)))
+ continue;
+
+ if (!found) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ found = !saddr || nexthop_nh->nh_saddr == saddr;
}
- if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound))
+ if (hash > nh_upper_bound)
continue;
- res->nh_sel = nhsel;
- res->nhc = &nexthop_nh->nh_common;
- return;
+ if (!saddr || nexthop_nh->nh_saddr == saddr) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ return;
+ }
+
+ if (found)
+ return;
+
} endfor_nexthops(fi);
}
#endif
@@ -2235,7 +2222,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, fl4);
}
else
#endif
@@ -2257,3 +2244,22 @@ check_saddr:
fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
}
}
+
+int __net_init fib4_semantics_init(struct net *net)
+{
+ unsigned int hash_bits = 4;
+
+ net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits);
+ if (!net->ipv4.fib_info_hash)
+ return -ENOMEM;
+
+ net->ipv4.fib_info_hash_bits = hash_bits;
+ net->ipv4.fib_info_cnt = 0;
+
+ return 0;
+}
+
+void __net_exit fib4_semantics_exit(struct net *net)
+{
+ fib_info_hash_free(net->ipv4.fib_info_hash);
+}
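
The reworked fib_select_multipath() above is now source-address aware: nexthops with an upper bound of -1 (link down) are skipped, the most recently scanned usable nexthop is kept as a fallback until one matching the flow's source address is seen, and the hash-selected nexthop is only taken outright if it satisfies the source-address constraint. The stand-alone sketch below restates that selection loop with simplified types; it is an illustration of the hunk above (the neighbour-validity check is omitted), not kernel code.

#include <stdint.h>

struct nh_sketch { int upper_bound; uint32_t saddr; };

/* Returns the index of the selected nexthop, or -1 if none is usable. */
static int select_multipath_sketch(const struct nh_sketch *nh, int n,
                                   int hash, uint32_t saddr)
{
        int sel = -1;   /* current fallback candidate                 */
        int found = 0;  /* does the fallback satisfy the saddr test?  */

        for (int i = 0; i < n; i++) {
                if (nh[i].upper_bound == -1)            /* link down: skip    */
                        continue;
                if (!found) {
                        sel = i;                        /* remember fallback  */
                        found = !saddr || nh[i].saddr == saddr;
                }
                if (hash > nh[i].upper_bound)           /* hash picks a later nexthop */
                        continue;
                if (!saddr || nh[i].saddr == saddr)     /* preferred choice   */
                        return i;
                if (found)                              /* fallback already matches saddr */
                        return sel;
        }
        return sel;
}
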
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 161f5526b86c..59a6f0a9638f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
return 0;
}
-static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
-{
- if (plen > KEYLENGTH) {
- NL_SET_ERR_MSG(extack, "Invalid prefix length");
- return false;
- }
-
- if ((plen < KEYLENGTH) && (key << plen)) {
- NL_SET_ERR_MSG(extack,
- "Invalid prefix for given prefix length");
- return false;
- }
-
- return true;
-}
-
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *old);
@@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
key = ntohl(cfg->fc_dst);
- if (!fib_valid_key_len(key, plen, extack))
- return -EINVAL;
-
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
fi = fib_create_info(cfg, extack);
@@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
key = ntohl(cfg->fc_dst);
- if (!fib_valid_key_len(key, plen, extack))
- return -EINVAL;
-
l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
@@ -2999,7 +2977,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
nhc->nhc_dev ? nhc->nhc_dev->name : "*",
prefix, gw, flags, 0, 0,
fi->fib_priority,
@@ -3011,7 +2989,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
} else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
}
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 6701a98d9a9f..dafd68f3436a 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -199,7 +199,7 @@ static const struct net_protocol net_gre_protocol = {
static int __init gre_init(void)
{
- pr_info("GRE over IPv4 demultiplexor driver\n");
+ pr_info("GRE over IPv4 demultiplexer driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4f088fa1c2f2..2ffe73ea644f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -311,22 +311,23 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
{
struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
+ struct net_device *dev;
bool rc = true;
- int vif;
if (!apply_ratelimit)
return true;
/* No rate limit on loopback */
- if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ dev = dst_dev(dst);
+ if (dev && (dev->flags & IFF_LOOPBACK))
goto out;
- vif = l3mdev_master_ifindex(dst->dev);
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+ rcu_read_lock();
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
+ l3mdev_master_ifindex_rcu(dev));
rc = inet_peer_xrlim_allow(peer,
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
- if (peer)
- inet_putpeer(peer);
+ rcu_read_unlock();
out:
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
@@ -400,13 +401,12 @@ static void icmp_push_reply(struct sock *sk,
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
- struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
+ struct net *net = dev_net_rcu(rt->dst.dev);
bool apply_ratelimit = false;
+ struct ipcm_cookie ipc;
struct flowi4 fl4;
struct sock *sk;
- struct inet_sock *inet;
__be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
int type = icmp_param->data.icmph.type;
@@ -425,12 +425,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
sk = icmp_xmit_lock(net);
if (!sk)
goto out_bh_enable;
- inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
ipcm_init(&ipc);
- inet->tos = ip_hdr(skb)->tos;
+ ipc.tos = ip_hdr(skb)->tos;
ipc.sockc.mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
@@ -469,13 +468,13 @@ out_bh_enable:
*/
static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
{
- struct net_device *route_lookup_dev = NULL;
+ struct net_device *dev = skb->dev;
+ const struct dst_entry *dst;
- if (skb->dev)
- route_lookup_dev = skb->dev;
- else if (skb_dst(skb))
- route_lookup_dev = skb_dst(skb)->dev;
- return route_lookup_dev;
+ if (dev)
+ return dev;
+ dst = skb_dst(skb);
+ return dst ? dst_dev(dst) : NULL;
}
static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
@@ -517,6 +516,9 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
if (!IS_ERR(dst)) {
if (rt != rt2)
return rt;
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
+ fl4->daddr) == RTN_LOCAL)
+ return rt;
} else if (PTR_ERR(dst) == -EPERM) {
rt = NULL;
} else {
@@ -606,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct sock *sk;
if (!rt)
- goto out;
+ return;
+
+ rcu_read_lock();
if (rt->dst.dev)
- net = dev_net(rt->dst.dev);
+ net = dev_net_rcu(rt->dst.dev);
else if (skb_in->dev)
- net = dev_net(skb_in->dev);
+ net = dev_net_rcu(skb_in->dev);
else
goto out;
@@ -733,8 +737,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
icmp_param.data.icmph.checksum = 0;
icmp_param.skb = skb_in;
icmp_param.offset = skb_network_offset(skb_in);
- inet_sk(sk)->tos = tos;
ipcm_init(&ipc);
+ ipc.tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts.opt;
ipc.sockc.mark = mark;
@@ -783,7 +787,8 @@ out_unlock:
icmp_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
-out:;
+out:
+ rcu_read_unlock();
}
EXPORT_SYMBOL(__icmp_send);
@@ -832,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return;
}
@@ -866,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
struct net *net;
u32 info = 0;
- net = dev_net(skb_dst(skb)->dev);
+ net = skb_dst_dev_net_rcu(skb);
/*
* Incomplete header ?
@@ -977,7 +982,7 @@ out_err:
static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}
@@ -1009,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
struct icmp_bxm icmp_param;
struct net *net;
- net = dev_net(skb_dst(skb)->dev);
+ net = skb_dst_dev_net_rcu(skb);
/* should there be an ICMP stat for ignored echos? */
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
@@ -1038,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
{
+ struct net *net = dev_net_rcu(skb->dev);
struct icmp_ext_hdr *ext_hdr, _ext_hdr;
struct icmp_ext_echo_iio *iio, _iio;
- struct net *net = dev_net(skb->dev);
struct inet6_dev *in6_dev;
struct in_device *in_dev;
struct net_device *dev;
@@ -1179,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
return SKB_NOT_DROPPED_YET;
out_err:
- __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}
@@ -1196,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
+ struct net *net = dev_net_rcu(rt->dst.dev);
struct icmphdr *icmph;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
@@ -1245,22 +1250,6 @@ int icmp_rcv(struct sk_buff *skb)
goto reason_check;
}
- if (icmph->type == ICMP_EXT_ECHOREPLY) {
- reason = ping_rcv(skb);
- goto reason_check;
- }
-
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES) {
- reason = SKB_DROP_REASON_UNHANDLED_PROTO;
- goto error;
- }
-
/*
* Parse the ICMP message
*/
@@ -1287,6 +1276,23 @@ int icmp_rcv(struct sk_buff *skb)
}
}
+ if (icmph->type == ICMP_EXT_ECHOREPLY ||
+ icmph->type == ICMP_ECHOREPLY) {
+ reason = ping_rcv(skb);
+ return reason ? NET_RX_DROP : NET_RX_SUCCESS;
+ }
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ reason = SKB_DROP_REASON_UNHANDLED_PROTO;
+ goto error;
+ }
+
reason = icmp_pointers[icmph->type].handler(skb);
reason_check:
if (!reason) {
@@ -1369,9 +1375,9 @@ int icmp_err(struct sk_buff *skb, u32 info)
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ struct net *net = dev_net_rcu(skb->dev);
int type = icmp_hdr(skb)->type;
int code = icmp_hdr(skb)->code;
- struct net *net = dev_net(skb->dev);
/*
* Use ping_err to handle all icmp errors except those
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6a238398acc9..7182f1419c2a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -81,6 +81,7 @@
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
@@ -88,6 +89,8 @@
#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -202,7 +205,7 @@ static void ip_sf_list_clear_all(struct ip_sf_list *psf)
static void igmp_stop_timer(struct ip_mc_list *im)
{
spin_lock_bh(&im->lock);
- if (del_timer(&im->timer))
+ if (timer_delete(&im->timer))
refcount_dec(&im->refcnt);
im->tm_running = 0;
im->reporter = 0;
@@ -248,7 +251,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
spin_lock_bh(&im->lock);
im->unsolicit_count = 0;
- if (del_timer(&im->timer)) {
+ if (timer_delete(&im->timer)) {
if ((long)(im->timer.expires-jiffies) < max_delay) {
add_timer(&im->timer);
im->tm_running = 1;
@@ -424,7 +427,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -798,7 +801,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
static void igmp_gq_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer);
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
@@ -807,7 +810,7 @@ static void igmp_gq_timer_expire(struct timer_list *t)
static void igmp_ifc_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer);
u32 mr_ifc_count;
igmpv3_send_cr(in_dev);
@@ -837,7 +840,7 @@ static void igmp_ifc_event(struct in_device *in_dev)
static void igmp_timer_expire(struct timer_list *t)
{
- struct ip_mc_list *im = from_timer(im, t, timer);
+ struct ip_mc_list *im = timer_container_of(im, t, timer);
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
@@ -971,7 +974,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
}
/* cancel the interface change timer */
WRITE_ONCE(in_dev->mr_ifc_count, 0);
- if (del_timer(&in_dev->mr_ifc_timer))
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
@@ -1430,6 +1433,70 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
*mc_hash = im->next_hash;
}
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args)
+{
+ struct ifa_cacheinfo ci;
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_UNIVERSE;
+ ifm->ifa_index = dev->ifindex;
+
+ ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ;
+ ci.tstamp = ci.cstamp;
+ ci.ifa_prefered = INFINITY_LIFE_TIME;
+ ci.ifa_valid = INFINITY_LIFE_TIME;
+
+ if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 ||
+ nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static void inet_ifmcaddr_notify(struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct inet_fill_args fillargs = {
+ .event = event,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(__be32)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet_fill_ifmcaddr(skb, dev, im, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
+}
/*
* A socket has joined a multicast group on device dev.
@@ -1473,6 +1540,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
im->interface = in_dev;
in_dev_hold(in_dev);
im->multiaddr = addr;
+ im->mca_cstamp = jiffies;
+ im->mca_tstamp = im->mca_cstamp;
/* initial mode is (EX, empty) */
im->sfmode = mode;
im->sfcount[mode] = 1;
@@ -1492,6 +1561,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+ inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
@@ -1705,6 +1775,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
*ip = i->next_rcu;
in_dev->mc_count--;
__igmp_group_dropped(i, gfp);
+ inet_ifmcaddr_notify(in_dev->dev, i,
+ RTM_DELMULTICAST);
ip_mc_clear_src(i);
if (!in_dev->dead)
@@ -1758,10 +1830,10 @@ void ip_mc_down(struct in_device *in_dev)
#ifdef CONFIG_IP_MULTICAST
WRITE_ONCE(in_dev->mr_ifc_count, 0);
- if (del_timer(&in_dev->mr_ifc_timer))
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
in_dev->mr_gq_running = 0;
- if (del_timer(&in_dev->mr_gq_timer))
+ if (timer_delete(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
#endif
diff --git a/net/ipv4/igmp_internal.h b/net/ipv4/igmp_internal.h
new file mode 100644
index 000000000000..0a1bcc8ec8e1
--- /dev/null
+++ b/net/ipv4/igmp_internal.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IGMP_INTERNAL_H
+#define _LINUX_IGMP_INTERNAL_H
+
+struct inet_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
+};
+
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args);
+#endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6872b5aff73e..1e2df51427fe 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6) {
- int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-
- if (addr_type == IPV6_ADDR_ANY)
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
return false;
- if (addr_type != IPV6_ADDR_MAPPED)
+ if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
return true;
}
#endif
@@ -170,7 +168,7 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
}
static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
- kuid_t sk_uid, bool relax,
+ kuid_t uid, bool relax,
bool reuseport_cb_ok, bool reuseport_ok)
{
int bound_dev_if2;
@@ -187,12 +185,12 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
if (!relax || (!reuseport_ok && sk->sk_reuseport &&
sk2->sk_reuseport && reuseport_cb_ok &&
(sk2->sk_state == TCP_TIME_WAIT ||
- uid_eq(sk_uid, sock_i_uid(sk2)))))
+ uid_eq(uid, sk_uid(sk2)))))
return true;
} else if (!reuseport_ok || !sk->sk_reuseport ||
!sk2->sk_reuseport || !reuseport_cb_ok ||
(sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(sk_uid, sock_i_uid(sk2)))) {
+ !uid_eq(uid, sk_uid(sk2)))) {
return true;
}
}
@@ -200,7 +198,7 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
}
static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
- kuid_t sk_uid, bool relax,
+ kuid_t uid, bool relax,
bool reuseport_cb_ok, bool reuseport_ok)
{
if (ipv6_only_sock(sk2)) {
@@ -213,20 +211,20 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
#endif
}
- return inet_bind_conflict(sk, sk2, sk_uid, relax,
+ return inet_bind_conflict(sk, sk2, uid, relax,
reuseport_cb_ok, reuseport_ok);
}
static bool inet_bhash2_conflict(const struct sock *sk,
const struct inet_bind2_bucket *tb2,
- kuid_t sk_uid,
+ kuid_t uid,
bool relax, bool reuseport_cb_ok,
bool reuseport_ok)
{
struct sock *sk2;
sk_for_each_bound(sk2, &tb2->owners) {
- if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
+ if (__inet_bhash2_conflict(sk, sk2, uid, relax,
reuseport_cb_ok, reuseport_ok))
return true;
}
@@ -244,8 +242,8 @@ static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind2_bucket *tb2, /* may be null */
bool relax, bool reuseport_ok)
{
- kuid_t uid = sock_i_uid((struct sock *)sk);
struct sock_reuseport *reuseport_cb;
+ kuid_t uid = sk_uid(sk);
bool reuseport_cb_ok;
struct sock *sk2;
@@ -289,11 +287,11 @@ static int inet_csk_bind_conflict(const struct sock *sk,
static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
bool relax, bool reuseport_ok)
{
- kuid_t uid = sock_i_uid((struct sock *)sk);
const struct net *net = sock_net(sk);
struct sock_reuseport *reuseport_cb;
struct inet_bind_hashbucket *head2;
struct inet_bind2_bucket *tb2;
+ kuid_t uid = sk_uid(sk);
bool conflict = false;
bool reuseport_cb_ok;
@@ -332,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
struct inet_bind2_bucket **tb2_ret,
struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
int i, low, high, attempt_half, port, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
@@ -427,15 +425,13 @@ success:
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
struct sock *sk)
{
- kuid_t uid = sock_i_uid(sk);
-
if (tb->fastreuseport <= 0)
return 0;
if (!sk->sk_reuseport)
return 0;
if (rcu_access_pointer(sk->sk_reuseport_cb))
return 0;
- if (!uid_eq(tb->fastuid, uid))
+ if (!uid_eq(tb->fastuid, sk_uid(sk)))
return 0;
/* We only need to check the rcv_saddr if this tb was once marked
* without fastreuseport and then was reset, as we can only know that
@@ -460,14 +456,13 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct sock *sk)
{
- kuid_t uid = sock_i_uid(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
if (hlist_empty(&tb->bhash2)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
- tb->fastuid = uid;
+ tb->fastuid = sk_uid(sk);
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
@@ -494,7 +489,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
*/
if (!sk_reuseport_match(tb, sk)) {
tb->fastreuseport = FASTREUSEPORT_STRICT;
- tb->fastuid = uid;
+ tb->fastuid = sk_uid(sk);
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
@@ -514,10 +509,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
bool found_port = false, check_bind_conflict = true;
bool bhash_created = false, bhash2_created = false;
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
int ret = -EADDRINUSE, port = snum, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2 = NULL;
@@ -600,7 +595,7 @@ fail_unlock:
if (bhash2_created)
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
if (bhash_created)
- inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(tb);
}
if (head2_lock_acquired)
spin_unlock(&head2->lock);
@@ -769,7 +764,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
timer_setup(&sk->sk_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
{
@@ -782,7 +776,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
-EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_clear_xmit_timers_sync(struct sock *sk)
{
@@ -799,18 +792,6 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk)
sk_stop_timer_sync(sk, &sk->sk_timer);
}
-void inet_csk_delete_keepalive_timer(struct sock *sk)
-{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
-
-void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
-}
-EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
-
struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
@@ -828,7 +809,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num), sk->sk_uid);
+ htons(ireq->ir_num), sk_uid(sk));
security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -845,7 +826,6 @@ no_route:
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_route_req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
@@ -866,7 +846,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num), sk->sk_uid);
+ htons(ireq->ir_num), sk_uid(sk));
security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -904,16 +884,6 @@ static void syn_ack_recalc(struct request_sock *req,
req->num_timeout >= rskq_defer_accept - 1;
}
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
-{
- int err = req->rsk_ops->rtx_syn_ack(parent, req);
-
- if (!err)
- req->num_retrans++;
- return err;
-}
-EXPORT_SYMBOL(inet_rtx_syn_ack);
-
static struct request_sock *
reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
bool attach_listener)
@@ -1040,9 +1010,10 @@ static bool reqsk_queue_unlink(struct request_sock *req)
bool found = false;
if (sk_hashed(sk)) {
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
- spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ spinlock_t *lock;
+ lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
spin_lock(lock);
found = __sk_nulls_del_node_init_rcu(sk);
spin_unlock(lock);
@@ -1072,18 +1043,17 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
return __inet_csk_reqsk_queue_drop(sk, req, false);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
inet_csk_reqsk_queue_drop(sk, req);
reqsk_put(req);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put);
static void reqsk_timer_handler(struct timer_list *t)
{
- struct request_sock *req = from_timer(req, t, rsk_timer);
+ struct request_sock *req = timer_container_of(req, t, rsk_timer);
struct request_sock *nreq = NULL, *oreq = req;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk;
@@ -1153,7 +1123,7 @@ static void reqsk_timer_handler(struct timer_list *t)
req->rsk_ops->syn_ack_timeout(req);
if (!expire &&
(!resend ||
- !inet_rtx_syn_ack(sk_listener, req) ||
+ !tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
@@ -1223,7 +1193,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
inet_csk_reqsk_queue_added(sk);
return true;
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
const gfp_t priority)
@@ -1249,42 +1218,61 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
const gfp_t priority)
{
struct sock *newsk = sk_clone_lock(sk, priority);
+ struct inet_connection_sock *newicsk;
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
- if (newsk) {
- struct inet_connection_sock *newicsk = inet_csk(newsk);
+ if (!newsk)
+ return NULL;
- inet_sk_set_state(newsk, TCP_SYN_RECV);
- newicsk->icsk_bind_hash = NULL;
- newicsk->icsk_bind2_hash = NULL;
+ newicsk = inet_csk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
- inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
- inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
- inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
+ newicsk->icsk_bind_hash = NULL;
+ newicsk->icsk_bind2_hash = NULL;
- /* listeners have SOCK_RCU_FREE, not the children */
- sock_reset_flag(newsk, SOCK_RCU_FREE);
+ newinet->inet_dport = ireq->ir_rmt_port;
+ newinet->inet_num = ireq->ir_num;
+ newinet->inet_sport = htons(ireq->ir_num);
- inet_sk(newsk)->mc_list = NULL;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
- newsk->sk_mark = inet_rsk(req)->ir_mark;
- atomic64_set(&newsk->sk_cookie,
- atomic64_read(&inet_rsk(req)->ir_cookie));
+ newsk->sk_daddr = ireq->ir_rmt_addr;
+ newsk->sk_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
- newicsk->icsk_retransmits = 0;
- newicsk->icsk_backoff = 0;
- newicsk->icsk_probes_out = 0;
- newicsk->icsk_probes_tstamp = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+#endif
- /* Deinitialize accept_queue to trap illegal accesses. */
- memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
- inet_clone_ulp(req, newsk, priority);
+ inet_sk(newsk)->mc_list = NULL;
+
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+ newicsk->icsk_probes_tstamp = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0,
+ sizeof(newicsk->icsk_accept_queue));
+
+ inet_sk_set_state(newsk, TCP_SYN_RECV);
+
+ inet_clone_ulp(req, newsk, priority);
+
+ security_inet_csk_clone(newsk, req);
- security_inet_csk_clone(newsk, req);
- }
return newsk;
}
-EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
@@ -1316,7 +1304,7 @@ void inet_csk_destroy_sock(struct sock *sk)
EXPORT_SYMBOL(inet_csk_destroy_sock);
/* This function allows to force a closure of a socket after the call to
- * tcp/dccp_create_openreq_child().
+ * tcp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
__releases(&sk->sk_lock.slock)
@@ -1374,7 +1362,6 @@ int inet_csk_listen_start(struct sock *sk)
inet_sk_set_state(sk, TCP_CLOSE);
return err;
}
-EXPORT_SYMBOL_GPL(inet_csk_listen_start);
static void inet_child_forget(struct sock *sk, struct request_sock *req,
struct sock *child)
@@ -1469,7 +1456,6 @@ child_put:
sock_put(child);
return NULL;
}
-EXPORT_SYMBOL(inet_csk_complete_hashdance);
/*
* This routine closes sockets which have been at least partially
@@ -1547,34 +1533,16 @@ skip_child_forget:
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
-void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
- const struct inet_sock *inet = inet_sk(sk);
-
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->inet_daddr;
- sin->sin_port = inet->inet_dport;
-}
-EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
-
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
fl4 = &fl->u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
rt = NULL;
if (rt)
@@ -1602,4 +1570,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
out:
return dst;
}
-EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
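
inet_csk_rebuild_route() now delegates flow setup to inet_sk_init_flowi4(). As a rough illustration only (the real helper lives in include/net/inet_sock.h and may differ in detail), the sketch below shows the kind of work it takes over from the open-coded version removed above: honouring a source-route option's first hop and filling the flowi4 from the socket's addressing state.

    /* Rough, illustrative approximation of what inet_sk_init_flowi4()
     * is expected to do; not the kernel's definition.  Caller holds
     * rcu_read_lock(), as inet_csk_rebuild_route() does.
     */
    static inline void example_sk_init_flowi4(const struct inet_sock *inet,
                                              struct flowi4 *fl4)
    {
        const struct ip_options_rcu *opt = rcu_dereference(inet->inet_opt);
        const struct sock *sk = &inet->sk;
        __be32 daddr = inet->inet_daddr;

        if (opt && opt->opt.srr)        /* source route: use first hop */
            daddr = opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
                           ip_sock_rt_tos(sk), RT_SCOPE_UNIVERSE,
                           sk->sk_protocol, inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, inet->inet_dport,
                           inet->inet_sport, sk_uid(sk));
    }
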
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 321acc8abf17..2fa53b16fe77 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -160,7 +160,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
ext & (1 << (INET_DIAG_TCLASS - 1))) {
u32 classid = 0;
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUP_NET_CLASSID
classid = sock_cgroup_classid(&sk->sk_cgrp_data);
#endif
/* Fallback to socket priority if class id isn't set.
@@ -181,7 +181,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
goto errout;
#endif
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk));
r->idiag_inode = sock_i_ino(sk);
memset(&inet_sockopt, 0, sizeof(inet_sockopt));
@@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
- .idiag_fmem = sk_forward_alloc_get(sk),
+ .idiag_fmem = READ_ONCE(sk->sk_forward_alloc),
.idiag_tmem = sk_wmem_alloc_get(sk),
};
@@ -315,12 +315,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires =
- jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
- jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = icsk->icsk_probes_out;
@@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type)
switch (type) {
case TCPDIAG_GETSOCK:
return IPPROTO_TCP;
- case DCCPDIAG_GETSOCK:
- return IPPROTO_DCCP;
default:
return 0;
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index d179a2c84222..470ab17ceb51 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -133,7 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
struct inet_frag_queue *fq = ptr;
int count;
- count = del_timer_sync(&fq->timer) ? 1 : 0;
+ count = timer_delete_sync(&fq->timer) ? 1 : 0;
spin_lock_bh(&fq->lock);
fq->flags |= INET_FRAG_DROP;
@@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
}
spin_unlock_bh(&fq->lock);
- if (refcount_sub_and_test(count, &fq->refcnt))
- inet_frag_destroy(fq);
+ inet_frag_putn(fq, count);
}
static LLIST_HEAD(fqdir_free_list);
@@ -226,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir)
}
EXPORT_SYMBOL(fqdir_exit);
-void inet_frag_kill(struct inet_frag_queue *fq)
+void inet_frag_kill(struct inet_frag_queue *fq, int *refs)
{
- if (del_timer(&fq->timer))
- refcount_dec(&fq->refcnt);
+ if (timer_delete(&fq->timer))
+ (*refs)++;
if (!(fq->flags & INET_FRAG_COMPLETE)) {
struct fqdir *fqdir = fq->fqdir;
@@ -244,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq)
if (!READ_ONCE(fqdir->dead)) {
rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
fqdir->f->rhash_params);
- refcount_dec(&fq->refcnt);
+ (*refs)++;
} else {
fq->flags |= INET_FRAG_HASH_DEAD;
}
@@ -298,7 +297,7 @@ void inet_frag_destroy(struct inet_frag_queue *q)
reason = (q->flags & INET_FRAG_DROP) ?
SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
SKB_CONSUMED;
- WARN_ON(del_timer(&q->timer) != 0);
+ WARN_ON(timer_delete(&q->timer) != 0);
/* Release all fragment data. */
fqdir = q->fqdir;
@@ -328,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
- refcount_set(&q->refcnt, 3);
+ /* One reference for the timer, one for the hash table. */
+ refcount_set(&q->refcnt, 2);
return q;
}
@@ -350,15 +350,20 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
&q->node, f->rhash_params);
if (*prev) {
+ /* We could not insert in the hash table,
+ * we need to cancel what inet_frag_alloc()
+ * anticipated.
+ */
+ int refs = 1;
+
q->flags |= INET_FRAG_COMPLETE;
- inet_frag_kill(q);
- inet_frag_destroy(q);
+ inet_frag_kill(q, &refs);
+ inet_frag_putn(q, refs);
return NULL;
}
return q;
}
-/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
@@ -368,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
return NULL;
- rcu_read_lock();
-
prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
if (!prev)
fq = inet_frag_create(fqdir, key, &prev);
- if (!IS_ERR_OR_NULL(prev)) {
+ if (!IS_ERR_OR_NULL(prev))
fq = prev;
- if (!refcount_inc_not_zero(&fq->refcnt))
- fq = NULL;
- }
- rcu_read_unlock();
return fq;
}
EXPORT_SYMBOL(inet_frag_find);
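
With the lookup now done entirely under RCU and without touching the refcount, callers collect the references that kill/unlink paths give up into a local counter and release them once, outside the queue lock. A minimal sketch of that calling pattern, modelled on the ip_defrag() hunk further below (the function name here is made up for illustration):

    /* Sketch of the caller pattern the reworked API expects; the shape
     * mirrors ip_defrag(), the name is illustrative.
     */
    static void example_frag_input(struct fqdir *fqdir, void *key,
                                   struct sk_buff *skb)
    {
        struct inet_frag_queue *fq;
        int refs = 0;

        rcu_read_lock();
        fq = inet_frag_find(fqdir, key);    /* no reference taken */
        if (!fq) {
            rcu_read_unlock();
            kfree_skb(skb);
            return;
        }

        spin_lock(&fq->lock);
        /* queue the fragment; error paths call inet_frag_kill(fq, &refs),
         * which counts the timer/hash references it removed into 'refs'.
         */
        spin_unlock(&fq->lock);

        rcu_read_unlock();
        inet_frag_putn(fq, refs);    /* release the collected references */
    }
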
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9bfcfd016e18..ceeeec9b7290 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -23,11 +23,12 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
-#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/rps.h>
+#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
+#include <net/tcp.h>
u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -35,8 +36,8 @@ u32 inet_ehashfn(const struct net *net, const __be32 laddr,
{
net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
- return __inet_ehashfn(laddr, lport, faddr, fport,
- inet_ehash_secret + net_hash_mix(net));
+ return lport + __inet_ehashfn(laddr, 0, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
}
EXPORT_SYMBOL_GPL(inet_ehashfn);
@@ -76,7 +77,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->fastreuse = 0;
tb->fastreuseport = 0;
INIT_HLIST_HEAD(&tb->bhash2);
- hlist_add_head(&tb->node, &head->chain);
+ hlist_add_head_rcu(&tb->node, &head->chain);
}
return tb;
}
@@ -84,11 +85,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
if (hlist_empty(&tb->bhash2)) {
- __hlist_del(&tb->node);
- kmem_cache_free(cachep, tb);
+ hlist_del_rcu(&tb->node);
+ kfree_rcu(tb, rcu);
}
}
@@ -176,7 +177,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
*/
static void __inet_put_port(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
@@ -201,7 +202,7 @@ static void __inet_put_port(struct sock *sk)
}
spin_unlock(&head2->lock);
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(tb);
spin_unlock(&head->lock);
}
@@ -215,7 +216,7 @@ EXPORT_SYMBOL(inet_put_port);
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
- struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *table = tcp_get_hashinfo(sk);
unsigned short port = inet_sk(child)->inet_num;
struct inet_bind_hashbucket *head, *head2;
bool created_inet_bind_bucket = false;
@@ -285,7 +286,7 @@ bhash2_find:
error:
if (created_inet_bind_bucket)
- inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(tb);
spin_unlock(&head2->lock);
spin_unlock(&head->lock);
return -ENOMEM;
@@ -537,7 +538,9 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup,
+ u32 hash)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
@@ -548,14 +551,25 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
int sdif = l3mdev_master_ifindex_by_index(net, dif);
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
- unsigned int hash = inet_ehashfn(net, daddr, lport,
- saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
- spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
- struct sock *sk2;
- const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
+ const struct hlist_nulls_node *node;
+ struct sock *sk2;
+ spinlock_t *lock;
+
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet_match(net, sk2, acookie, ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+ }
+ lock = inet_ehash_lockp(hinfo, hash);
spin_lock(lock);
sk_nulls_for_each(sk2, node, &head->chain) {
@@ -655,7 +669,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk,
*/
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_ehash_bucket *head;
struct hlist_nulls_head *list;
spinlock_t *lock;
@@ -700,15 +714,15 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
}
return ok;
}
-EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+EXPORT_IPV6_MOD(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
const struct hlist_nulls_node *node;
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
if (sk2 != sk &&
@@ -716,7 +730,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
inet_csk(sk2)->icsk_bind_hash == tb &&
- sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
@@ -727,7 +741,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
int __inet_hash(struct sock *sk, struct sock *osk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_listen_hashbucket *ilb2;
int err = 0;
@@ -758,7 +772,7 @@ unlock:
return err;
}
-EXPORT_SYMBOL(__inet_hash);
+EXPORT_IPV6_MOD(__inet_hash);
int inet_hash(struct sock *sk)
{
@@ -769,15 +783,15 @@ int inet_hash(struct sock *sk)
return err;
}
-EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
if (sk_unhashed(sk))
return;
+ sock_rps_delete_flow(sk);
if (sk->sk_state == TCP_LISTEN) {
struct inet_listen_hashbucket *ilb2;
@@ -810,7 +824,7 @@ void inet_unhash(struct sock *sk)
spin_unlock_bh(lock);
}
}
-EXPORT_SYMBOL_GPL(inet_unhash);
+EXPORT_IPV6_MOD(inet_unhash);
static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
const struct net *net, unsigned short port,
@@ -861,7 +875,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
u32 hash;
#if IS_ENABLED(CONFIG_IPV6)
@@ -889,7 +903,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family)
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
- struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2, *new_tb2;
int l3mdev = inet_sk_bound_l3mdev(sk);
@@ -969,14 +983,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
-EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
+EXPORT_IPV6_MOD(inet_bhash2_update_saddr);
void inet_bhash2_reset_saddr(struct sock *sk)
{
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
__inet_bhash2_update_saddr(sk, NULL, 0, true);
}
-EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
+EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
* Note that we use 32bit integers (vs RFC 'short integers')
@@ -993,8 +1007,10 @@ static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u64 port_offset,
+ u32 hash_port0,
int (*check_established)(struct inet_timewait_death_row *,
- struct sock *, __u16, struct inet_timewait_sock **))
+ struct sock *, __u16, struct inet_timewait_sock **,
+ bool rcu_lookup, u32 hash))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_bind_hashbucket *head, *head2;
@@ -1012,7 +1028,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
if (port) {
local_bh_disable();
- ret = check_established(death_row, sk, port, NULL);
+ ret = check_established(death_row, sk, port, NULL, false,
+ hash_port0 + port);
local_bh_enable();
return ret;
}
@@ -1048,6 +1065,22 @@ other_parity_scan:
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tb, &head->chain, node) {
+ if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+ continue;
+ if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+ rcu_read_unlock();
+ goto next_port;
+ }
+ if (!check_established(death_row, sk, port, &tw, true,
+ hash_port0 + port))
+ break;
+ rcu_read_unlock();
+ goto next_port;
+ }
+ rcu_read_unlock();
+
spin_lock_bh(&head->lock);
/* Does not bother with rcv_saddr checks, because
@@ -1057,12 +1090,13 @@ other_parity_scan:
if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
- goto next_port;
+ goto next_port_unlock;
WARN_ON(hlist_empty(&tb->bhash2));
if (!check_established(death_row, sk,
- port, &tw))
+ port, &tw, false,
+ hash_port0 + port))
goto ok;
- goto next_port;
+ goto next_port_unlock;
}
}
@@ -1076,8 +1110,9 @@ other_parity_scan:
tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok;
-next_port:
+next_port_unlock:
spin_unlock_bh(&head->lock);
+next_port:
cond_resched();
}
@@ -1149,7 +1184,7 @@ error:
spin_unlock(&head2->lock);
if (tb_created)
- inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(tb);
spin_unlock(&head->lock);
if (tw)
@@ -1166,14 +1201,20 @@ error:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct net *net = sock_net(sk);
u64 port_offset = 0;
+ u32 hash_port0;
if (!inet_sk(sk)->inet_num)
port_offset = inet_sk_port_offset(sk);
- return __inet_hash_connect(death_row, sk, port_offset,
+
+ hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
+ inet->inet_daddr, inet->inet_dport);
+
+ return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
__inet_check_established);
}
-EXPORT_SYMBOL_GPL(inet_hash_connect);
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
@@ -1224,32 +1265,45 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
init_hashinfo_lhash2(h);
return 0;
}
-EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
unsigned int locksz = sizeof(spinlock_t);
unsigned int i, nblocks = 1;
+ spinlock_t *ptr = NULL;
- if (locksz != 0) {
- /* allocate 2 cache lines or at least one spinlock per cpu */
- nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
- nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+ if (locksz == 0)
+ goto set_mask;
- /* no more locks than number of hash buckets */
- nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+ /* Allocate 2 cache lines or at least one spinlock per cpu. */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();
- hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
- if (!hashinfo->ehash_locks)
- return -ENOMEM;
+ /* At least one page per NUMA node. */
+ nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);
+
+ nblocks = roundup_pow_of_two(nblocks);
- for (i = 0; i < nblocks; i++)
- spin_lock_init(&hashinfo->ehash_locks[i]);
+ /* No more locks than number of hash buckets. */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ if (num_online_nodes() > 1) {
+ /* Use vmalloc() to allow NUMA policy to spread pages
+ * on all available nodes if desired.
+ */
+ ptr = vmalloc_array(nblocks, locksz);
+ }
+ if (!ptr) {
+ ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
}
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&ptr[i]);
+ hashinfo->ehash_locks = ptr;
+set_mask:
hashinfo->ehash_locks_mask = nblocks - 1;
return 0;
}
-EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
unsigned int ehash_entries)
@@ -1285,7 +1339,6 @@ free_hashinfo:
err:
return NULL;
}
-EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
@@ -1296,4 +1349,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
vfree(hashinfo->ehash);
kfree(hashinfo);
}
-EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
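
The reshuffled inet_ehashfn() keeps the local port out of the hashed input and adds it afterwards, which is what lets inet_hash_connect() hash the 3-tuple once (hash_port0) and derive each candidate port's hash by a plain addition inside the port-scan loop. A small sketch of that identity:

    /* The per-port established hash is now the port-0 hash plus the
     * local port, so the connect() scan hashes only once.
     */
    static u32 example_connect_hash(const struct net *net, __be32 laddr,
                                    __be32 faddr, __be16 fport, u16 port)
    {
        u32 hash_port0 = inet_ehashfn(net, laddr, 0, faddr, fport);

        /* equals inet_ehashfn(net, laddr, port, faddr, fport) */
        return hash_port0 + port;
    }
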
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 337390ba85b4..875ff923a8ed 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
tw->tw_tb = NULL;
tw->tw_tb2 = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(tb);
__sock_put((struct sock *)tw);
}
@@ -166,11 +166,10 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
spin_unlock(lock);
local_bh_enable();
}
-EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);
static void tw_timer_handler(struct timer_list *t)
{
- struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
+ struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer);
inet_twsk_kill(tw);
}
@@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
return tw;
}
-EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
@@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
-EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
void inet_twsk_purge(struct inet_hashinfo *hashinfo)
@@ -365,4 +362,3 @@ restart:
rcu_read_unlock();
}
}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 5ab56f4cb529..7b1e0a2d6906 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -60,7 +60,7 @@ void inet_peer_base_init(struct inet_peer_base *bp)
seqlock_init(&bp->lock);
bp->total = 0;
}
-EXPORT_SYMBOL_GPL(inet_peer_base_init);
+EXPORT_IPV6_MOD_GPL(inet_peer_base_init);
#define PEER_MAX_GC 32
@@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
{
struct rb_node **pp, *parent, *next;
struct inet_peer *p;
+ u32 now;
pp = &base->rb_root.rb_node;
parent = NULL;
@@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
p = rb_entry(parent, struct inet_peer, rb_node);
cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
- if (!refcount_inc_not_zero(&p->refcnt))
- break;
+ now = jiffies;
+ if (READ_ONCE(p->dtime) != now)
+ WRITE_ONCE(p->dtime, now);
return p;
}
if (gc_stack) {
@@ -150,9 +152,6 @@ static void inet_peer_gc(struct inet_peer_base *base,
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
- /* The READ_ONCE() pairs with the WRITE_ONCE()
- * in inet_putpeer()
- */
delta = (__u32)jiffies - READ_ONCE(p->dtime);
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
@@ -168,31 +167,23 @@ static void inet_peer_gc(struct inet_peer_base *base,
}
}
+/* Must be called under RCU : No refcount change is done here. */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
- const struct inetpeer_addr *daddr,
- int create)
+ const struct inetpeer_addr *daddr)
{
struct inet_peer *p, *gc_stack[PEER_MAX_GC];
struct rb_node **pp, *parent;
unsigned int gc_cnt, seq;
- int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
- rcu_read_lock();
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
- invalidated = read_seqretry(&base->lock, seq);
- rcu_read_unlock();
if (p)
return p;
- /* If no writer did a change during our lookup, we can return early. */
- if (!create && !invalidated)
- return NULL;
-
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
@@ -201,12 +192,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
gc_cnt = 0;
p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
- if (!p && create) {
+ if (!p) {
p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
if (p) {
p->daddr = *daddr;
p->dtime = (__u32)jiffies;
- refcount_set(&p->refcnt, 2);
+ refcount_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
@@ -227,19 +218,13 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
return p;
}
-EXPORT_SYMBOL_GPL(inet_getpeer);
+EXPORT_IPV6_MOD_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
- /* The WRITE_ONCE() pairs with itself (we run lockless)
- * and the READ_ONCE() in inet_peer_gc()
- */
- WRITE_ONCE(p->dtime, (__u32)jiffies);
-
if (refcount_dec_and_test(&p->refcnt))
kfree_rcu(p, rcu);
}
-EXPORT_SYMBOL_GPL(inet_putpeer);
/*
* Check transmit rate limitation for given message.
@@ -261,26 +246,30 @@ EXPORT_SYMBOL_GPL(inet_putpeer);
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
- unsigned long now, token;
+ unsigned long now, token, otoken, delta;
bool rc = false;
if (!peer)
return true;
- token = peer->rate_tokens;
+ token = otoken = READ_ONCE(peer->rate_tokens);
now = jiffies;
- token += now - peer->rate_last;
- peer->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
+ delta = now - READ_ONCE(peer->rate_last);
+ if (delta) {
+ WRITE_ONCE(peer->rate_last, now);
+ token += delta;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ }
if (token >= timeout) {
token -= timeout;
rc = true;
}
- peer->rate_tokens = token;
+ if (token != otoken)
+ WRITE_ONCE(peer->rate_tokens, token);
return rc;
}
-EXPORT_SYMBOL(inet_peer_xrlim_allow);
+EXPORT_IPV6_MOD(inet_peer_xrlim_allow);
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
@@ -297,4 +286,4 @@ void inetpeer_invalidate_tree(struct inet_peer_base *base)
base->total = 0;
}
-EXPORT_SYMBOL(inetpeer_invalidate_tree);
+EXPORT_IPV6_MOD(inetpeer_invalidate_tree);
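
inet_peer_xrlim_allow() is rewritten so that concurrent, lockless callers only store rate_last/rate_tokens when the values actually change; the token-bucket arithmetic itself is unchanged. A standalone rendering of that bucket (plain C, not kernel code) for reference:

    /* Standalone rendering of the token bucket above: tokens grow with
     * elapsed time, are capped at XRLIM_BURST_FACTOR * timeout, and one
     * 'timeout' worth of tokens is spent per permitted event.
     */
    #include <stdbool.h>

    #define XRLIM_BURST_FACTOR 6

    struct xrlim {
        unsigned long rate_tokens;
        unsigned long rate_last;    /* timestamp of last refill */
    };

    static bool xrlim_allow(struct xrlim *x, unsigned long now,
                            unsigned long timeout)
    {
        unsigned long token = x->rate_tokens;
        unsigned long delta = now - x->rate_last;
        bool ok = false;

        if (delta) {
            x->rate_last = now;
            token += delta;
            if (token > XRLIM_BURST_FACTOR * timeout)
                token = XRLIM_BURST_FACTOR * timeout;
        }
        if (token >= timeout) {
            token -= timeout;
            ok = true;
        }
        x->rate_tokens = token;
        return ok;
    }
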
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 07036a2943c1..b2584cce90ae 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -76,21 +76,27 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct net *net = q->fqdir->net;
-
const struct frag_v4_compare_key *key = a;
+ struct net *net = q->fqdir->net;
+ struct inet_peer *p = NULL;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->fqdir->max_dist ?
- inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
- NULL;
+ if (q->fqdir->max_dist) {
+ rcu_read_lock();
+ p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
+ if (p && !refcount_inc_not_zero(&p->refcnt))
+ p = NULL;
+ rcu_read_unlock();
+ }
+ qp->peer = p;
}
static void ip4_frag_free(struct inet_frag_queue *q)
@@ -102,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
inet_putpeer(qp->peer);
}
-
-/* Destruction primitives. */
-
-static void ipq_put(struct ipq *ipq)
-{
- inet_frag_put(&ipq->q);
-}
-
-/* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static void ipq_kill(struct ipq *ipq)
-{
- inet_frag_kill(&ipq->q);
-}
-
static bool frag_expire_skip_icmp(u32 user)
{
return user == IP_DEFRAG_AF_PACKET ||
@@ -133,11 +123,12 @@ static bool frag_expire_skip_icmp(u32 user)
static void ip_expire(struct timer_list *t)
{
enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
const struct iphdr *iph;
struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
+ int refs = 1;
qp = container_of(frag, struct ipq, q);
net = qp->q.fqdir->net;
@@ -154,7 +145,7 @@ static void ip_expire(struct timer_list *t)
goto out;
qp->q.flags |= INET_FRAG_DROP;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, &refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -197,7 +188,7 @@ out:
out_rcu_unlock:
rcu_read_unlock();
kfree_skb_reason(head, reason);
- ipq_put(qp);
+ inet_frag_putn(&qp->q, refs);
}
/* Find the correct entry in the "incomplete datagrams" queue for
@@ -273,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp)
}
/* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
{
struct net *net = qp->q.fqdir->net;
int ihl, end, flags, offset;
@@ -293,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
goto err;
}
@@ -377,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
if (err)
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
return err;
}
@@ -397,7 +388,7 @@ insert_error:
err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
- inet_frag_kill(&qp->q);
+ inet_frag_kill(&qp->q, refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
kfree_skb_reason(skb, reason);
@@ -411,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp)
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
@@ -419,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
int len, err;
u8 ecn;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
@@ -484,25 +476,28 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+ struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
+ rcu_read_lock();
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
- int ret;
+ int ret, refs = 0;
spin_lock(&qp->q.lock);
- ret = ip_frag_queue(qp, skb);
+ ret = ip_frag_queue(qp, skb, &refs);
spin_unlock(&qp->q.lock);
- ipq_put(qp);
+ rcu_read_unlock();
+ inet_frag_putn(&qp->q, refs);
return ret;
}
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f1f31ebfc793..f5b9004d6938 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
- unsigned int data_len = 0;
struct ip_tunnel *t;
if (tpi->proto == htons(ETH_P_TEB))
@@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return 0;
- data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
break;
case ICMP_REDIRECT:
@@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info,
}
#if IS_ENABLED(CONFIG_IPV6)
- if (tpi->proto == htons(ETH_P_IPV6) &&
- !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
- type, data_len))
- return 0;
+ if (tpi->proto == htons(ETH_P_IPV6)) {
+ unsigned int data_len = 0;
+
+ if (type == ICMP_TIME_EXCEEDED)
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
+
+ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+ type, data_len))
+ return 0;
+ }
#endif
if (t->parms.iph.daddr == 0 ||
@@ -924,15 +928,18 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi4 fl4;
+ struct flowi4 fl4 = {
+ .flowi4_oif = t->parms.link,
+ .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_proto = IPPROTO_GRE,
+ .saddr = t->parms.iph.saddr,
+ .daddr = t->parms.iph.daddr,
+ .fl4_gre_key = t->parms.o_key,
+ };
struct rtable *rt;
- rt = ip_route_output_gre(t->net, &fl4,
- t->parms.iph.daddr,
- t->parms.iph.saddr,
- t->parms.o_key,
- t->parms.iph.tos & INET_DSCP_MASK,
- t->parms.link);
+ rt = ip_route_output_key(t->net, &fl4);
if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
@@ -1059,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net)
return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
-static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipgre_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
- .exit_batch_rtnl = ipgre_exit_batch_rtnl,
+ .exit_rtnl = ipgre_exit_rtnl,
.id = &ipgre_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1389,10 +1395,12 @@ ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
return 0;
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ipgre_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
int err;
@@ -1404,13 +1412,16 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_newlink(dev, tb, &p, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
}
-static int erspan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int erspan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
int err;
@@ -1422,7 +1433,8 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev,
err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
if (err)
return err;
- return ip_tunnel_newlink(dev, tb, &p, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -1690,6 +1702,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = {
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
u8 name_assign_type)
{
+ struct rtnl_newlink_params params = { .src_net = net };
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
LIST_HEAD(list_kill);
@@ -1697,6 +1710,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
int err;
memset(&tb, 0, sizeof(tb));
+ params.tb = tb;
dev = rtnl_create_link(net, name, name_assign_type,
&ipgre_tap_ops, tb, NULL);
@@ -1707,7 +1721,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t = netdev_priv(dev);
t->collect_md = true;
- err = ipgre_newlink(net, dev, tb, NULL, NULL);
+ err = ipgre_newlink(dev, &params, NULL);
if (err < 0) {
free_netdev(dev);
return ERR_PTR(err);
@@ -1737,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net)
return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
-static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipgre_tap_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill);
}
static struct pernet_operations ipgre_tap_net_ops = {
.init = ipgre_tap_init_net,
- .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl,
+ .exit_rtnl = ipgre_tap_exit_rtnl,
.id = &gre_tap_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1757,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net)
&erspan_link_ops, "erspan0");
}
-static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
+static void __net_exit erspan_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill);
}
static struct pernet_operations erspan_net_ops = {
.init = erspan_init_net,
- .exit_batch_rtnl = erspan_exit_batch_rtnl,
+ .exit_rtnl = erspan_exit_rtnl,
.id = &erspan_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f0a4dda246ab..fc323994b1fa 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -226,6 +226,12 @@ resubmit:
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
+ return 0;
+ }
+
skb_clear_delivery_time(skb);
__skb_pull(skb, skb_network_header_len(skb));
@@ -314,13 +320,13 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
-static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
- int err, drop_reason;
struct rtable *rt;
+ int drop_reason;
if (ip_can_use_hint(skb, iph, hint)) {
drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
@@ -345,9 +351,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
break;
case IPPROTO_UDP:
if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
- err = udp_v4_early_demux(skb);
- if (unlikely(err))
+ drop_reason = udp_v4_early_demux(skb);
+ if (unlikely(drop_reason))
goto drop_error;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
@@ -442,7 +449,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
+ ret = ip_rcv_finish_core(net, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -589,8 +596,7 @@ static struct sk_buff *ip_extract_route_hint(const struct net *net,
return skb;
}
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
- struct list_head *head)
+static void ip_list_rcv_finish(struct net *net, struct list_head *head)
{
struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
@@ -607,7 +613,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
@@ -633,7 +639,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
{
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
head, dev, NULL, ip_rcv_finish);
- ip_list_rcv_finish(net, NULL, head);
+ ip_list_rcv_finish(net, head);
}
/* Receive a list of IP packets */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0065b1996c94..84e7f8a2f50f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -75,7 +75,6 @@
#include <net/checksum.h>
#include <net/gso.h>
#include <net/inetpeer.h>
-#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
#include <net/inet_dscp.h>
#include <linux/bpf-cgroup.h>
@@ -117,7 +116,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, NULL, skb_dst_dev(skb),
dst_output);
}
@@ -200,7 +199,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = dst_rtable(dst);
- struct net_device *dev = dst->dev;
+ struct net_device *dev = dst_dev(dst);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
@@ -426,15 +425,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
+ struct net_device *dev, *indev = skb->dev;
+ int ret_val;
+ rcu_read_lock();
+ dev = skb_dst_dev_rcu(skb);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, indev, dev,
- ip_finish_output,
- !(IPCB(skb)->flags & IPSKB_REROUTED));
+ ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+ rcu_read_unlock();
+ return ret_val;
}
EXPORT_SYMBOL(ip_output);
@@ -478,24 +482,16 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
/* Make sure we can route this packet. */
rt = dst_rtable(__sk_dst_check(sk, 0));
if (!rt) {
- __be32 daddr;
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* sctp_v4_xmit() uses its own DSCP value */
+ fl4->flowi4_tos = tos & INET_DSCP_MASK;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(net, fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport,
- inet->inet_sport,
- sk->sk_protocol,
- tos & INET_DSCP_MASK,
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
@@ -1023,7 +1019,8 @@ static int __ip_append_data(struct sock *sk,
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
- uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
@@ -1169,7 +1166,10 @@ alloc_new_skb:
/* [!] NOTE: copy will be negative if pagedlen>0
* because then the equation reduces to -fraggap.
*/
- if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
@@ -1213,8 +1213,9 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
@@ -1226,8 +1227,7 @@ alloc_new_skb:
if (WARN_ON_ONCE(copy > msg->msg_iter.count))
goto error;
- err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
- sk->sk_allocation);
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
if (err < 0)
goto error;
copy = err;
@@ -1252,7 +1252,8 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
@@ -1328,7 +1329,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
cork->mark = ipc->sockc.mark;
- cork->priority = ipc->priority;
+ cork->priority = ipc->sockc.priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
@@ -1465,7 +1466,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt);
}
- skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
+ skb->priority = cork->priority;
skb->mark = cork->mark;
if (sk_is_tcp(sk))
skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
@@ -1643,7 +1644,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
if (IS_ERR(rt))
return;
- inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
+ inet_sk(sk)->tos = arg->tos;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
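
The getfrag callback in __ip_append_data() is now invoked through INDIRECT_CALL_1() with ip_generic_getfrag as the expected target, so the common UDP/raw case avoids a retpoline-era indirect branch. Schematically, the wrapper (include/linux/indirect_call_wrapper.h) reduces to a compare-and-direct-call; shown here for illustration:

    /* Schematic of the wrapper used above when retpolines are enabled:
     * compare against the likely callback and call it directly on a
     * match, falling back to the indirect call otherwise.
     */
    #define EXAMPLE_INDIRECT_CALL_1(f, f1, ...)                \
    ({                                                         \
        likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__);    \
    })
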
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf377377b52d..6d9c5c20b1c4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -128,20 +128,20 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
- char *secdata;
- u32 seclen, secid;
+ struct lsm_context ctx;
+ u32 secid;
int err;
err = security_socket_getpeersec_dgram(NULL, skb, &secid);
if (err)
return;
- err = security_secid_to_secctx(secid, &secdata, &seclen);
- if (err)
+ err = security_secid_to_secctx(secid, &ctx);
+ if (err < 0)
return;
- put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
- security_release_secctx(secdata, seclen);
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context);
+ security_release_secctx(&ctx);
}
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
@@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
- ipc->priority = rt_tos2priority(ipc->tos);
+ ipc->sockc.priority = rt_tos2priority(ipc->tos);
break;
case IP_PROTOCOL:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 25505f9b724c..aaeb5d16f0c9 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -40,6 +40,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
@@ -242,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strscpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
if (strlen(ops->kind) > (IFNAMSIZ - 3))
goto failed;
- strcpy(name, ops->kind);
+ strscpy(name, ops->kind);
strcat(name, "%d");
}
@@ -294,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
- iph->tos & INET_DSCP_MASK, dev_net(dev),
+ iph->tos & INET_DSCP_MASK, tunnel->net,
tunnel->parms.link, tunnel->fwmark, 0, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -611,7 +612,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
tunnel_id_to_key32(key->tun_id),
- tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark,
+ tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
skb_get_hash(skb), key->flow_flags);
if (!tunnel_hlen)
@@ -667,7 +668,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ip_tunnel_adj_headroom(dev, headroom);
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
- df, !net_eq(tunnel->net, dev_net(dev)));
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
return;
tx_error:
DEV_STATS_INC(dev, tx_errors);
@@ -774,7 +775,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, tos & INET_DSCP_MASK,
- dev_net(dev), READ_ONCE(tunnel->parms.link),
+ tunnel->net, READ_ONCE(tunnel->parms.link),
tunnel->fwmark, skb_get_hash(skb), 0);
if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
@@ -856,7 +857,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ip_tunnel_adj_headroom(dev, max_headroom);
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
- df, !net_eq(tunnel->net, dev_net(dev)));
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
return;
#if IS_ENABLED(CONFIG_IPV6)
@@ -1162,7 +1163,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
* Allowing to move it to another netns is clearly unsafe.
*/
if (!IS_ERR(itn->fb_tunnel_dev)) {
- itn->fb_tunnel_dev->netns_local = true;
+ itn->fb_tunnel_dev->netns_immutable = true;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
itn->type = itn->fb_tunnel_dev->type;
@@ -1173,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
-static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
- struct list_head *head,
- struct rtnl_link_ops *ops)
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+ struct rtnl_link_ops *ops,
+ struct list_head *head)
{
+ struct ip_tunnel_net *itn = net_generic(net, id);
struct net_device *dev, *aux;
int h;
+ ASSERT_RTNL_NET(net);
+
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == ops)
unregister_netdevice_queue(dev, head);
@@ -1197,27 +1201,13 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
unregister_netdevice_queue(t->dev, head);
}
}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
-void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
- struct rtnl_link_ops *ops,
- struct list_head *dev_to_kill)
-{
- struct ip_tunnel_net *itn;
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list) {
- itn = net_generic(net, id);
- ip_tunnel_destroy(net, itn, dev_to_kill, ops);
- }
-}
-EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
-
-int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm_kern *p, __u32 fwmark)
+int ip_tunnel_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
+ __u32 fwmark)
{
struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
struct ip_tunnel_net *itn;
int mtu;
int err;
@@ -1326,7 +1316,6 @@ int ip_tunnel_init(struct net_device *dev)
}
tunnel->dev = dev;
- tunnel->net = dev_net(dev);
strscpy(tunnel->parms.name, dev->name);
iph->version = 4;
iph->ihl = 5;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index a3676155be78..cc9915543637 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -49,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps);
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
- __u8 tos, __u8 ttl, __be16 df, bool xnet)
+ __u8 tos, __u8 ttl, __be16 df, bool xnet,
+ u16 ipcb_flags)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
@@ -62,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_clear_hash_if_not_l4(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ IPCB(skb)->flags = ipcb_flags;
/* Push down and install the IP header. */
skb_push(skb, sizeof(struct iphdr));
@@ -416,7 +418,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
skb_dst_update_pmtu_no_confirm(skb, mtu);
- if (!reply || skb->pkt_type == PACKET_HOST)
+ if (!reply)
return 0;
if (skb->protocol == htons(ETH_P_IP))
@@ -451,7 +453,7 @@ static const struct nla_policy
geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
[LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
- [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+ [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
};
static const struct nla_policy
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f0b4419cef34..95b6bb78fcd2 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -229,7 +229,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error_icmp;
}
- tdev = dst->dev;
+ tdev = dst_dev(dst);
if (tdev == dev) {
dst_release(dst);
@@ -259,7 +259,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
xmit:
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
skb_dst_set(skb, dst);
- skb->dev = skb_dst(skb)->dev;
+ skb->dev = skb_dst_dev(skb);
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
@@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net)
return 0;
}
-static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit vti_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}
static struct pernet_operations vti_net_ops = {
.init = vti_init_net,
- .exit_batch_rtnl = vti_exit_batch_rtnl,
+ .exit_rtnl = vti_exit_rtnl,
.id = &vti_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -575,15 +574,18 @@ static void vti_netlink_parms(struct nlattr *data[],
*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}
-static int vti_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int vti_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct nlattr **data = params->data;
struct ip_tunnel_parm_kern parms;
+ struct nlattr **tb = params->tb;
__u32 fwmark = 0;
vti_netlink_parms(data, &parms, &fwmark);
- return ip_tunnel_newlink(dev, tb, &parms, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
+ &parms, fwmark);
}
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 5a4fb2539b08..9a45aed508d1 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
}
/* We always hold one tunnel user reference to indicate a tunnel */
+static struct lock_class_key xfrm_state_lock_key;
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
struct net *net = xs_net(x);
@@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t = xfrm_state_alloc(net);
if (!t)
goto out;
+ lockdep_set_class(&t->lock, &xfrm_state_lock_key);
t->id.proto = IPPROTO_IPIP;
t->id.spi = x->props.saddr.a4;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index c56b6fe6f0d7..22a7889876c1 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -274,9 +274,9 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
- next_msg = start + msecs_to_jiffies(20000);
+ next_msg = start + secs_to_jiffies(20);
while (time_before(jiffies, start +
- msecs_to_jiffies(carrier_timeout * 1000))) {
+ secs_to_jiffies(carrier_timeout))) {
int wait, elapsed;
rtnl_lock();
@@ -295,7 +295,7 @@ static int __init ic_open_devs(void)
elapsed = jiffies_to_msecs(jiffies - start);
wait = (carrier_timeout * 1000 - elapsed + 500) / 1000;
pr_info("Waiting up to %d more seconds for network.\n", wait);
- next_msg = jiffies + msecs_to_jiffies(20000);
+ next_msg = jiffies + secs_to_jiffies(20);
}
have_carrier:
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index dc0db5895e0e..3e03af073a1c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -436,11 +436,13 @@ static void ipip_netlink_parms(struct nlattr *data[],
*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-static int ipip_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ipip_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel_encap ipencap;
struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
@@ -453,7 +455,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
}
ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
- return ip_tunnel_newlink(dev, tb, &p, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -601,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net)
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
-static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net,
- struct list_head *dev_to_kill)
+static void __net_exit ipip_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops,
- dev_to_kill);
+ ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
- .exit_batch_rtnl = ipip_exit_batch_rtnl,
+ .exit_rtnl = ipip_exit_rtnl,
.id = &ipip_net_id,
.size = sizeof(struct ip_tunnel_net),
};
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c5b8ec5c0a8c..e86a8a862c41 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -120,11 +120,6 @@ static void ipmr_expire_process(struct timer_list *t);
lockdep_rtnl_is_held() || \
list_empty(&net->ipv4.mr_tables))
-static bool ipmr_can_free_table(struct net *net)
-{
- return !check_net(net) || !net->ipv4.mr_rules_ops;
-}
-
static struct mr_table *ipmr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -317,11 +312,6 @@ EXPORT_SYMBOL(ipmr_rule_default);
#define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
-static bool ipmr_can_free_table(struct net *net)
-{
- return !check_net(net);
-}
-
static struct mr_table *ipmr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -437,7 +427,7 @@ static void ipmr_free_table(struct mr_table *mrt)
{
struct net *net = read_pnet(&mrt->net);
- WARN_ON_ONCE(!ipmr_can_free_table(net));
+ WARN_ON_ONCE(!mr_can_free_table(net));
timer_shutdown_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
@@ -563,7 +553,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
dev->needs_free_netdev = true;
- dev->netns_local = true;
+ dev->netns_immutable = true;
}
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
@@ -775,7 +765,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
/* Timer process for the unresolved queue. */
static void ipmr_expire_process(struct timer_list *t)
{
- struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
+ struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer);
struct mr_mfc *c, *next;
unsigned long expires;
unsigned long now;
@@ -831,7 +821,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
cache->mfc_un.res.maxvif = vifi + 1;
}
}
- cache->mfc_un.res.lastuse = jiffies;
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}
static int vif_add(struct net *net, struct mr_table *mrt,
@@ -911,7 +901,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
(VIFF_TUNNEL | VIFF_REGISTER));
- err = dev_get_port_parent_id(dev, &ppid, true);
+ err = netif_get_port_parent_id(dev, &ppid, true);
if (err == 0) {
memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
v->dev_parent_id.id_len = ppid.id_len;
@@ -1289,7 +1279,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
}
}
if (list_empty(&mrt->mfc_unres_queue))
- del_timer(&mrt->ipmr_expire_timer);
+ timer_delete(&mrt->ipmr_expire_timer);
spin_unlock_bh(&mfc_unres_lock);
if (found) {
@@ -1681,9 +1671,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
rcu_read_lock();
c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
if (c) {
- sr->pktcnt = c->_c.mfc_un.res.pkt;
- sr->bytecnt = c->_c.mfc_un.res.bytes;
- sr->wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
return 0;
}
@@ -1753,9 +1743,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1863,20 +1853,19 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
/* Processing handlers for ipmr_forward, under rcu_read_lock() */
-static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- int in_vifi, struct sk_buff *skb, int vifi)
+static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
struct net_device *vif_dev;
- struct net_device *dev;
struct rtable *rt;
struct flowi4 fl4;
int encap = 0;
vif_dev = vif_dev_read(vif);
if (!vif_dev)
- goto out_free;
+ return -1;
if (vif->flags & VIFF_REGISTER) {
WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
@@ -1884,12 +1873,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
DEV_STATS_INC(vif_dev, tx_packets);
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
- goto out_free;
+ return -1;
}
- if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
- goto out_free;
-
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
vif->remote, vif->local,
@@ -1897,7 +1883,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
IPPROTO_IPIP,
iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
- goto out_free;
+ return -1;
encap = sizeof(struct iphdr);
} else {
rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
@@ -1905,11 +1891,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
IPPROTO_IPIP,
iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
- goto out_free;
+ return -1;
}
- dev = rt->dst.dev;
-
if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
* allow to send ICMP, so that packets will disappear
@@ -1917,14 +1901,14 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
*/
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
- goto out_free;
+ return -1;
}
- encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
- goto out_free;
+ return -1;
}
WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
@@ -1944,6 +1928,22 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
}
+ return 0;
+}
+
+static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
+ int in_vifi, struct sk_buff *skb, int vifi)
+{
+ struct rtable *rt;
+
+ if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+ goto out_free;
+
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ rt = skb_rtable(skb);
+
IPCB(skb)->flags |= IPSKB_FORWARDED;
/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
@@ -1957,7 +1957,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dev,
+ net, NULL, skb, skb->dev, rt->dst.dev,
ipmr_forward_finish);
return;
@@ -1965,6 +1965,19 @@ out_free:
kfree_skb(skb);
}
+static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ ip_mc_output(net, NULL, skb);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
/* Called with mrt_lock or rcu_read_lock() */
static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
@@ -1988,9 +2001,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
int vif, ct;
vif = c->_c.mfc_parent;
- c->_c.mfc_un.res.pkt++;
- c->_c.mfc_un.res.bytes += skb->len;
- c->_c.mfc_un.res.lastuse = jiffies;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
struct mfc_cache *cache_proxy;
@@ -2021,7 +2034,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
- c->_c.mfc_un.res.wrong_if++;
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -2075,8 +2088,8 @@ forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, true_vifi,
- skb2, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi,
+ skb2, psend);
}
psend = ct;
}
@@ -2087,10 +2100,10 @@ last_forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, true_vifi, skb2,
- psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb2,
+ psend);
} else {
- ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb, psend);
return;
}
}
@@ -2224,6 +2237,110 @@ dont_forward:
return 0;
}
+static void ip_mr_output_finish(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *c)
+{
+ int psend = -1;
+ int ct;
+
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ /* Forward the frame */
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (ip_hdr(skb)->ttl >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_xmit;
+ }
+ goto dont_xmit;
+ }
+
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ipmr_queue_output_xmit(net, mrt,
+ skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+
+last_xmit:
+ if (psend != -1) {
+ ipmr_queue_output_xmit(net, mrt, skb, psend);
+ return;
+ }
+
+dont_xmit:
+ kfree_skb(skb);
+}
+
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct mfc_cache *cache;
+ struct net_device *dev;
+ struct mr_table *mrt;
+ int vif;
+
+ guard(rcu)();
+
+ dev = rt->dst.dev;
+
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
+ goto mc_output;
+ if (!(IPCB(skb)->flags & IPSKB_MCROUTE))
+ goto mc_output;
+
+ skb->dev = dev;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto mc_output;
+
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
+
+ /* No usable cache entry */
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
+ goto mc_output;
+ }
+
+ vif = cache->_c.mfc_parent;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+ goto mc_output;
+
+ ip_mr_output_finish(net, mrt, dev, skb, cache);
+ return 0;
+
+mc_output:
+ return ip_mc_output(net, sk, skb);
+}
+
#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1 */
int pim_rcv_v1(struct sk_buff *skb)
@@ -2511,7 +2628,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
return -EINVAL;
}
@@ -2520,7 +2638,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_ipv4_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
(rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
@@ -2836,7 +2953,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
{
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
return -EINVAL;
}
@@ -2846,7 +2964,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
@@ -3029,9 +3146,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
if (it->cache != &mrt->mfc_unres_queue) {
seq_printf(seq, " %8lu %8lu %8lu",
- mfc->_c.mfc_un.res.pkt,
- mfc->_c.mfc_un.res.bytes,
- mfc->_c.mfc_un.res.wrong_if);
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
for (n = mfc->_c.mfc_un.res.minvif;
n < mfc->_c.mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index f0af12a2f70b..28d77d454d44 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -263,9 +263,9 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
lastuse = READ_ONCE(c->mfc_un.res.lastuse);
lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
- mfcs.mfcs_packets = c->mfc_un.res.pkt;
- mfcs.mfcs_bytes = c->mfc_un.res.bytes;
- mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt);
+ mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes);
+ mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if);
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
@@ -330,9 +330,6 @@ next_entry:
list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
if (e < s_e)
goto next_entry2;
- if (filter->dev &&
- !mr_mfc_uses_dev(mrt, mfc, filter->dev))
- goto next_entry2;
err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 08bc3f2c0078..0565f001120d 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -20,12 +20,12 @@
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
{
+ struct net_device *dev = skb_dst_dev(skb);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
__u8 flags;
- struct net_device *dev = skb_dst(skb)->dev;
struct flow_keys flkeys;
unsigned int hh_len;
@@ -74,7 +74,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
#endif
/* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
+ hh_len = skb_dst_dev(skb)->hard_header_len;
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
0, GFP_ATOMIC))
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ef8009281da5..7dc9772fe2d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -13,8 +13,9 @@ config NF_DEFRAG_IPV4
# old sockopt interface and eval loop
config IP_NF_IPTABLES_LEGACY
tristate "Legacy IP tables support"
- default n
- select NETFILTER_XTABLES
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default m if NETFILTER_XTABLES_LEGACY
help
iptables is a legacy packet classifier.
This is not needed if you are using iptables over nftables
@@ -182,8 +183,8 @@ config IP_NF_MATCH_TTL
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
- default m if NETFILTER_ADVANCED=n
- select IP_NF_IPTABLES_LEGACY
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -220,10 +221,10 @@ config IP_NF_TARGET_SYNPROXY
config IP_NF_NAT
tristate "iptables NAT support"
depends on NF_CONNTRACK
+ depends on IP_NF_IPTABLES_LEGACY
default m if NETFILTER_ADVANCED=n
select NF_NAT
select NETFILTER_XT_NAT
- select IP_NF_IPTABLES_LEGACY
help
This enables the `nat' table in iptables. This allows masquerading,
port forwarding and other forms of full Network Address Port
@@ -263,8 +264,8 @@ endif # IP_NF_NAT
# mangle + specific targets
config IP_NF_MANGLE
tristate "Packet mangling"
- default m if NETFILTER_ADVANCED=n
- select IP_NF_IPTABLES_LEGACY
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -299,7 +300,7 @@ config IP_NF_TARGET_TTL
# raw + specific targets
config IP_NF_RAW
tristate 'raw table support (required for NOTRACK/TRACE)'
- select IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
@@ -313,7 +314,7 @@ config IP_NF_SECURITY
tristate "Security table"
depends on SECURITY
depends on NETFILTER_ADVANCED
- select IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `security' table to iptables, for use
with Mandatory Access Control (MAC) policy.
@@ -325,8 +326,9 @@ endif # IP_NF_IPTABLES
# ARP tables
config IP_NF_ARPTABLES
tristate "Legacy ARPTABLES support"
+ depends on NETFILTER_XTABLES_LEGACY
depends on NETFILTER_XTABLES
- default n
+ default n
help
arptables is a legacy packet classifier.
This is not needed if you are using arptables over nftables
@@ -342,6 +344,7 @@ config IP_NF_ARPFILTER
tristate "arptables-legacy packet filtering support"
select IP_NF_ARPTABLES
select NETFILTER_FAMILY_ARP
+ depends on NETFILTER_XTABLES_LEGACY
depends on NETFILTER_XTABLES
help
ARP packet filtering defines a table `filter', which has a series of
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3d101613f27f..23c8deff8095 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -270,7 +270,7 @@ ipt_do_table(void *priv,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 25e1e8eb18dd..ed08fb78cfa8 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
struct iphdr *iph;
local_bh_disable();
- if (this_cpu_read(nf_skb_duplicated))
+ if (current->in_nf_duplicate)
goto out;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 625adbc42037..7e7c49535e3f 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
else
addr = iph->saddr;
- *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) {
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ return;
+ }
+
+ *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);
}
EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
@@ -65,12 +70,17 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_proto = pkt->tprot,
.flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
- .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)),
};
const struct net_device *oif;
const struct net_device *found;
+ if (nft_fib_can_skip(pkt)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
+
/*
* Do not set flowi4_oif, it restricts results (for example, asking
* for oif 3 will get RTN_UNICAST result even if the daddr exits
@@ -85,11 +95,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
else
oif = NULL;
- if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
- nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
- nft_fib_store_result(dest, priv, nft_in(pkt));
- return;
- }
+ fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif);
iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
if (!iph) {
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 09a3d73b45ba..29118c43ebf5 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void)
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fdb_list);
+ spin_lock_init(&nh->lock);
}
return nh;
}
@@ -984,8 +985,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
break;
}
- if (nhi->fib_nhc.nhc_lwtstate &&
- lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+ if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
goto nla_put_failure;
@@ -1272,10 +1272,8 @@ static int nh_check_attr_group(struct net *net,
u16 nh_grp_type, struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
- u8 nh_family = AF_UNSPEC;
struct nexthop_grp *nhg;
unsigned int i, j;
- u8 nhg_fdb = 0;
if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
NL_SET_ERR_MSG(extack,
@@ -1307,10 +1305,41 @@ static int nh_check_attr_group(struct net *net,
}
}
- if (tb[NHA_FDB])
- nhg_fdb = 1;
nhg = nla_data(tb[NHA_GROUP]);
- for (i = 0; i < len; ++i) {
+ for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
+ if (!tb[i])
+ continue;
+ switch (i) {
+ case NHA_HW_STATS_ENABLE:
+ case NHA_FDB:
+ continue;
+ case NHA_RES_GROUP:
+ if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ continue;
+ break;
+ }
+ NL_SET_ERR_MSG(extack,
+ "No other attributes can be set in nexthop groups");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ u8 nh_family = AF_UNSPEC;
+ struct nexthop_grp *nhg;
+ unsigned int len;
+ unsigned int i;
+ u8 nhg_fdb;
+
+ len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg);
+ nhg = nla_data(tb[NHA_GROUP]);
+ nhg_fdb = !!tb[NHA_FDB];
+
+ for (i = 0; i < len; i++) {
struct nexthop *nh;
bool is_fdb_nh;
@@ -1330,22 +1359,6 @@ static int nh_check_attr_group(struct net *net,
return -EINVAL;
}
}
- for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
- if (!tb[i])
- continue;
- switch (i) {
- case NHA_HW_STATS_ENABLE:
- case NHA_FDB:
- continue;
- case NHA_RES_GROUP:
- if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
- continue;
- break;
- }
- NL_SET_ERR_MSG(extack,
- "No other attributes can be set in nexthop groups");
- return -EINVAL;
- }
return 0;
}
@@ -1542,12 +1555,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
if (nh->is_group) {
struct nh_group *nhg;
- nhg = rtnl_dereference(nh->nh_grp);
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
if (nhg->has_v4)
goto no_v4_nh;
is_fdb_nh = nhg->fdb_nh;
} else {
- nhi = rtnl_dereference(nh->nh_info);
+ nhi = rcu_dereference_rtnl(nh->nh_info);
if (nhi->family == AF_INET)
goto no_v4_nh;
is_fdb_nh = nhi->fdb_nh;
@@ -2105,7 +2118,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
- struct fib6_info *f6i, *tmp;
+ struct fib6_info *f6i;
bool do_flush = false;
struct fib_info *fi;
@@ -2116,13 +2129,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
if (do_flush)
fib_flush(net);
- /* ip6_del_rt removes the entry from this list hence the _safe */
- list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+ spin_lock_bh(&nh->lock);
+
+ nh->dead = true;
+
+ while (!list_empty(&nh->f6i_list)) {
+ f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);
+
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
+
+ spin_unlock_bh(&nh->lock);
ipv6_stub->ip6_del_rt(net, f6i,
!READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
+
+ spin_lock_bh(&nh->lock);
}
+
+ spin_unlock_bh(&nh->lock);
}
static void __remove_nexthop(struct net *net, struct nexthop *nh,
@@ -2679,9 +2703,6 @@ static struct nexthop *nexthop_create_group(struct net *net,
int err;
int i;
- if (WARN_ON(!num_nh))
- return ERR_PTR(-EINVAL);
-
nh = nexthop_alloc();
if (!nh)
return ERR_PTR(-ENOMEM);
@@ -2915,11 +2936,6 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
struct nexthop *nh;
int err;
- if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
- NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
- return ERR_PTR(-EINVAL);
- }
-
if (!cfg->nh_id) {
cfg->nh_id = nh_find_unused_id(net);
if (!cfg->nh_id) {
@@ -3016,19 +3032,13 @@ static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
}
static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct nh_config *cfg,
+ struct nlmsghdr *nlh, struct nlattr **tb,
+ struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
struct nhmsg *nhm = nlmsg_data(nlh);
- struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
int err;
- err = nlmsg_parse(nlh, sizeof(*nhm), tb,
- ARRAY_SIZE(rtm_nh_policy_new) - 1,
- rtm_nh_policy_new, extack);
- if (err < 0)
- return err;
-
err = -EINVAL;
if (nhm->resvd || nhm->nh_scope) {
NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
@@ -3093,7 +3103,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
NL_SET_ERR_MSG(extack, "Invalid group type");
goto out;
}
- err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
+
+ err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new),
cfg->nh_grp_type, extack);
if (err)
goto out;
@@ -3126,25 +3137,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
- if (!cfg->nh_fdb && tb[NHA_OIF]) {
- cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
- if (cfg->nh_ifindex)
- cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
-
- if (!cfg->dev) {
- NL_SET_ERR_MSG(extack, "Invalid device index");
- goto out;
- } else if (!(cfg->dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
- err = -ENETDOWN;
- goto out;
- } else if (!netif_carrier_ok(cfg->dev)) {
- NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
- err = -ENETDOWN;
- goto out;
- }
- }
-
err = -EINVAL;
if (tb[NHA_GATEWAY]) {
struct nlattr *gwa = tb[NHA_GATEWAY];
@@ -3206,22 +3198,76 @@ out:
return err;
}
+static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb,
+ struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ if (tb[NHA_GROUP])
+ return nh_check_attr_group_rtnl(net, tb, extack);
+
+ if (tb[NHA_OIF]) {
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ return -EINVAL;
+ }
+
+ if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ return -ENETDOWN;
+ }
+
+ if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ return -ENETDOWN;
+ }
+ }
+
+ return 0;
+}
+
/* rtnl */
static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
struct net *net = sock_net(skb->sk);
struct nh_config cfg;
struct nexthop *nh;
int err;
- err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
- if (!err) {
- nh = nexthop_add(net, &cfg, extack);
- if (IS_ERR(nh))
- err = PTR_ERR(nh);
+ err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+ ARRAY_SIZE(rtm_nh_policy_new) - 1,
+ rtm_nh_policy_new, extack);
+ if (err < 0)
+ goto out;
+
+ err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack);
+ if (err)
+ goto out;
+
+ if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) {
+ NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+ err = -EINVAL;
+ goto out;
}
+ rtnl_net_lock(net);
+
+ err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack);
+ if (err)
+ goto unlock;
+
+ nh = nexthop_add(net, &cfg, extack);
+ if (IS_ERR(nh))
+ err = PTR_ERR(nh);
+
+unlock:
+ rtnl_net_unlock(net);
+out:
return err;
}
@@ -3278,13 +3324,17 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ rtnl_net_lock(net);
+
nh = nexthop_find_by_id(net, id);
- if (!nh)
- return -ENOENT;
+ if (nh)
+ remove_nexthop(net, nh, &nlinfo);
+ else
+ err = -ENOENT;
- remove_nexthop(net, nh, &nlinfo);
+ rtnl_net_unlock(net);
- return 0;
+ return err;
}
/* rtnl */
@@ -3834,7 +3884,7 @@ static int nh_netdev_event(struct notifier_block *this,
nexthop_flush_dev(dev, event);
break;
case NETDEV_CHANGE:
- if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+ if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
nexthop_flush_dev(dev, event);
break;
case NETDEV_CHANGEMTU:
@@ -4000,14 +4050,11 @@ out:
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
-static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list,
- struct list_head *dev_to_kill)
+static void __net_exit nexthop_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- struct net *net;
-
- ASSERT_RTNL();
- list_for_each_entry(net, net_list, exit_list)
- flush_all_nexthops(net);
+ ASSERT_RTNL_NET(net);
+ flush_all_nexthops(net);
}
static void __net_exit nexthop_net_exit(struct net *net)
@@ -4032,22 +4079,24 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
.exit = nexthop_net_exit,
- .exit_batch_rtnl = nexthop_net_exit_batch_rtnl,
+ .exit_rtnl = nexthop_net_exit_rtnl,
};
static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
- {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop},
- {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop},
+ {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
{.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
.dumpit = rtm_dump_nexthop},
{.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
.dumpit = rtm_dump_nexthop_bucket},
{.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
- .doit = rtm_new_nexthop},
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
.dumpit = rtm_dump_nexthop},
{.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
- .doit = rtm_new_nexthop},
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
.dumpit = rtm_dump_nexthop},
};
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 619ddc087957..031df4c19fcc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip_options_data opt_copy;
int free = 0;
__be32 saddr, daddr, faddr;
- u8 tos, scope;
+ u8 scope;
int err;
pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
faddr = ipc.opt->opt.faddr;
}
- tos = get_rttos(&ipc, inet);
scope = ip_sendmsg_scope(inet, &ipc, msg);
if (ipv4_is_multicast(daddr)) {
@@ -779,9 +778,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
} else if (!ipc.oif)
ipc.oif = READ_ONCE(inet->uc_index);
- flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
- saddr, 0, 0, sk->sk_uid);
+ saddr, 0, 0, sk_uid(sk));
fl4.fl4_icmp_type = user_icmph.type;
fl4.fl4_icmp_code = user_icmph.code;
@@ -966,10 +966,9 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
enum skb_drop_reason ping_rcv(struct sk_buff *skb)
{
- enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET;
- struct sock *sk;
struct net *net = dev_net(skb->dev);
struct icmphdr *icmph = icmp_hdr(skb);
+ struct sock *sk;
/* We assume the packet has already been checked by icmp_rcv */
@@ -980,20 +979,11 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
skb_push(skb, skb->data - (u8 *)icmph);
sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
- if (sk) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
- pr_debug("rcv on socket %p\n", sk);
- if (skb2)
- reason = __ping_queue_rcv_skb(sk, skb2);
- else
- reason = SKB_DROP_REASON_NOMEM;
- }
-
- if (reason)
- pr_debug("no socket, dropping\n");
+ if (sk)
+ return __ping_queue_rcv_skb(sk, skb);
- return reason;
+ kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
+ return SKB_DROP_REASON_NO_SOCKET;
}
EXPORT_SYMBOL_GPL(ping_rcv);
@@ -1126,7 +1116,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
atomic_read(&sp->sk_drops));
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40053a02bae1..65b0d0ab0084 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -189,6 +189,10 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW),
+ SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
+ SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
+ SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0e9e01967ec9..1d2c89d63cc7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->protocol = htons(ETH_P_IP);
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc->priority;
skb->mark = sockc->mark;
skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_dst_set(skb, &rt->dst);
@@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
- u8 tos, scope;
+ u8 scope;
int free = 0;
__be32 daddr;
__be32 saddr;
@@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = ipc.opt->opt.faddr;
}
}
- tos = get_rttos(&ipc, inet);
scope = ip_sendmsg_scope(inet, &ipc, msg);
uc_index = READ_ONCE(inet->uc_index);
@@ -606,11 +605,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
hdrincl ? ipc.protocol : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0, sk->sk_uid);
+ daddr, saddr, 0, 0, sk_uid(sk));
fl4.fl4_icmp_type = 0;
fl4.fl4_icmp_code = 0;
@@ -1043,7 +1043,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e5603e84b20d..f639a2ae881a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#ifndef CONFIG_PREEMPT_RT
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+#else
+#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
+#endif
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -390,7 +394,13 @@ static inline int ip_rt_proc_init(void)
static inline bool rt_is_expired(const struct rtable *rth)
{
- return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+ bool res;
+
+ rcu_read_lock();
+ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
+ rcu_read_unlock();
+
+ return res;
}
void rt_cache_flush(struct net *net)
@@ -403,7 +413,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
const void *daddr)
{
const struct rtable *rt = container_of(dst, struct rtable, dst);
- struct net_device *dev = dst->dev;
+ struct net_device *dev = dst_dev(dst);
struct neighbour *n;
rcu_read_lock();
@@ -430,7 +440,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
const struct rtable *rt = container_of(dst, struct rtable, dst);
- struct net_device *dev = dst->dev;
+ struct net_device *dev = dst_dev(dst);
const __be32 *pkey = daddr;
if (rt->rt_gw_family == AF_INET) {
@@ -546,7 +556,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
inet_test_bit(HDRINCL, sk) ?
IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
+ daddr, inet->inet_saddr, 0, 0,
+ sk_uid(sk));
rcu_read_unlock();
}
@@ -706,7 +717,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
*/
rt = rcu_dereference(nhc->nhc_rth_input);
if (rt)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
@@ -714,7 +725,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
}
}
@@ -786,7 +797,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
jiffies + ip_rt_gc_timeout);
}
if (kill_route)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
neigh_release(n);
@@ -831,9 +842,9 @@ static void ipv4_negative_advice(struct sock *sk,
{
struct rtable *rt = dst_rtable(dst);
- if ((dst->obsolete > 0) ||
+ if ((READ_ONCE(dst->obsolete) > 0) ||
(rt->rt_flags & RTCF_REDIRECTED) ||
- rt->dst.expires)
+ READ_ONCE(rt->dst.expires))
sk_dst_reset(sk);
}
@@ -870,11 +881,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
- rcu_read_unlock();
net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
if (!peer) {
+ rcu_read_unlock();
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
rt_nexthop(rt, ip_hdr(skb)->daddr));
return;
@@ -893,7 +904,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (peer->n_redirects >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- goto out_put_peer;
+ goto out_unlock;
}
/* Check for load limit; set rate_last to the latest sent
@@ -914,8 +925,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
&ip_hdr(skb)->saddr, inet_iif(skb),
&ip_hdr(skb)->daddr, &gw);
}
-out_put_peer:
- inet_putpeer(peer);
+out_unlock:
+ rcu_read_unlock();
}
static int ip_error(struct sk_buff *skb)
@@ -975,9 +986,9 @@ static int ip_error(struct sk_buff *skb)
break;
}
+ rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
- l3mdev_master_ifindex(skb->dev), 1);
-
+ l3mdev_master_ifindex_rcu(skb->dev));
send = true;
if (peer) {
now = jiffies;
@@ -989,8 +1000,9 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
- inet_putpeer(peer);
}
+ rcu_read_unlock();
+
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1001,9 +1013,9 @@ out: kfree_skb_reason(skb, reason);
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
- struct net *net = dev_net(dst->dev);
struct fib_result res;
bool lock = false;
+ struct net *net;
u32 old_mtu;
if (ip_mtu_locked(dst))
@@ -1013,16 +1025,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (old_mtu < mtu)
return;
+ rcu_read_lock();
+ net = dev_net_rcu(dst_dev(dst));
if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
}
if (rt->rt_pmtu == mtu && !lock &&
- time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
- return;
+ time_before(jiffies, READ_ONCE(dst->expires) -
+ net->ipv4.ip_rt_mtu_expires / 2))
+ goto out;
- rcu_read_lock();
if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh_common *nhc;
@@ -1036,14 +1050,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
}
- rcu_read_unlock();
- return;
+ goto out;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
}
+out:
rcu_read_unlock();
}
@@ -1123,7 +1137,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = dst_rtable(odst);
- if (odst->obsolete && !odst->ops->check(odst, 0)) {
+ if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) {
rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
if (IS_ERR(rt))
goto out;
@@ -1198,7 +1212,8 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
* this is indicated by setting obsolete to DST_OBSOLETE_KILL or
* DST_OBSOLETE_DEAD.
*/
- if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK ||
+ rt_is_expired(rt))
return NULL;
return dst;
}
@@ -1306,10 +1321,15 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
- struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
- unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- net->ipv4.ip_rt_min_advmss);
+ unsigned int advmss;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dev_net_rcu(dst_dev(dst));
+ advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+ net->ipv4.ip_rt_min_advmss);
+ rcu_read_unlock();
return min(advmss, IPV4_MAX_PMTU - header_size);
}
@@ -1553,7 +1573,7 @@ void rt_flush_dev(struct net_device *dev)
static bool rt_cache_valid(const struct rtable *rt)
{
return rt &&
- rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
!rt_is_expired(rt);
}
@@ -1667,8 +1687,8 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
else if (rt->rt_gw_family == AF_INET6)
new_rt->rt_gw6 = rt->rt_gw6;
- new_rt->dst.input = rt->dst.input;
- new_rt->dst.output = rt->dst.output;
+ new_rt->dst.input = READ_ONCE(rt->dst.input);
+ new_rt->dst.output = READ_ONCE(rt->dst.output);
new_rt->dst.error = rt->dst.error;
new_rt->dst.lastuse = jiffies;
new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
@@ -2024,8 +2044,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
hash_keys.addrs.v4addrs.dst = fl4->daddr;
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
hash_keys.basic.ip_proto = fl4->flowi4_proto;
- if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
- hash_keys.ports.src = fl4->fl4_sport;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
+ }
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl4->fl4_dport;
@@ -2080,7 +2104,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
- hash_keys.ports.src = fl4->fl4_sport;
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
hash_keys.ports.dst = fl4->fl4_dport;
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
@@ -2141,7 +2168,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, NULL);
IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif
@@ -2445,6 +2472,7 @@ martian_destination:
net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
&daddr, &saddr, dev->name);
#endif
+ goto out;
e_nobufs:
reason = SKB_DROP_REASON_NOMEM;
@@ -2478,7 +2506,8 @@ ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
int our = 0;
if (!in_dev)
- return -EINVAL;
+ return reason;
+
our = ip_check_mc_rcu(in_dev, daddr, saddr,
ip_hdr(skb)->protocol);
@@ -2559,7 +2588,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
do_cache = true;
if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- fi = NULL;
} else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
@@ -2634,7 +2662,7 @@ add:
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(fl4->daddr)) {
rth->dst.input = ip_mr_input;
- rth->dst.output = ip_mc_output;
+ rth->dst.output = ip_mr_output;
}
}
#endif
@@ -2684,8 +2712,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
if (fl4->saddr) {
if (ipv4_is_multicast(fl4->saddr) ||
- ipv4_is_lbcast(fl4->saddr) ||
- ipv4_is_zeronet(fl4->saddr)) {
+ ipv4_is_lbcast(fl4->saddr)) {
rth = ERR_PTR(-EINVAL);
goto out;
}
@@ -2952,8 +2979,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rt->dst.dev &&
nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
goto nla_put_failure;
- if (rt->dst.lwtstate &&
- lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid &&
@@ -2984,7 +3010,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
}
}
- expires = rt->dst.expires;
+ expires = READ_ONCE(rt->dst.expires);
if (expires) {
unsigned long now = jiffies;
@@ -3191,7 +3217,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
struct rtmsg *rtm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
NL_SET_ERR_MSG(extack,
"ipv4: Invalid header for route get request");
return -EINVAL;
@@ -3201,7 +3228,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
rtm_ipv4_policy, extack);
- rtm = nlmsg_data(nlh);
if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
(rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
rtm->rtm_table || rtm->rtm_protocol ||
@@ -3267,6 +3293,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
+ dscp_t dscp;
kuid_t uid;
u32 iif;
int err;
@@ -3281,6 +3308,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dst = nla_get_in_addr_default(tb[RTA_DST], 0);
iif = nla_get_u32_default(tb[RTA_IIF], 0);
mark = nla_get_u32_default(tb[RTA_MARK], 0);
+ dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
if (tb[RTA_UID])
uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
else
@@ -3305,7 +3333,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
+ fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -3329,9 +3357,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src,
- inet_dsfield_to_dscp(rtm->rtm_tos),
- dev, &res) ? -EINVAL : 0;
+ err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+ &res) ? -EINVAL : 0;
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 1948d15f1f28..eb0819463fae 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -222,7 +222,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
return NULL;
}
-EXPORT_SYMBOL(tcp_get_cookie_sock);
+EXPORT_IPV6_MOD(tcp_get_cookie_sock);
/*
* when syncookies are in effect and tcp timestamps are enabled we stored
@@ -259,7 +259,7 @@ bool cookie_timestamp_decode(const struct net *net,
return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0;
}
-EXPORT_SYMBOL(cookie_timestamp_decode);
+EXPORT_IPV6_MOD(cookie_timestamp_decode);
static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
@@ -279,6 +279,7 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
ireq->smc_ok = 0;
treq->snt_synack = 0;
+ treq->snt_tsval_first = 0;
treq->tfo_listener = false;
treq->txhash = net_tx_rndhash();
treq->rcv_isn = ntohl(th->seq) - 1;
@@ -310,7 +311,7 @@ struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
return req;
}
-EXPORT_SYMBOL_GPL(cookie_bpf_check);
+EXPORT_IPV6_MOD_GPL(cookie_bpf_check);
#endif
struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
@@ -351,7 +352,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
return req;
}
-EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc);
+EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc);
static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
struct sk_buff *skb)
@@ -453,7 +454,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
IPPROTO_TCP, inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
+ ireq->ir_loc_addr, th->source, th->dest,
+ sk_uid(sk));
security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a79b2a52ce01..3a43010d726f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@ static int tcp_adv_win_scale_max = 31;
static int tcp_app_win_max = 31;
static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
static int tcp_min_snd_mss_max = 65535;
+static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
static int ip_privileged_port_min;
static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
@@ -45,6 +46,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1066,6 +1068,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_tw_reuse_delay",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &tcp_tw_reuse_delay_max,
+ },
+ {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
@@ -1573,6 +1584,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "tcp_rto_max_ms",
+ .data = &init_net.ipv4.sysctl_tcp_rto_max_ms,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_THOUSAND,
+ .extra2 = &tcp_rto_max_max,
+ },
};
static __net_init int ipv4_sysctl_init_net(struct net *net)
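The two tunables added to ipv4_net_table[] above are plain per-netns integers, so they can be driven from procfs on a kernel carrying these hunks. A minimal sketch, assuming the conventional /proc/sys/net/ipv4/ paths and that both knobs take milliseconds, as the table bounds (TCP_RTO_MAX_SEC * MSEC_PER_SEC and TCP_PAWS_MSL * MSEC_PER_SEC) suggest:

/* Hypothetical userspace helper; sysctl names are taken from the hunks above. */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Cap retransmission backoff for new sockets at 30 seconds. */
	write_sysctl("/proc/sys/net/ipv4/tcp_rto_max_ms", "30000");
	/* Ask for roughly one second before a TIME-WAIT 4-tuple is reused. */
	write_sysctl("/proc/sys/net/ipv4/tcp_tw_reuse_delay", "1000");
	return 0;
}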
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..71a956fbfc55 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -300,10 +300,8 @@ DEFINE_PER_CPU(u32, tcp_tw_isn);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
long sysctl_tcp_mem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_IPV6_MOD(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
@@ -316,7 +314,7 @@ EXPORT_SYMBOL(tcp_have_smc);
* Current number of TCP sockets.
*/
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
-EXPORT_SYMBOL(tcp_sockets_allocated);
+EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
* TCP splice context
@@ -349,7 +347,7 @@ void tcp_enter_memory_pressure(struct sock *sk)
if (!cmpxchg(&tcp_memory_pressure, 0, val))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
-EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);
void tcp_leave_memory_pressure(struct sock *sk)
{
@@ -362,7 +360,7 @@ void tcp_leave_memory_pressure(struct sock *sk)
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
jiffies_to_msecs(jiffies - val));
}
-EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -423,7 +421,7 @@ void tcp_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int rto_min_us;
+ int rto_min_us, rto_max_ms;
tp->out_of_order_queue = RB_ROOT;
sk->tcp_rtx_queue = RB_ROOT;
@@ -432,6 +430,10 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+ rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
+ icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);
+
rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
icsk->icsk_delack_max = TCP_DELACK_MAX;
@@ -475,7 +477,7 @@ void tcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
}
-EXPORT_SYMBOL(tcp_init_sock);
+EXPORT_IPV6_MOD(tcp_init_sock);
static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
@@ -488,10 +490,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
if (tsflags & SOF_TIMESTAMPING_TX_ACK)
- tcb->txstamp_ack = 1;
+ tcb->txstamp_ack |= TSTAMP_ACK_SK;
if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
+
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
+ bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
static bool tcp_stream_is_readable(struct sock *sk, int target)
@@ -660,7 +666,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg)
*karg = answ;
return 0;
}
-EXPORT_SYMBOL(tcp_ioctl);
+EXPORT_IPV6_MOD(tcp_ioctl);
void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
@@ -876,7 +882,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
return ret;
}
-EXPORT_SYMBOL(tcp_splice_read);
+EXPORT_IPV6_MOD(tcp_splice_read);
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
@@ -1051,6 +1057,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
+ struct net_devmem_dmabuf_binding *binding = NULL;
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
@@ -1058,11 +1065,20 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
+ int sockc_err = 0;
int zc = 0;
long timeo;
flags = msg->msg_flags;
+ sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+ if (msg->msg_controllen) {
+ sockc_err = sock_cmsg_send(sk, msg, &sockc);
+ /* Don't return error until MSG_FASTOPEN has been processed;
+ * that may succeed even if the cmsg is invalid.
+ */
+ }
+
if ((flags & MSG_ZEROCOPY) && size) {
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
@@ -1070,7 +1086,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
skb = tcp_write_queue_tail(sk);
- uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+ !sockc_err && sockc.dmabuf_id);
if (!uarg) {
err = -ENOBUFS;
goto out_err;
@@ -1079,12 +1096,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
zc = MSG_ZEROCOPY;
else
uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ if (!sockc_err && sockc.dmabuf_id) {
+ binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ binding = NULL;
+ goto out_err;
+ }
+ }
}
} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
if (sk->sk_route_caps & NETIF_F_SG)
zc = MSG_SPLICE_PAGES;
}
+ if (!sockc_err && sockc.dmabuf_id &&
+ (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
if (unlikely(flags & MSG_FASTOPEN ||
inet_test_bit(DEFER_CONNECT, sk)) &&
!tp->repair) {
@@ -1123,13 +1155,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockcm_init(&sockc, sk);
- if (msg->msg_controllen) {
- err = sock_cmsg_send(sk, msg, &sockc);
- if (unlikely(err)) {
- err = -EINVAL;
- goto out_err;
- }
+ if (sockc_err) {
+ err = sockc_err;
+ goto out_err;
}
/* This should be in poll */
@@ -1146,12 +1174,14 @@ restart:
goto do_error;
while (msg_data_left(msg)) {
- ssize_t copy = 0;
+ int copy = 0;
skb = tcp_write_queue_tail(sk);
if (skb)
copy = size_goal - skb->len;
+ trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
@@ -1248,7 +1278,8 @@ new_segment:
goto wait_for_space;
}
- err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+ binding);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
goto new_segment;
@@ -1264,8 +1295,7 @@ new_segment:
if (!copy)
goto wait_for_space;
- err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
- sk->sk_allocation);
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
if (err < 0) {
if (err == -EMSGSIZE) {
tcp_mark_push(tp, skb);
@@ -1329,6 +1359,8 @@ out_nopush:
/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
if (uarg && !msg->msg_ubuf)
net_zcopy_put(uarg);
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
return copied + copied_syn;
do_error:
@@ -1346,6 +1378,9 @@ out_err:
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
@@ -1376,7 +1411,7 @@ void tcp_splice_eof(struct socket *sock)
tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
release_sock(sk);
}
-EXPORT_SYMBOL_GPL(tcp_splice_eof);
+EXPORT_IPV6_MOD_GPL(tcp_splice_eof);
/*
* Handle reading urgent data. BSD has very simple semantics for
@@ -1565,12 +1600,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
+ u32 seq = *copied_seq;
u32 offset;
int copied = 0;
@@ -1624,9 +1660,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
}
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
+
+ if (noack)
+ goto out;
tcp_rcv_space_adjust(sk);
@@ -1635,10 +1674,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
}
+out:
return copied;
}
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, false,
+ &tcp_sk(sk)->copied_seq);
+}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
@@ -1667,7 +1721,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
}
return copied;
}
-EXPORT_SYMBOL(tcp_read_skb);
+EXPORT_IPV6_MOD(tcp_read_skb);
void tcp_read_done(struct sock *sk, size_t len)
{
@@ -1712,7 +1766,7 @@ int tcp_peek_len(struct socket *sock)
{
return tcp_inq(sock->sk);
}
-EXPORT_SYMBOL(tcp_peek_len);
+EXPORT_IPV6_MOD(tcp_peek_len);
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
@@ -1739,7 +1793,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
return 0;
}
-EXPORT_SYMBOL(tcp_set_rcvlowat);
+EXPORT_IPV6_MOD(tcp_set_rcvlowat);
void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping_internal *tss)
@@ -1772,7 +1826,7 @@ int tcp_mmap(struct file *file, struct socket *sock,
vma->vm_ops = &tcp_vm_ops;
return 0;
}
-EXPORT_SYMBOL(tcp_mmap);
+EXPORT_IPV6_MOD(tcp_mmap);
static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
u32 *offset_frag)
@@ -2438,14 +2492,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
*/
memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
dmabuf_cmsg.frag_size = copy;
- err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
- sizeof(dmabuf_cmsg), &dmabuf_cmsg);
- if (err || msg->msg_flags & MSG_CTRUNC) {
- msg->msg_flags &= ~MSG_CTRUNC;
- if (!err)
- err = -ETOOSMALL;
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_LINEAR,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
goto out;
- }
sent += copy;
@@ -2476,6 +2528,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
}
niov = skb_frag_net_iov(frag);
+ if (!net_is_devmem_iov(niov)) {
+ err = -ENODEV;
+ goto out;
+ }
+
end = start + skb_frag_size(frag);
copy = end - offset;
@@ -2494,21 +2551,17 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
/* Will perform the exchange later */
dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
- dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov);
+ dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
offset += copy;
remaining_len -= copy;
- err = put_cmsg(msg, SOL_SOCKET,
- SO_DEVMEM_DMABUF,
- sizeof(dmabuf_cmsg),
- &dmabuf_cmsg);
- if (err || msg->msg_flags & MSG_CTRUNC) {
- msg->msg_flags &= ~MSG_CTRUNC;
- if (!err)
- err = -ETOOSMALL;
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_DMABUF,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
goto out;
- }
atomic_long_inc(&niov->pp_ref_count);
tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
@@ -2864,7 +2917,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
}
return ret;
}
-EXPORT_SYMBOL(tcp_recvmsg);
+EXPORT_IPV6_MOD(tcp_recvmsg);
void tcp_set_state(struct sock *sk, int state)
{
@@ -2994,7 +3047,7 @@ void tcp_shutdown(struct sock *sk, int how)
tcp_send_fin(sk);
}
}
-EXPORT_SYMBOL(tcp_shutdown);
+EXPORT_IPV6_MOD(tcp_shutdown);
int tcp_orphan_count_sum(void)
{
@@ -3174,7 +3227,7 @@ adjudge_to_death:
const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk,
+ tcp_reset_keepalive_timer(sk,
tmo - TCP_TIMEWAIT_LEN);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -3326,8 +3379,8 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_probes_out = 0;
icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
- icsk->icsk_rto_min = TCP_RTO_MIN;
- icsk->icsk_delack_max = TCP_DELACK_MAX;
+ WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
+ WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
@@ -3381,6 +3434,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rack.reo_wnd_persist = 0;
tp->rack.dsack_seen = 0;
tp->syn_data_acked = 0;
+ tp->syn_fastopen_child = 0;
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
@@ -3493,7 +3547,7 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
}
DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
-EXPORT_SYMBOL(tcp_tx_delay_enabled);
+EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
static void tcp_enable_tx_delay(void)
{
@@ -3627,7 +3681,7 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
elapsed = tp->keepalive_time - elapsed;
else
elapsed = 0;
- inet_csk_reset_keepalive_timer(sk, elapsed);
+ tcp_reset_keepalive_timer(sk, elapsed);
}
return 0;
@@ -3667,36 +3721,49 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt);
int tcp_set_window_clamp(struct sock *sk, int val)
{
+ u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh;
struct tcp_sock *tp = tcp_sk(sk);
if (!val) {
if (sk->sk_state != TCP_CLOSE)
return -EINVAL;
WRITE_ONCE(tp->window_clamp, 0);
- } else {
- u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp;
- u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
- SOCK_MIN_RCVBUF / 2 : val;
+ return 0;
+ }
- if (new_window_clamp == old_window_clamp)
- return 0;
+ old_window_clamp = tp->window_clamp;
+ new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val);
- WRITE_ONCE(tp->window_clamp, new_window_clamp);
- if (new_window_clamp < old_window_clamp) {
- /* need to apply the reserved mem provisioning only
- * when shrinking the window clamp
- */
- __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp);
+ if (new_window_clamp == old_window_clamp)
+ return 0;
- } else {
- new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
- tp->rcv_ssthresh = max(new_rcv_ssthresh,
- tp->rcv_ssthresh);
- }
+ WRITE_ONCE(tp->window_clamp, new_window_clamp);
+
+ /* Need to apply the reserved mem provisioning only
+ * when shrinking the window clamp.
+ */
+ if (new_window_clamp < old_window_clamp) {
+ __tcp_adjust_rcv_ssthresh(sk, new_window_clamp);
+ } else {
+ new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
+ tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
}
return 0;
}
+int tcp_sock_set_maxseg(struct sock *sk, int val)
+{
+ /* Values greater than interface MTU won't take effect. However
+ * at the point when this call is done we typically don't yet
+ * know which interface is going to be used
+ */
+ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
+ return -EINVAL;
+
+ tcp_sk(sk)->rx_opt.user_mss = val;
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
@@ -3802,21 +3869,34 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
TCP_RTO_MAX / HZ));
return 0;
+ case TCP_RTO_MAX_MS:
+ if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
+ return 0;
+ case TCP_RTO_MIN_US: {
+ int rto_min = usecs_to_jiffies(val);
+
+ if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
+ return 0;
+ }
+ case TCP_DELACK_MAX_US: {
+ int delack_max = usecs_to_jiffies(val);
+
+ if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
+ return 0;
+ }
}
sockopt_lock_sock(sk);
switch (optname) {
case TCP_MAXSEG:
- /* Values greater than interface MTU won't take effect. However
- * at the point when this call is done we typically don't yet
- * know which interface is going to be used
- */
- if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
- err = -EINVAL;
- break;
- }
- tp->rx_opt.user_mss = val;
+ err = tcp_sock_set_maxseg(sk, val);
break;
case TCP_NODELAY:
@@ -4031,7 +4111,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
-EXPORT_SYMBOL(tcp_setsockopt);
+EXPORT_IPV6_MOD(tcp_setsockopt);
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
struct tcp_info *info)
@@ -4107,7 +4187,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
}
- if (tp->ecn_flags & TCP_ECN_OK)
+ if (tcp_ecn_mode_any(tp))
info->tcpi_options |= TCPI_OPT_ECN;
if (tp->ecn_flags & TCP_ECN_SEEN)
info->tcpi_options |= TCPI_OPT_ECN_SEEN;
@@ -4115,6 +4195,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
if (tp->tcp_usec_ts)
info->tcpi_options |= TCPI_OPT_USEC_TS;
+ if (tp->syn_fastopen_child)
+ info->tcpi_options |= TCPI_OPT_TFO_CHILD;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
@@ -4638,6 +4720,15 @@ zerocopy_rcv_out:
case TCP_IS_MPTCP:
val = 0;
break;
+ case TCP_RTO_MAX_MS:
+ val = jiffies_to_msecs(tcp_rto_max(sk));
+ break;
+ case TCP_RTO_MIN_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
+ break;
+ case TCP_DELACK_MAX_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
+ break;
default:
return -ENOPROTOOPT;
}
@@ -4659,7 +4750,7 @@ bool tcp_bpf_bypass_getsockopt(int level, int optname)
return false;
}
-EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
@@ -4673,11 +4764,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
USER_SOCKPTR(optlen));
}
-EXPORT_SYMBOL(tcp_getsockopt);
+EXPORT_IPV6_MOD(tcp_getsockopt);
#ifdef CONFIG_TCP_MD5SIG
int tcp_md5_sigpool_id = -1;
-EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id);
+EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id);
int tcp_md5_alloc_sigpool(void)
{
@@ -4723,7 +4814,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp,
*/
return data_race(crypto_ahash_update(hp->req));
}
-EXPORT_SYMBOL(tcp_md5_hash_key);
+EXPORT_IPV6_MOD(tcp_md5_hash_key);
/* Called with rcu_read_lock() */
static enum skb_drop_reason
@@ -4843,7 +4934,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
l3index, md5_location);
}
-EXPORT_SYMBOL_GPL(tcp_inbound_hash);
+EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);
void tcp_done(struct sock *sk)
{
@@ -4964,9 +5055,8 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32);
/* TXRX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
@@ -4992,7 +5082,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77);
+#else
CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
+#endif
/* TX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
@@ -5133,7 +5228,7 @@ void __init tcp_init(void)
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
- max_rshare = min(6UL*1024*1024, limit);
+ max_rshare = min(32UL*1024*1024, limit);
init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
@@ -5149,6 +5244,6 @@ void __init tcp_init(void)
tcp_v4_init();
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
- tcp_tasklet_init();
+ tcp_tsq_work_init();
mptcp_init();
}
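Alongside the sysctls, the do_tcp_setsockopt()/do_tcp_getsockopt() hunks above add per-socket TCP_RTO_MAX_MS, TCP_RTO_MIN_US and TCP_DELACK_MAX_US options, validated before sockopt_lock_sock(). A minimal userspace sketch, assuming the option constants are provided by the uapi <linux/tcp.h> this series updates:

/* Hypothetical usage sketch; the option names come from the diff above and
 * need a tcp.h that already defines them.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int tune_rto(int fd)
{
	int rto_max_ms = 5000;      /* -EINVAL outside [1000, TCP_RTO_MAX_SEC * 1000] ms */
	int rto_min_us = 40 * 1000; /* -EINVAL outside [TCP_TIMEOUT_MIN, TCP_RTO_MIN] in jiffies */

	if (setsockopt(fd, IPPROTO_TCP, TCP_RTO_MAX_MS,
		       &rto_max_ms, sizeof(rto_max_ms)) < 0) {
		perror("setsockopt(TCP_RTO_MAX_MS)");
		return -1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_RTO_MIN_US,
		       &rto_min_us, sizeof(rto_min_us)) < 0) {
		perror("setsockopt(TCP_RTO_MIN_US)");
		return -1;
	}
	return 0;
}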
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 370993c03d31..ba581785adb4 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -49,13 +49,14 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
sge = sk_msg_elem(msg, i);
size = (apply && apply_bytes < sge->length) ?
apply_bytes : sge->length;
- if (!sk_wmem_schedule(sk, size)) {
+ if (!__sk_rmem_schedule(sk, size, false)) {
if (!copied)
ret = -ENOMEM;
break;
}
sk_mem_charge(sk, size);
+ atomic_add(size, &sk->sk_rmem_alloc);
sk_msg_xfer(tmp, msg, i, size);
copied += size;
if (sge->length)
@@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
if (!ret) {
msg->sg.start = i;
- sk_psock_queue_msg(psock, tmp);
+ if (!sk_psock_queue_msg(psock, tmp))
+ atomic_sub(copied, &sk->sk_rmem_alloc);
sk_psock_data_ready(sk, psock);
} else {
sk_msg_free(sk, tmp);
@@ -441,7 +443,6 @@ more_data:
cork = true;
psock->cork = NULL;
}
- sk_msg_return(sk, msg, tosend);
release_sock(sk);
origsize = msg->sg.size;
@@ -453,8 +454,9 @@ more_data:
sock_put(sk_redir);
lock_sock(sk);
+ sk_mem_uncharge(sk, sent);
if (unlikely(ret < 0)) {
- int free = sk_msg_free_nocharge(sk, msg);
+ int free = sk_msg_free(sk, msg);
if (!cork)
*copied -= free;
@@ -468,7 +470,7 @@ more_data:
break;
case __SK_DROP:
default:
- sk_msg_free_partial(sk, msg, tosend);
+ sk_msg_free(sk, msg);
sk_msg_apply_bytes(psock, tosend);
*copied -= (tosend + delta);
return -EACCES;
@@ -484,11 +486,8 @@ more_data:
}
if (msg &&
msg->sg.data[msg->sg.start].page_link &&
- msg->sg.data[msg->sg.start].length) {
- if (eval == __SK_REDIRECT)
- sk_mem_charge(sk, tosend - sent);
+ msg->sg.data[msg->sg.start].length)
goto more_data;
- }
}
return ret;
}
@@ -496,7 +495,7 @@ more_data:
static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_msg tmp, *msg_tx = NULL;
- int copied = 0, err = 0;
+ int copied = 0, err = 0, ret = 0;
struct sk_psock *psock;
long timeo;
int flags;
@@ -539,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copy = msg_tx->sg.size - osize;
}
- err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
copy);
- if (err < 0) {
+ if (ret < 0) {
sk_msg_trim(sk, msg_tx, osize);
goto out_err;
}
- copied += copy;
+ copied += ret;
if (psock->cork_bytes) {
if (size > psock->cork_bytes)
psock->cork_bytes = 0;
@@ -647,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sock *sk = strp->sk;
+ struct sk_psock *psock;
+ struct tcp_sock *tp;
+ int copied = 0;
+
+ tp = tcp_sk(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (WARN_ON_ONCE(!psock)) {
+ desc->error = -EINVAL;
+ goto out;
+ }
+
+ psock->ingress_bytes = 0;
+ copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+ &psock->copied_seq);
+ if (copied < 0)
+ goto out;
+ /* recv_actor may redirect skb to another socket (SK_REDIRECT) or
+ * just put skb into ingress queue of current socket (SK_PASS).
+ * For SK_REDIRECT, we need to ack the frame immediately but for
+ * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
+ */
+ tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+ rcu_read_unlock();
+ return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 5dbed91c6178..76c23675ae50 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay)
if (after(tp->snd_una, ca->end_seq))
bictcp_hystart_reset(sk);
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (tcp_snd_cwnd(tp) < hystart_low_window)
+ return;
+
if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now = bictcp_clock_us(sk);
@@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
- /* hystart triggers when cwnd is larger than some threshold */
- if (!ca->found && tcp_in_slow_start(tp) && hystart &&
- tcp_snd_cwnd(tp) >= hystart_low_window)
+ if (!ca->found && tcp_in_slow_start(tp) && hystart)
hystart_update(sk, delay);
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 8a45a4aea933..03abe0848420 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -90,7 +90,7 @@ __bpf_kfunc static void dctcp_init(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- if ((tp->ecn_flags & TCP_ECN_OK) ||
+ if (tcp_ecn_mode_any(tp) ||
(sk->sk_state == TCP_LISTEN ||
sk->sk_state == TCP_CLOSE)) {
struct dctcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
index d69a77cbd0c7..4b0259111d81 100644
--- a/net/ipv4/tcp_dctcp.h
+++ b/net/ipv4/tcp_dctcp.h
@@ -28,7 +28,7 @@ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
*/
if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
dctcp_ece_ack_cwr(sk, *ce_state);
- __tcp_send_ack(sk, *prior_rcv_nxt);
+ __tcp_send_ack(sk, *prior_rcv_nxt, 0);
}
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f428ecf9120f..45e174b8cd22 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -83,7 +83,7 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
#endif
static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
- const struct tcp_ulp_ops *ulp_ops)
+ const struct tcp_ulp_ops *ulp_ops, bool net_admin)
{
struct nlattr *nest;
int err;
@@ -97,7 +97,7 @@ static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
goto nla_failure;
if (ulp_ops->get_info)
- err = ulp_ops->get_info(sk, skb);
+ err = ulp_ops->get_info(sk, skb, net_admin);
if (err)
goto nla_failure;
@@ -113,6 +113,7 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_ulp_ops *ulp_ops;
int err = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -129,15 +130,13 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
}
#endif
- if (net_admin) {
- const struct tcp_ulp_ops *ulp_ops;
-
- ulp_ops = icsk->icsk_ulp_ops;
- if (ulp_ops)
- err = tcp_diag_put_ulp(skb, sk, ulp_ops);
- if (err)
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops, net_admin);
+ if (err < 0)
return err;
}
+
return 0;
}
@@ -164,7 +163,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
#endif
- if (net_admin && sk_fullsock(sk)) {
+ if (sk_fullsock(sk)) {
const struct tcp_ulp_ops *ulp_ops;
ulp_ops = icsk->icsk_ulp_ops;
@@ -172,7 +171,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
size += nla_total_size(0) +
nla_total_size(TCP_ULP_NAME_MAX);
if (ulp_ops->get_info_size)
- size += ulp_ops->get_info_size(sk);
+ size += ulp_ops->get_info_size(sk, net_admin);
}
}
return size;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 0f523cbfe329..f1884f0c9e52 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -3,6 +3,7 @@
#include <linux/tcp.h>
#include <linux/rcupdate.h>
#include <net/tcp.h>
+#include <net/busy_poll.h>
void tcp_fastopen_init_key_once(struct net *net)
{
@@ -178,7 +179,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
if (!skb)
return;
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
/* segs_in has been initialized to 1 in tcp_create_openreq_child().
* Hence, reset segs_in to 0 before calling tcp_segs_in()
* to avoid double counting. Also, tcp_segs_in() expects
@@ -195,7 +196,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
tp->syn_data_acked = 1;
/* u64_stats_update_begin(&tp->syncp) not needed here,
@@ -274,11 +275,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
* because it's been added to the accept queue directly.
*/
req->timeout = tcp_timeout_init(child);
- inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
- req->timeout, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ req->timeout, false);
refcount_set(&req->rsk_refcnt, 2);
+ sk_mark_napi_id_set(child, skb);
+
/* Now finish processing the fastopen child socket. */
tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
@@ -401,6 +404,7 @@ fastopen:
}
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
+ tcp_sk(child)->syn_fastopen_child = 1;
return child;
}
NET_INC_STATS(sock_net(sk),
@@ -468,7 +472,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
}
return false;
}
-EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+EXPORT_IPV6_MOD(tcp_fastopen_defer_connect);
/*
* The following code block is to deal with middle box issues with TFO:
@@ -555,6 +559,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net_device *dev;
struct dst_entry *dst;
struct sk_buff *skb;
@@ -572,7 +577,8 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
dst = sk_dst_get(sk);
- if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+ dev = dst ? dst_dev(dst) : NULL;
+ if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
dst_release(dst);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bdf13ac26ef..71b76e98371a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -102,6 +102,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */
+#define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -118,18 +119,18 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
-void clean_acked_data_enable(struct inet_connection_sock *icsk,
+void clean_acked_data_enable(struct tcp_sock *tp,
void (*cad)(struct sock *sk, u32 ack_seq))
{
- icsk->icsk_clean_acked = cad;
+ tp->tcp_clean_acked = cad;
static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);
-void clean_acked_data_disable(struct inet_connection_sock *icsk)
+void clean_acked_data_disable(struct tcp_sock *tp)
{
static_branch_slow_dec_deferred(&clean_acked_data_enabled);
- icsk->icsk_clean_acked = NULL;
+ tp->tcp_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);
@@ -169,6 +170,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
@@ -185,6 +187,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op,
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = bpf_op;
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
if (skb)
@@ -243,9 +246,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
do_div(val, skb->truesize);
tcp_sk(sk)->scaling_ratio = val ? val : 1;
- if (old_ratio != tcp_sk(sk)->scaling_ratio)
- WRITE_ONCE(tcp_sk(sk)->window_clamp,
- tcp_win_from_space(sk, sk->sk_rcvbuf));
+ if (old_ratio != tcp_sk(sk)->scaling_ratio) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ val = tcp_win_from_space(sk, sk->sk_rcvbuf);
+ tcp_set_window_clamp(sk, val);
+
+ if (tp->window_clamp < tp->rcvq_space.space)
+ tp->rcvq_space.space = tp->window_clamp;
+ }
}
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
@@ -325,15 +334,14 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct dst_entry *dst = __sk_dst_get(sk);
- return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ return icsk->icsk_ack.dst_quick_ack ||
(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
- if (tp->ecn_flags & TCP_ECN_OK)
+ if (tcp_ecn_mode_rfc3168(tp))
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
@@ -356,10 +364,13 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
-static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (tcp_ecn_disabled(tp))
+ return;
+
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -388,31 +399,39 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
}
}
-static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
-{
- if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
- __tcp_ecn_check_ce(sk, skb);
-}
-
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
- if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
- tp->ecn_flags &= ~TCP_ECN_OK;
+ if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
- if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
- tp->ecn_flags &= ~TCP_ECN_OK;
+ if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
- if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
+ if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
return true;
return false;
}
+static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
+{
+ tp->delivered_ce += ecn_count;
+}
+
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+ bool ece_ack)
+{
+ tp->delivered += delivered;
+ if (ece_ack)
+ tcp_count_delivered_ce(tp, delivered);
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -630,7 +649,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
@@ -645,10 +664,12 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt_us;
- long m = sample;
+ u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+ long m = sample << 3;
- if (new_sample != 0) {
+ if (old_sample == 0 || m < old_sample) {
+ new_sample = m;
+ } else {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
@@ -659,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* else with timestamps disabled convergence takes too
* long.
*/
- if (!win_dep) {
- m -= (new_sample >> 3);
- new_sample += m;
- } else {
- m <<= 3;
- if (m < new_sample)
- new_sample = m;
- }
- } else {
- /* No previous measure. */
- new_sample = m << 3;
+ if (win_dep)
+ return;
+ /* Do not use this sample if receive queue is not empty. */
+ if (tp->rcv_nxt != tp->copied_seq)
+ return;
+ new_sample = old_sample - (old_sample >> 3) + sample;
}
tp->rcv_rtt_est.rtt_us = new_sample;
@@ -693,7 +709,7 @@ new_measure:
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
{
u32 delta, delta_us;
@@ -703,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
if (!delta)
- delta = 1;
+ delta = min_delta;
delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
return delta_us;
}
@@ -721,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
- s32 delta = tcp_rtt_tsopt_us(tp);
+ s32 delta = tcp_rtt_tsopt_us(tp, 0);
- if (delta >= 0)
+ if (delta > 0)
tcp_rcv_rtt_update(tp, delta, 0);
}
}
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int rcvwin, rcvbuf, cap;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return;
+
+ /* slow start: allow the sender to double its rate. */
+ rcvwin = tp->rcvq_space.space << 1;
+
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+ rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ /* Make the window clamp follow along. */
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
+ }
+}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
@@ -735,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 copied;
- int time;
+ int time, inq, copied;
trace_tcp_rcv_space_adjust(sk);
@@ -747,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)
/* Number of bytes copied to user in last RTT */
copied = tp->copied_seq - tp->rcvq_space.seq;
+ /* Number of bytes in receive queue. */
+ inq = tp->rcv_nxt - tp->copied_seq;
+ copied -= inq;
if (copied <= tp->rcvq_space.space)
goto new_measure;
- /* A bit of theory :
- * copied = bytes received in previous RTT, our base window
- * To cope with packet losses, we need a 2x factor
- * To cope with slow start, and sender growing its cwin by 100 %
- * every RTT, we need a 4x factor, because the ACK we are sending
- * now is for the next RTT, not the current one :
- * <prev RTT . ><current RTT .. ><next RTT .... >
- */
-
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- u64 rcvwin, grow;
- int rcvbuf;
-
- /* minimal window to cope with packet losses, assuming
- * steady state. Add some cushion because of small variations.
- */
- rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
- /* Accommodate for sender rate increase (eg. slow start) */
- grow = rcvwin * (copied - tp->rcvq_space.space);
- do_div(grow, tp->rcvq_space.space);
- rcvwin += (grow << 1);
+ trace_tcp_rcvbuf_grow(sk, time);
- rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
- if (rcvbuf > sk->sk_rcvbuf) {
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
- /* Make the window clamp follow along. */
- WRITE_ONCE(tp->window_clamp,
- tcp_win_from_space(sk, rcvbuf));
- }
- }
tp->rcvq_space.space = copied;
+ tcp_rcvbuf_grow(sk);
+
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tp->tcp_mstamp;
@@ -851,7 +865,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
icsk->icsk_ack.lrcvtime = now;
tcp_save_lrcv_flowlabel(sk, skb);
- tcp_ecn_check_ce(sk, skb);
+ tcp_data_ecn_check(sk, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb, true);
@@ -1148,15 +1162,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
}
}
-/* Updates the delivered and delivered_ce counts */
-static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
- bool ece_ack)
-{
- tp->delivered += delivered;
- if (ece_ack)
- tp->delivered_ce += delivered;
-}
-
/* This procedure tags the retransmission queue when SACKs arrive.
*
* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1446,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
-
- /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (tp->lost_skb_hint &&
- before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
- tp->lost_cnt_hint += pcount;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1491,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
- if (skb == tp->lost_skb_hint)
- tp->lost_cnt_hint += pcount;
-
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
@@ -1526,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->lost_skb_hint) {
- tp->lost_skb_hint = prev;
- tp->lost_cnt_hint -= tcp_skb_pcount(prev);
- }
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
@@ -2146,12 +2139,6 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
tp->undo_retrans = -1;
}
-static bool tcp_is_rack(const struct sock *sk)
-{
- return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
- TCP_RACK_LOSS_DETECTION;
-}
-
/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
@@ -2177,8 +2164,7 @@ static void tcp_timeout_mark_lost(struct sock *sk)
skb_rbtree_walk_from(skb) {
if (is_reneg)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- else if (tcp_is_rack(sk) && skb != head &&
- tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0)
continue; /* Don't mark recently sent ones lost yet */
tcp_mark_skb_lost(sk, skb);
}
@@ -2252,30 +2238,13 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
msecs_to_jiffies(10));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- delay, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false);
*ack_flag &= ~FLAG_SET_XMIT_TIMER;
return true;
}
return false;
}
-/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
- * counter when SACK is enabled (without SACK, sacked_out is used for
- * that purpose).
- *
- * With reordering, holes may still be in flight, so RFC3517 recovery
- * uses pure sacked_out (total number of SACKed segments) even though
- * it violates the RFC that uses duplicate ACKs, often these are equal
- * but when e.g. out-of-window ACKs or packet duplication occurs,
- * they differ. Since neither occurs due to loss, TCP should really
- * ignore them.
- */
-static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
-{
- return tp->sacked_out + 1;
-}
-
/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
@@ -2328,13 +2297,7 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
*
* If the receiver supports SACK:
*
- * RFC6675/3517: It is the conventional algorithm. A packet is
- * considered lost if the number of higher sequence packets
- * SACKed is greater than or equal the DUPACK thoreshold
- * (reordering). This is implemented in tcp_mark_head_lost and
- * tcp_update_scoreboard.
- *
- * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * RACK (RFC8985): RACK is a newer loss detection algorithm
* (2017-) that checks timing instead of counting DUPACKs.
* Essentially a packet is considered lost if it's not S/ACKed
* after RTT + reordering_window, where both metrics are
@@ -2349,8 +2312,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Really tricky (and requiring careful tuning) part of algorithm
- * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The really tricky (and requiring careful tuning) part of the algorithm
+ * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
@@ -2373,83 +2336,10 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static bool tcp_time_to_recover(struct sock *sk, int flag)
+static bool tcp_time_to_recover(const struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* Trick#1: The loss is proven. */
- if (tp->lost_out)
- return true;
-
- /* Not-A-Trick#2 : Classic rule... */
- if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
- return true;
-
- return false;
-}
-
-/* Detect loss in event "A" above by marking head of queue up as lost.
- * For RFC3517 SACK, a segment is considered lost if it
- * has at least tp->reordering SACKed seqments above it; "packets" refers to
- * the maximum SACKed segments to pass before reaching this limit.
- */
-static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt;
- /* Use SACK to deduce losses of new sequences sent during recovery */
- const u32 loss_high = tp->snd_nxt;
-
- WARN_ON(packets > tp->packets_out);
- skb = tp->lost_skb_hint;
- if (skb) {
- /* Head already handled? */
- if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
- return;
- cnt = tp->lost_cnt_hint;
- } else {
- skb = tcp_rtx_queue_head(sk);
- cnt = 0;
- }
-
- skb_rbtree_walk_from(skb) {
- /* TODO: do this better */
- /* this is not the most efficient way to do this... */
- tp->lost_skb_hint = skb;
- tp->lost_cnt_hint = cnt;
-
- if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
- break;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- cnt += tcp_skb_pcount(skb);
-
- if (cnt > packets)
- break;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
- tcp_mark_skb_lost(sk, skb);
-
- if (mark_head)
- break;
- }
- tcp_verify_left_out(tp);
-}
-
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tcp_is_sack(tp)) {
- int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto >= 0)
- tcp_mark_head_lost(sk, sacked_upto, 0);
- else if (fast_rexmit)
- tcp_mark_head_lost(sk, 1, 1);
- }
+ /* Has loss detection marked at least one packet lost? */
+ return tp->lost_out != 0;
}
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
@@ -2475,20 +2365,33 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
const struct sock *sk = (const struct sock *)tp;
- if (tp->retrans_stamp &&
- tcp_tsopt_ecr_before(tp, tp->retrans_stamp))
- return true; /* got echoed TS before first retransmission */
+ /* Received an echoed timestamp before the first retransmission? */
+ if (tp->retrans_stamp)
+ return tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
+
+ /* We set tp->retrans_stamp upon the first retransmission of a loss
+ * recovery episode, so normally if tp->retrans_stamp is 0 then no
+ * retransmission has happened yet (likely due to TSQ, which can cause
+ * fast retransmits to be delayed). So if snd_una advanced while
+ * tp->retrans_stamp is 0, then apparently a packet was merely delayed,
+ * not lost. But there are exceptions where we retransmit but then
+ * clear tp->retrans_stamp, so we check for those exceptions.
+ */
- /* Check if nothing was retransmitted (retrans_stamp==0), which may
- * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp
- * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear
- * retrans_stamp even if we had retransmitted the SYN.
+ /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen()
+ * clears tp->retrans_stamp when snd_una == high_seq.
*/
- if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */
- sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */
- return true; /* nothing was retransmitted */
+ if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq))
+ return false;
- return false;
+ /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp
+ * when FLAG_SYN_ACKED is set, even if the SYN was
+ * retransmitted.
+ */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return false;
+
+ return true; /* tp->retrans_stamp is zero; no retransmit yet */
}
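
A minimal standalone sketch of the "echoed timestamp before the first retransmission" test relied on above; the helper name and parameters below are illustrative only, not kernel code. If the peer echoes a TSecr value that is older (in wraparound-safe terms) than the timestamp recorded at the first retransmission, the ACK must cover an original transmission, so the data was merely delayed rather than lost:

#include <stdint.h>

/* Roughly what tcp_tsopt_ecr_before() checks: before(echoed, when). */
static int ack_covers_original_xmit(uint32_t echoed_tsecr,
				    uint32_t first_retrans_tsval)
{
	/* Signed view of the unsigned difference is wraparound-safe. */
	return (int32_t)(echoed_tsecr - first_retrans_tsval) < 0;
}
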
/* Undo procedures. */
@@ -2710,6 +2613,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
+ trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
+
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
@@ -2875,8 +2780,6 @@ void tcp_simple_retransmit(struct sock *sk)
tcp_mark_skb_lost(sk, skb);
}
- tcp_clear_retrans_hints_partial(tp);
-
if (!tp->lost_out)
return;
@@ -2892,7 +2795,7 @@ void tcp_simple_retransmit(struct sock *sk)
*/
tcp_non_congestion_loss_retransmit(sk);
}
-EXPORT_SYMBOL(tcp_simple_retransmit);
+EXPORT_IPV6_MOD(tcp_simple_retransmit);
void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
@@ -2984,17 +2887,8 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
*rexmit = REXMIT_LOST;
}
-static bool tcp_force_fast_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- return after(tcp_highest_sack_seq(tp),
- tp->snd_una + tp->reordering * tp->mss_cache);
-}
-
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
- bool *do_lost)
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3019,9 +2913,6 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
tcp_undo_cwnd_reduction(sk, true);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
- } else {
- /* Partial ACK arrived. Force fast retransmit. */
- *do_lost = tcp_force_fast_retransmit(sk);
}
return false;
}
@@ -3035,7 +2926,7 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
if (unlikely(tcp_is_reno(tp))) {
tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
- } else if (tcp_is_rack(sk)) {
+ } else {
u32 prior_retrans = tp->retrans_out;
if (tcp_rack_mark_lost(sk))
@@ -3062,10 +2953,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int fast_rexmit = 0, flag = *ack_flag;
+ int flag = *ack_flag;
bool ece_ack = flag & FLAG_ECE;
- bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
@@ -3114,7 +3003,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp))
tcp_add_reno_sack(sk, num_dupack, ece_ack);
- } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
+ } else if (tcp_try_undo_partial(sk, prior_snd_una))
return;
if (tcp_try_undo_dsack(sk))
@@ -3122,7 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_identify_packet_loss(sk, ack_flag);
if (icsk->icsk_ca_state != TCP_CA_Recovery) {
- if (!tcp_time_to_recover(sk, flag))
+ if (!tcp_time_to_recover(tp))
return;
/* Undo reverts the recovery state. If loss is evident,
* starts a new recovery (e.g. reordering then loss);
@@ -3151,7 +3040,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tcp_try_undo_dsack(sk);
tcp_identify_packet_loss(sk, ack_flag);
- if (!tcp_time_to_recover(sk, flag)) {
+ if (!tcp_time_to_recover(tp)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -3169,11 +3058,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, ece_ack);
- fast_rexmit = 1;
}
- if (!tcp_is_rack(sk) && do_lost)
- tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
@@ -3215,7 +3101,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
- seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+ seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
@@ -3282,8 +3168,7 @@ void tcp_rearm_rto(struct sock *sk)
*/
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
- tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true);
}
}
@@ -3430,8 +3315,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
- if (unlikely(skb == tp->lost_skb_hint))
- tp->lost_skb_hint = NULL;
tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3489,14 +3372,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (flag & FLAG_RETRANS_DATA_ACKED)
flag &= ~FLAG_ORIG_SACK_ACKED;
} else {
- int delta;
-
/* Non-retransmitted hole got filled? That's reordering */
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
-
- delta = prior_sacked - tp->sacked_out;
- tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
@@ -3560,10 +3438,10 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using!
*/
} else {
- unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));
when = tcp_clamp_probe0_to_user_timeout(sk, when);
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);
}
}
@@ -3808,8 +3686,16 @@ static void tcp_store_ts_recent(struct tcp_sock *tp)
tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}
-static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta)
+{
+ tcp_store_ts_recent(tp);
+ return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0;
+}
+
+static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
+ s32 delta;
+
if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
@@ -3818,13 +3704,17 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* Not only, also it occurs for expired timestamps.
*/
- if (tcp_paws_check(&tp->rx_opt, 0))
- tcp_store_ts_recent(tp);
+ if (tcp_paws_check(&tp->rx_opt, 0)) {
+ delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent;
+ return __tcp_replace_ts_recent(tp, delta);
+ }
}
+
+ return 0;
}
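
The FLAG_TS_PROGRESS decision above hinges on the signed 32-bit difference between the incoming TSval and ts_recent. Here is a minimal sketch, assuming nothing beyond standard C (the function name and sample values are made up for illustration), of why that signed difference orders timestamps correctly across u32 wraparound:

#include <stdint.h>
#include <stdio.h>

static int tsval_progressed(uint32_t new_tsval, uint32_t ts_recent)
{
	/* 0x00000001 counts as "after" 0xffffffff. */
	return (int32_t)(new_tsval - ts_recent) > 0;
}

int main(void)
{
	printf("%d\n", tsval_progressed(100, 50));		/* 1 */
	printf("%d\n", tsval_progressed(1, 0xffffffffu));	/* 1: wrapped */
	printf("%d\n", tsval_progressed(50, 100));		/* 0 */
	return 0;
}
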
/* This routine deals with acks during a TLP episode and ends an episode by
- * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
+ * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
@@ -3856,12 +3746,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
}
}
-static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+static void tcp_in_ack_event(struct sock *sk, int flag)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->in_ack_event)
- icsk->icsk_ca_ops->in_ack_event(sk, flags);
+ if (icsk->icsk_ca_ops->in_ack_event) {
+ u32 ack_ev_flags = 0;
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (flag & FLAG_SLOWPATH) {
+ ack_ev_flags |= CA_ACK_SLOWPATH;
+ if (flag & FLAG_ECE)
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags);
+ }
}
/* Congestion control has updated the cwnd already. So if we're in
@@ -3954,8 +3855,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
- if (icsk->icsk_clean_acked)
- icsk->icsk_clean_acked(sk, ack);
+ if (tp->tcp_clean_acked)
+ tp->tcp_clean_acked(sk, ack);
#endif
}
@@ -3966,7 +3867,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* is in window.
*/
if (flag & FLAG_UPDATE_TS_RECENT)
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+ flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
FLAG_SND_UNA_ADVANCED) {
@@ -3978,12 +3879,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
- tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
- u32 ack_ev_flags = CA_ACK_SLOWPATH;
-
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
@@ -3995,19 +3892,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
- ack_ev_flags |= CA_ACK_ECE;
- }
if (sack_state.sack_delivered)
tcp_count_delivered(tp, sack_state.sack_delivered,
flag & FLAG_ECE);
-
- if (flag & FLAG_WIN_UPDATE)
- ack_ev_flags |= CA_ACK_WIN_UPDATE;
-
- tcp_in_ack_event(sk, ack_ev_flags);
}
/* This is a deviation from RFC3168 since it states that:
@@ -4034,6 +3924,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ tcp_in_ack_event(sk, flag);
+
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
@@ -4065,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
@@ -4174,7 +4067,6 @@ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
}
return mss;
}
-EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
@@ -4450,34 +4342,40 @@ static u32 tcp_tsval_replay(const struct sock *sk)
return inet_csk(sk)->icsk_rto * 1200 / HZ;
}
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+ const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
- u32 seq = TCP_SKB_CB(skb)->seq;
+ SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
- return /* 1. Pure ACK with correct sequence number. */
- (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+ /* 1. Is this not a pure ACK ? */
+ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+ return reason;
- /* 2. ... and duplicate ACK. */
- ack == tp->snd_una &&
+ /* 2. Is its sequence not the expected one ? */
+ if (seq != tp->rcv_nxt)
+ return before(seq, tp->rcv_nxt) ?
+ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+ reason;
- /* 3. ... and does not update window. */
- !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+ /* 3. Is this not a duplicate ACK ? */
+ if (ack != tp->snd_una)
+ return reason;
- /* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
- tcp_tsval_replay(sk);
-}
+ /* 4. Is this updating the window ? */
+ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+ tp->rx_opt.snd_wscale))
+ return reason;
-static inline bool tcp_paws_discard(const struct sock *sk,
- const struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
+ /* 5. Is this not in the replay window ? */
+ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+ tcp_tsval_replay(sk))
+ return reason;
- return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
- !tcp_disordered_ack(sk, skb);
+ return 0;
}
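
Step 5 above tolerates a pure duplicate ACK whose timestamp is only slightly older than ts_recent, with the tolerance supplied by tcp_tsval_replay(): the RTO scaled by 1200 / HZ, i.e. converted from jiffies to an assumed 1000 Hz peer timestamp clock with a 20% margin. A hedged standalone sketch of that check, where the function and the 1000 Hz assumption are illustrative only:

#include <stdint.h>

static int within_replay_window(uint32_t ts_recent, uint32_t rcv_tsval,
				uint32_t rto_jiffies, uint32_t hz)
{
	/* rto_jiffies * (1000 / hz) ms, plus 20%, i.e. * 1200 / hz */
	uint32_t window = rto_jiffies * 1200 / hz;

	return (int32_t)(ts_recent - rcv_tsval) <= (int32_t)window;
}
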
/* Check segment sequence number for validity.
@@ -4493,14 +4391,22 @@ static inline bool tcp_paws_discard(const struct sock *sk,
* (borrowed from freebsd)
*/
-static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
+static enum skb_drop_reason tcp_sequence(const struct sock *sk,
u32 seq, u32 end_seq)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
if (before(end_seq, tp->rcv_wup))
return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
- if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
- return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+ if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
+ if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+ /* Only accept this packet if receive queue is empty. */
+ if (skb_queue_len(&sk->sk_receive_queue))
+ return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
+ }
return SKB_NOT_DROPPED_YET;
}
@@ -4518,7 +4424,7 @@ void tcp_done_with_error(struct sock *sk, int err)
if (!sock_flag(sk, SOCK_DEAD))
sk_error_report(sk);
}
-EXPORT_SYMBOL(tcp_done_with_error);
+EXPORT_IPV6_MOD(tcp_done_with_error);
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
@@ -4947,8 +4853,9 @@ static void tcp_ofo_queue(struct sock *sk)
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
+
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
- dsack_high = TCP_SKB_CB(skb)->end_seq;
+ dsack = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
p = rb_next(p);
@@ -4964,7 +4871,7 @@ static void tcp_ofo_queue(struct sock *sk)
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
else
kfree_skb_partial(skb, fragstolen);
@@ -4981,10 +4888,20 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
-static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+/* Check if this incoming skb can be added to socket receive queues
+ * while satisfying sk->sk_rcvbuf limit.
+ */
+static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
+{
+ unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+
+ return new_mem <= sk->sk_rcvbuf;
+}
+
+static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
unsigned int size)
{
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ if (!tcp_can_ingest(sk, skb) ||
!sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk, skb) < 0)
@@ -5007,7 +4924,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
bool fragstolen;
tcp_save_lrcv_flowlabel(sk, skb);
- tcp_ecn_check_ce(sk, skb);
+ tcp_data_ecn_check(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -5016,6 +4933,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
return;
}
+ tcp_measure_rcv_mss(sk, skb);
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
@@ -5143,6 +5061,9 @@ end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ tcp_rcvbuf_grow(sk);
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -5156,7 +5077,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
skb, fragstolen)) ? 1 : 0;
tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
skb_set_owner_r(skb, sk);
}
return eaten;
@@ -5239,7 +5160,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
return;
}
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
reason = SKB_DROP_REASON_NOT_SPECIFIED;
@@ -5596,7 +5517,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ if (tcp_can_ingest(sk, in_skb) &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
@@ -5628,14 +5549,18 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ /* Do nothing if our queues are empty. */
+ if (!atomic_read(&sk->sk_rmem_alloc))
+ return -1;
+
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ if (!tcp_can_ingest(sk, in_skb))
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
tcp_adjust_rcv_ssthresh(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
tcp_collapse_ofo_queue(sk);
@@ -5645,7 +5570,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
NULL,
tp->copied_seq, tp->rcv_nxt);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* Collapsing did not help, destructive actions follow.
@@ -5653,7 +5578,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
tcp_prune_ofo_queue(sk, in_skb);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* If we are really being abused, tell the caller to silently
@@ -5949,25 +5874,37 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
- tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- if (unlikely(th->syn))
- goto syn_challenge;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
- if (!tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDPAWS,
- &tp->last_oow_ack_time))
- tcp_send_dupack(sk, skb);
- SKB_DR_SET(reason, TCP_RFC7323_PAWS);
- goto discard;
- }
- /* Reset is accepted even if it did not pass PAWS. */
+ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+ !tp->rx_opt.saw_tstamp ||
+ tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+ goto step1;
+
+ reason = tcp_disordered_ack_check(sk, skb);
+ if (!reason)
+ goto step1;
+ /* Reset is accepted even if it did not pass PAWS. */
+ if (th->rst)
+ goto step1;
+ if (unlikely(th->syn))
+ goto syn_challenge;
+
+ /* Old ACKs are common; increment PAWS_OLD_ACK
+ * and do not send a dupack.
+ */
+ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+ goto discard;
}
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+step1:
/* Step 1: check sequence number */
- reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (reason) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
@@ -5978,6 +5915,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (!th->rst) {
if (th->syn)
goto syn_challenge;
+
+ if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE ||
+ reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_BEYOND_WINDOW);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
@@ -6132,6 +6074,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
+ s32 delta = 0;
+ int flag = 0;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
@@ -6144,8 +6088,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
+ delta = tp->rx_opt.rcv_tsval -
+ tp->rx_opt.ts_recent;
/* If PAWS failed, check it more carefully in slow path */
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ if (delta < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
@@ -6165,12 +6111,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
/* We know that such packets are checksummed
* on entry.
*/
- tcp_ack(sk, skb, 0);
+ tcp_ack(sk, skb, flag);
__kfree_skb(skb);
tcp_data_snd_check(sk);
/* When receiving pure ack in fast path, update
@@ -6191,6 +6138,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_checksum_complete(skb))
goto csum_error;
+ if (after(TCP_SKB_CB(skb)->end_seq,
+ tp->rcv_nxt + tcp_receive_window(tp)))
+ goto validate;
+
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -6201,14 +6152,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
@@ -6216,7 +6168,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
- tcp_ack(sk, skb, FLAG_DATA);
+ tcp_ack(sk, skb, flag | FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
@@ -6245,7 +6197,7 @@ slow_path:
/*
* Standard slow path.
*/
-
+validate:
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
@@ -6276,7 +6228,7 @@ csum_error:
discard:
tcp_drop_reason(sk, skb, reason);
}
-EXPORT_SYMBOL(tcp_rcv_established);
+EXPORT_IPV6_MOD(tcp_rcv_established);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
@@ -6329,7 +6281,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -6452,9 +6404,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
/* Previous FIN/ACK or RST/ACK might be ignored. */
if (icsk->icsk_retransmits == 0)
- inet_csk_reset_xmit_timer(sk,
- ICSK_TIME_RETRANS,
- TCP_TIMEOUT_MIN, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, false);
SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
goto reset_and_undo;
}
@@ -6569,8 +6520,8 @@ consume:
*/
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, false);
goto consume;
}
tcp_send_ack(sk);
@@ -6788,10 +6739,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
- SKB_DR_SET(reason, TCP_FASTOPEN);
+ SKB_DR_SET(reason, TCP_FASTOPEN);
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason))
goto discard;
- }
}
if (!th->ack && !th->rst && !th->syn) {
@@ -6827,6 +6777,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
if (req) {
tcp_rcv_synrecv_state_fastopen(sk);
} else {
@@ -6852,9 +6805,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
@@ -6904,7 +6854,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
@@ -6912,7 +6862,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
- inet_csk_reset_keepalive_timer(sk, tmo);
+ tcp_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto consume;
@@ -6990,7 +6940,7 @@ consume:
__kfree_skb(skb);
return 0;
}
-EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_IPV6_MOD(tcp_rcv_state_process);
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
@@ -7057,6 +7007,7 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = 0;
+ tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
@@ -7172,7 +7123,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
return mss;
}
-EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss);
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
@@ -7328,6 +7279,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
req->timeout))) {
reqsk_free(req);
+ dst_release(dst);
return 0;
}
@@ -7352,4 +7304,4 @@ drop:
tcp_listendrop(sk);
return 0;
}
-EXPORT_SYMBOL(tcp_conn_request);
+EXPORT_IPV6_MOD(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a38c8b1f44db..84d3d556ed80 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -58,7 +58,9 @@
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sock_diag.h>
+#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
@@ -66,6 +68,7 @@
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
@@ -92,7 +95,6 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
#endif
struct inet_hashinfo tcp_hashinfo;
-EXPORT_SYMBOL(tcp_hashinfo);
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
@@ -120,6 +122,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
int ts_recent_stamp;
+ u32 reuse_thresh;
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
reuse = 0;
@@ -162,9 +165,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
if (ts_recent_stamp &&
- (!twp || (reuse && time_after32(ktime_get_seconds(),
- ts_recent_stamp)))) {
+ (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
* and releasing the bucket lock.
*/
@@ -197,7 +201,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
return 0;
}
-EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
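
The tw_reuse_delay test above is a wraparound-safe comparison of a millisecond clock against "time the timewait entry was created plus the configured delay". A minimal sketch of the same check with made-up names, using the fact that time_after32(a, b) expands to (s32)(b - a) < 0:

#include <stdint.h>

static int tw_reusable(uint32_t now_ms, uint32_t entry_ms, uint32_t delay_ms)
{
	uint32_t reuse_thresh = entry_ms + delay_ms;

	/* time_after32(now_ms, reuse_thresh) */
	return (int32_t)(reuse_thresh - now_ms) < 0;
}
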
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
@@ -357,7 +361,7 @@ failure:
inet->inet_dport = 0;
return err;
}
-EXPORT_SYMBOL(tcp_v4_connect);
+EXPORT_IPV6_MOD(tcp_v4_connect);
/*
* This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
@@ -398,7 +402,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
-EXPORT_SYMBOL(tcp_v4_mtu_reduced);
+EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
@@ -432,7 +436,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
}
reqsk_put(req);
}
-EXPORT_SYMBOL(tcp_req_err);
+EXPORT_IPV6_MOD(tcp_req_err);
/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
@@ -456,15 +460,14 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
icsk->icsk_backoff--;
icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
- icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
tcp_mstamp_refresh(tp);
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
if (remaining > 0) {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- remaining, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
} else {
/* RTO revert clocked out retransmission.
* Will retransmit now.
@@ -472,7 +475,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
tcp_retransmit_timer(sk);
}
}
-EXPORT_SYMBOL(tcp_ld_RTO_revert);
+EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
/*
* This routine is called by the ICMP module when it gets some
@@ -494,14 +497,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
- struct tcp_sock *tp;
+ struct net *net = dev_net_rcu(skb->dev);
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
- struct sock *sk;
struct request_sock *fastopen;
+ struct tcp_sock *tp;
u32 seq, snd_una;
+ struct sock *sk;
int err;
- struct net *net = dev_net(skb->dev);
sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->daddr, th->dest, iph->saddr,
@@ -674,7 +677,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
-EXPORT_SYMBOL(tcp_v4_send_check);
+EXPORT_IPV6_MOD(tcp_v4_send_check);
#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
@@ -786,7 +789,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
/* Invalid TCP option size or twice included auth */
if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
@@ -887,7 +890,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
- arg.tos = ip_hdr(skb)->tos;
+ /* ECN bits of TW reset are cleared */
+ arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
@@ -896,7 +900,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
transmit_time = tcp_transmit_time(sk);
@@ -1033,11 +1037,21 @@ static void tcp_v4_send_ack(const struct sock *sk,
local_bh_enable();
}
-static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
+ enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
struct tcp_key key = {};
+ u8 tos = tw->tw_tos;
+
+ /* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
+ * while leaving the ECN bits of other TW ACKs untouched, so those
+ * ACKs are not placed in a different service queue (Classic rather
+ * than L4S).
+ */
+ if (tw_status == TCP_TW_ACK_OOW)
+ tos &= ~INET_ECN_MASK;
+
#ifdef CONFIG_TCP_AO
struct tcp_ao_info *ao_info;
@@ -1081,7 +1095,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
READ_ONCE(tcptw->tw_ts_recent),
tw->tw_bound_dev_if, &key,
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
- tw->tw_tos,
+ tos,
tw->tw_txhash);
inet_twsk_put(tw);
@@ -1151,14 +1165,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
key.type = TCP_KEY_MD5;
}
+ /* Clean the ECN bits of TW ACKs for oow data or paws_reject */
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
tcp_rsk_tsval(tcp_rsk(req)),
- READ_ONCE(req->ts_recent),
+ req->ts_recent,
0, &key,
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
- ip_hdr(skb)->tos,
+ ip_hdr(skb)->tos & ~INET_ECN_MASK,
READ_ONCE(tcp_rsk(req)->txhash));
if (tcp_key_is_ao(&key))
kfree(key.traffic_key);
@@ -1229,7 +1244,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
*/
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
-EXPORT_SYMBOL(tcp_md5_needed);
+EXPORT_IPV6_MOD(tcp_md5_needed);
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
@@ -1288,7 +1303,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
}
return best_match;
}
-EXPORT_SYMBOL(__tcp_md5_do_lookup);
+EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
@@ -1335,7 +1350,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
-EXPORT_SYMBOL(tcp_v4_md5_lookup);
+EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
@@ -1431,7 +1446,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
newkey, newkeylen, GFP_KERNEL);
}
-EXPORT_SYMBOL(tcp_md5_do_add);
+EXPORT_IPV6_MOD(tcp_md5_do_add);
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
int family, u8 prefixlen, int l3index,
@@ -1463,7 +1478,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
key->flags, key->key, key->keylen,
sk_gfp_mask(sk, GFP_ATOMIC));
}
-EXPORT_SYMBOL(tcp_md5_key_copy);
+EXPORT_IPV6_MOD(tcp_md5_key_copy);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
u8 prefixlen, int l3index, u8 flags)
@@ -1478,7 +1493,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
kfree_rcu(key, rcu);
return 0;
}
-EXPORT_SYMBOL(tcp_md5_do_del);
+EXPORT_IPV6_MOD(tcp_md5_do_del);
void tcp_clear_md5_list(struct sock *sk)
{
@@ -1657,7 +1672,7 @@ clear_hash_nostart:
memset(md5_hash, 0, 16);
return 1;
}
-EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
#endif
@@ -1690,7 +1705,6 @@ static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
@@ -1730,7 +1744,7 @@ drop:
tcp_listendrop(sk);
return 0;
}
-EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_IPV6_MOD(tcp_v4_conn_request);
/*
@@ -1768,10 +1782,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- sk_daddr_set(newsk, ireq->ir_rmt_addr);
- sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
- newsk->sk_bound_dev_if = ireq->ir_iif;
- newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = rcu_dereference(ireq->ireq_opt);
RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
newinet->mc_index = inet_iif(skb);
@@ -1854,7 +1864,7 @@ put_and_exit:
tcp_done(newsk);
goto exit;
}
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
@@ -1965,7 +1975,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
@@ -2016,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
u32 gso_size;
u64 limit;
int delta;
+ int err;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -2025,7 +2036,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
*/
skb_condense(skb);
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
if (unlikely(tcp_checksum_complete(skb))) {
bh_unlock_sock(sk);
@@ -2055,7 +2066,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
!((TCP_SKB_CB(tail)->tcp_flags &
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
((TCP_SKB_CB(tail)->tcp_flags ^
- TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+ TCP_SKB_CB(skb)->tcp_flags) &
+ (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
!tcp_skb_can_collapse_rx(tail, skb) ||
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
@@ -2125,23 +2137,29 @@ no_coalesce:
limit = min_t(u64, limit, UINT_MAX);
- if (unlikely(sk_add_backlog(sk, skb, limit))) {
+ err = sk_add_backlog(sk, skb, limit);
+ if (unlikely(err)) {
bh_unlock_sock(sk);
- *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ if (err == -ENOMEM) {
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ } else {
+ *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ }
return true;
}
return false;
}
-EXPORT_SYMBOL(tcp_add_backlog);
+EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
{
struct tcphdr *th = (struct tcphdr *)skb->data;
- return sk_filter_trim_cap(sk, skb, th->doff * 4);
+ return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
}
-EXPORT_SYMBOL(tcp_filter);
+EXPORT_IPV6_MOD(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
@@ -2163,7 +2181,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
@@ -2176,8 +2194,9 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
enum skb_drop_reason drop_reason;
+ enum tcp_tw_status tw_status;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@@ -2265,13 +2284,12 @@ lookup:
}
refcounted = true;
nsk = NULL;
- if (!tcp_filter(sk, skb)) {
+ if (!tcp_filter(sk, skb, &drop_reason)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
- nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
- } else {
- drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
+ &drop_reason);
}
if (!nsk) {
reqsk_put(req);
@@ -2327,10 +2345,9 @@ process:
nf_reset_ct(skb);
- if (tcp_filter(sk, skb)) {
- drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ if (tcp_filter(sk, skb, &drop_reason))
goto discard_and_relse;
- }
+
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
@@ -2404,7 +2421,10 @@ do_time_wait:
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
+
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
+ switch (tw_status) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(net,
net->ipv4.tcp_death_row.hashinfo,
@@ -2425,7 +2445,8 @@ do_time_wait:
/* to ACK */
fallthrough;
case TCP_TW_ACK:
- tcp_v4_timewait_ack(sk, skb);
+ case TCP_TW_ACK_OOW:
+ tcp_v4_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
@@ -2450,7 +2471,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
-EXPORT_SYMBOL(inet_sk_rx_dst_set);
+EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
@@ -2462,11 +2483,9 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
.mtu_reduced = tcp_v4_mtu_reduced,
};
-EXPORT_SYMBOL(ipv4_specific);
+EXPORT_IPV6_MOD(ipv4_specific);
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -2576,7 +2595,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
sk_sockets_allocated_dec(sk);
}
-EXPORT_SYMBOL(tcp_v4_destroy_sock);
+EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
@@ -2812,7 +2831,7 @@ out:
st->last_pos = *pos;
return rc;
}
-EXPORT_SYMBOL(tcp_seq_start);
+EXPORT_IPV6_MOD(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
@@ -2843,7 +2862,7 @@ out:
st->last_pos = *pos;
return rc;
}
-EXPORT_SYMBOL(tcp_seq_next);
+EXPORT_IPV6_MOD(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
@@ -2861,7 +2880,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v)
break;
}
}
-EXPORT_SYMBOL(tcp_seq_stop);
+EXPORT_IPV6_MOD(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i)
@@ -2882,7 +2901,7 @@ static void get_openreq4(const struct request_sock *req,
jiffies_delta_to_clock_t(delta),
req->num_timeout,
from_kuid_munged(seq_user_ns(f),
- sock_i_uid(req->rsk_listener)),
+ sk_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2910,10 +2929,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk->icsk_timeout;
+ timer_expires = icsk_timeout(icsk);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk->icsk_timeout;
+ timer_expires = icsk_timeout(icsk);
} else if (timer_pending(&sk->sk_timer)) {
timer_active = 2;
timer_expires = sk->sk_timer.expires;
@@ -2940,7 +2959,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
icsk->icsk_probes_out,
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
@@ -3000,13 +3019,17 @@ out:
}
#ifdef CONFIG_BPF_SYSCALL
+union bpf_tcp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
struct bpf_tcp_iter_state {
struct tcp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
- struct sock **batch;
- bool st_bucket_done;
+ union bpf_tcp_iter_batch_item *batch;
};
struct bpf_iter__tcp {
@@ -3029,21 +3052,32 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
- while (iter->cur_sk < iter->end_sk)
- sock_gen_put(iter->batch[iter->cur_sk++]);
+ union bpf_tcp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_gen_put(item->sk);
+ item->cookie = cookie;
+ }
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
- unsigned int new_batch_sz)
+ unsigned int new_batch_sz, gfp_t flags)
{
- struct sock **new_batch;
+ union bpf_tcp_iter_batch_item *new_batch;
new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
- GFP_USER | __GFP_NOWARN);
+ flags | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
- bpf_iter_tcp_put_batch(iter);
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
@@ -3051,112 +3085,234 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
return 0;
}
-static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
- struct sock *start_sk)
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+ union bpf_tcp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ sk_nulls_for_each_from(sk, node)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = listening_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ ++st->bucket;
+ sk = listening_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = established_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ sk = established_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ sk = bpf_iter_tcp_resume_listening(seq);
+ if (sk)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ sk = bpf_iter_tcp_resume_established(seq);
+ break;
+ }
+
+ return sk;
+}
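
The rewritten BPF TCP iterator above resumes by cookie rather than by position: the cookies of not-yet-shown sockets are saved when a batch is released, and the bucket is rescanned for the first surviving one. A minimal sketch of that idea over a plain singly linked list; the struct, field names and helper below are made up for illustration and are not kernel code:

struct node {
	unsigned long long cookie;	/* stable identifier, like a socket cookie */
	struct node *next;
};

static struct node *resume_from_cookies(struct node *head,
					const unsigned long long *cookies,
					int n_cookies)
{
	struct node *n;
	int i;

	/* Prefer earlier cookies; fall back to later ones if the
	 * corresponding nodes were removed in the meantime.
	 */
	for (i = 0; i < n_cookies; i++)
		for (n = head; n; n = n->next)
			if (n->cookie == cookies[i])
				return n;

	return NULL;	/* nothing survived: caller moves to the next bucket */
}
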
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
- sock_hold(start_sk);
- iter->batch[iter->end_sk++] = start_sk;
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
- sk = sk_nulls_next(start_sk);
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
}
expected++;
}
}
- spin_unlock(&hinfo->lhash2[st->bucket].lock);
return expected;
}
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
- struct sock *start_sk)
+ struct sock **start_sk)
{
- struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
- struct tcp_iter_state *st = &iter->state;
struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
- sock_hold(start_sk);
- iter->batch[iter->end_sk++] = start_sk;
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
- sk = sk_nulls_next(start_sk);
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
}
expected++;
}
}
- spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
return expected;
}
-static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ return bpf_iter_tcp_listening_batch(seq, start_sk);
+ else
+ return bpf_iter_tcp_established_batch(seq, start_sk);
+}
+
+static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ else
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
unsigned int expected;
- bool resized = false;
struct sock *sk;
+ int err;
- /* The st->bucket is done. Directly advance to the next
- * bucket instead of having the tcp_seek_last_pos() to skip
- * one by one in the current bucket and eventually find out
- * it has to advance to the next bucket.
- */
- if (iter->st_bucket_done) {
- st->offset = 0;
- st->bucket++;
- if (st->state == TCP_SEQ_STATE_LISTENING &&
- st->bucket > hinfo->lhash2_mask) {
- st->state = TCP_SEQ_STATE_ESTABLISHED;
- st->bucket = 0;
- }
- }
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
-again:
- /* Get a new batch */
- iter->cur_sk = 0;
- iter->end_sk = 0;
- iter->st_bucket_done = false;
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
+
+ /* Batch size was too small. */
+ bpf_iter_tcp_unlock_bucket(seq);
+ bpf_iter_tcp_put_batch(iter);
+ err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
- sk = tcp_seek_last_pos(seq);
+ sk = bpf_iter_tcp_resume(seq);
if (!sk)
return NULL; /* Done */
- if (st->state == TCP_SEQ_STATE_LISTENING)
- expected = bpf_iter_tcp_listening_batch(seq, sk);
- else
- expected = bpf_iter_tcp_established_batch(seq, sk);
-
- if (iter->end_sk == expected) {
- iter->st_bucket_done = true;
- return sk;
- }
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
- if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
- resized = true;
- goto again;
+ /* Batch size was still too small. Hold onto the lock while we try
+ * again with a larger batch to make sure the current bucket's size
+ * does not change in the meantime.
+ */
+ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
+ if (err) {
+ bpf_iter_tcp_unlock_bucket(seq);
+ return ERR_PTR(err);
}
- return sk;
+ expected = bpf_iter_fill_batch(seq, &sk);
+ WARN_ON_ONCE(iter->end_sk != expected);
+done:
+ bpf_iter_tcp_unlock_bucket(seq);
+ return iter->batch[0].sk;
}
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
@@ -3186,16 +3342,11 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
* meta.seq_num is used instead.
*/
st->num++;
- /* Move st->offset to the next sk in the bucket such that
- * the future start() will resume at st->offset in
- * st->bucket. See tcp_seek_last_pos().
- */
- st->offset++;
- sock_gen_put(iter->batch[iter->cur_sk++]);
+ sock_gen_put(iter->batch[iter->cur_sk++].sk);
}
if (iter->cur_sk < iter->end_sk)
- sk = iter->batch[iter->cur_sk];
+ sk = iter->batch[iter->cur_sk].sk;
else
sk = bpf_iter_tcp_batch(seq);
@@ -3232,9 +3383,9 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
const struct request_sock *req = v;
uid = from_kuid_munged(seq_user_ns(seq),
- sock_i_uid(req->rsk_listener));
+ sk_uid(req->rsk_listener));
} else {
- uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
}
meta.seq = seq;
@@ -3261,10 +3412,8 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
(void)tcp_prog_seq_show(prog, &meta, v, 0);
}
- if (iter->cur_sk < iter->end_sk) {
+ if (iter->cur_sk < iter->end_sk)
bpf_iter_tcp_put_batch(iter);
- iter->st_bucket_done = false;
- }
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
@@ -3377,7 +3526,7 @@ struct proto tcp_prot = {
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
@@ -3457,6 +3606,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
@@ -3480,8 +3630,8 @@ static int __net_init tcp_sk_init(struct net *net)
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of 16 TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
+ /* Default TSQ limit of 4 MB */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
@@ -3530,6 +3680,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_pingpong_thresh = 1;
net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
+ net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
return 0;
}
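
The old TSQ default above was expressed as 16 TSO segments of 64 KiB, the new one as a plain byte count. A throwaway standalone check of the arithmetic, just to make the before/after values concrete:

	#include <stdio.h>

	int main(void)
	{
		long old_limit = 16 * 65536;	/* old comment: 16 TSO segments */
		long new_limit = 4 << 20;	/* new comment: 4 MB            */

		printf("old default: %ld bytes (%ld MiB)\n", old_limit, old_limit >> 20);
		printf("new default: %ld bytes (%ld MiB)\n", new_limit, new_limit >> 20);
		return 0;
	}
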
@@ -3580,7 +3731,7 @@ static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
if (err)
return err;
- err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
if (err) {
bpf_iter_fini_seq_net(priv_data);
return err;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 95669935494e..03c068ea27b6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -166,11 +166,11 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
unsigned int hash)
{
struct tcp_metrics_block *tm;
- struct net *net;
bool reclaim = false;
+ struct net *net;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst_dev(dst));
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst_dev(dst));
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst_dev(dst));
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
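
The tcp_metrics hunks keep the same bucket computation, only the netns is now fetched via dev_net_rcu() under the RCU read side. The index itself comes from xor-ing in a per-netns value and taking the top bits of a multiplicative hash. A standalone sketch of that derivation, with hash_32() rewritten from its published formula (GOLDEN_RATIO_32) and a made-up constant standing in for net_hash_mix():

	#include <stdio.h>
	#include <stdint.h>

	#define GOLDEN_RATIO_32 0x61C88647u

	static uint32_t hash_32(uint32_t val, unsigned int bits)
	{
		/* multiplicative hash, keep the top "bits" bits */
		return (val * GOLDEN_RATIO_32) >> (32 - bits);
	}

	int main(void)
	{
		uint32_t addr_hash = 0xc0a80001;	/* pretend per-address hash    */
		uint32_t net_mix   = 0x5eed1234;	/* stand-in for net_hash_mix() */
		unsigned int log   = 10;		/* tcp_metrics_hash_log-like   */
		uint32_t bucket    = hash_32(addr_hash ^ net_mix, log);

		printf("bucket index: %u (of %u)\n", bucket, 1u << log);
		return 0;
	}
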
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bb1fe1ba867a..2994c9222c9c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -44,7 +44,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
- return TCP_TW_ACK;
+ return TCP_TW_ACK_OOW;
}
/* We are rate-limiting, so just release the tw sock and drop skb. */
@@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
- const struct tcphdr *th, u32 *tw_isn)
+ const struct tcphdr *th, u32 *tw_isn,
+ enum skb_drop_reason *drop_reason)
{
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
@@ -157,8 +158,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
rcv_nxt);
if (tmp_opt.saw_tstamp) {
+ u64 ts = tcp_clock_ms();
+
+ WRITE_ONCE(tw->tw_entry_stamp, ts);
WRITE_ONCE(tcptw->tw_ts_recent_stamp,
- ktime_get_seconds());
+ div_u64(ts, MSEC_PER_SEC));
WRITE_ONCE(tcptw->tw_ts_recent,
tmp_opt.rcv_tsval);
}
@@ -242,8 +246,10 @@ kill:
return TCP_TW_SYN;
}
- if (paws_reject)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+ if (paws_reject) {
+ *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
+ }
if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
@@ -261,7 +267,7 @@ kill:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
-EXPORT_SYMBOL(tcp_timewait_state_process);
+EXPORT_IPV6_MOD(tcp_timewait_state_process);
static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
@@ -316,6 +322,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_mark = sk->sk_mark;
tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ /* refreshed when we enter true TIME-WAIT state */
+ tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
@@ -326,6 +334,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_last_oow_ack_time = 0;
tcptw->tw_tx_delay = tp->tcp_tx_delay;
tw->tw_txhash = sk->sk_txhash;
+ tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping;
+#endif
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -389,7 +401,7 @@ void tcp_twsk_destructor(struct sock *sk)
#endif
tcp_ao_destroy_sock(sk, true);
}
-EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list)
{
@@ -448,12 +460,13 @@ void tcp_openreq_init_rwin(struct request_sock *req,
rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
}
-EXPORT_SYMBOL(tcp_openreq_init_rwin);
static void tcp_ecn_openreq_child(struct tcp_sock *tp,
const struct request_sock *req)
{
- tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -483,7 +496,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
tcp_set_ca_state(sk, TCP_CA_Open);
}
-EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child);
static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
struct request_sock *req,
@@ -557,8 +570,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
if (sock_flag(newsk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
+ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
@@ -578,7 +590,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (newtp->rx_opt.tstamp_ok) {
newtp->tcp_usec_ts = treq->req_usec_ts;
- newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
+ newtp->rx_opt.ts_recent = req->ts_recent;
newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
@@ -650,12 +662,14 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- bool fastopen, bool *req_stolen)
+ bool fastopen, bool *req_stolen,
+ enum skb_drop_reason *drop_reason)
{
struct tcp_options_received tmp_opt;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool tsecr_reject = false;
bool paws_reject = false;
bool own_req;
@@ -664,9 +678,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
- if (tmp_opt.rcv_tsecr)
+ tmp_opt.ts_recent = req->ts_recent;
+ if (tmp_opt.rcv_tsecr) {
+ if (inet_rsk(req)->tstamp_ok && !fastopen)
+ tsecr_reject = !between(tmp_opt.rcv_tsecr,
+ tcp_rsk(req)->snt_tsval_first,
+ READ_ONCE(tcp_rsk(req)->snt_tsval_last));
tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
+ }
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
@@ -707,7 +726,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time) &&
- !inet_rtx_syn_ack(sk, req)) {
+ !tcp_rtx_synack(sk, req)) {
unsigned long expires = jiffies;
expires += reqsk_timeout(req, TCP_RTO_MAX);
@@ -781,37 +800,34 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->snt_isn + 1))
return sk;
- /* Also, it would be not so bad idea to check rcv_tsecr, which
- * is essentially ACK extension and too early or too late values
- * should cause reset in unsynchronized states.
- */
-
/* RFC793: "first check sequence number". */
- if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt,
- tcp_rsk(req)->rcv_nxt +
- tcp_synack_window(req))) {
+ if (paws_reject || tsecr_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_rsk(req)->rcv_nxt +
+ tcp_synack_window(req))) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST) &&
!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
- if (paws_reject)
+ if (paws_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ } else if (tsecr_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED);
+ } else {
+ SKB_DR_SET(*drop_reason, TCP_OVERWINDOW);
+ }
return NULL;
}
/* In sequence, PAWS is OK. */
- /* TODO: We probably should defer ts_recent change once
- * we take ownership of @req.
- */
- if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
- WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);
-
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
at tcp_rsk(req)->rcv_isn + 1. */
@@ -860,6 +876,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ if (own_req && tmp_opt.saw_tstamp &&
+ !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+ tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval;
+
if (own_req && rsk_drop_req(req)) {
reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
@@ -872,6 +892,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
+ SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW);
if (sk != req->rsk_listener)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
@@ -901,7 +922,7 @@ embryonic_reset:
}
return NULL;
}
-EXPORT_SYMBOL(tcp_check_req);
+EXPORT_IPV6_MOD(tcp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
@@ -943,4 +964,4 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
sock_put(child);
return reason;
}
-EXPORT_SYMBOL(tcp_child_process);
+EXPORT_IPV6_MOD(tcp_child_process);
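
The tcp_check_req() change above rejects an ACK whose echoed timestamp (TSecr) does not fall inside the range of TSval values actually sent on the SYN-ACK and its retransmissions. A standalone sketch of that check, with between() rewritten from the usual wrap-safe form of TCP sequence arithmetic; the timestamp values are illustrative:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	/* true iff seq2 <= seq1 <= seq3, modulo 2^32 */
	static bool between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
	{
		return seq3 - seq2 >= seq1 - seq2;
	}

	int main(void)
	{
		uint32_t snt_tsval_first = 1000;	/* TSval on the first SYN-ACK */
		uint32_t snt_tsval_last  = 1500;	/* TSval on the last rtx      */

		printf("tsecr 1200 -> reject=%d\n",
		       !between(1200, snt_tsval_first, snt_tsval_last));	/* 0 */
		printf("tsecr 4000 -> reject=%d\n",
		       !between(4000, snt_tsval_first, snt_tsval_last));	/* 1 */
		return 0;
	}
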
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2308665b51c5..be5c2294610e 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -13,12 +13,15 @@
#include <net/tcp.h>
#include <net/protocol.h>
-static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb,
unsigned int seq, unsigned int mss)
{
+ u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP;
+ u32 ts_seq = skb_shinfo(gso_skb)->tskey;
+
while (skb) {
if (before(ts_seq, seq + mss)) {
- skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tx_flags |= flags;
skb_shinfo(skb)->tskey = ts_seq;
return;
}
@@ -139,6 +142,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;
+ bool ecn_cwr_mask;
__wsum delta;
th = tcp_hdr(skb);
@@ -193,11 +197,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th = tcp_hdr(skb);
seq = ntohl(th->seq);
- if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
- tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP))
+ tcp_gso_tstamp(segs, gso_skb, seq, mss);
newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));
+ ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN);
+
while (skb->next) {
th->fin = th->psh = 0;
th->check = newcheck;
@@ -217,7 +223,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th = tcp_hdr(skb);
th->seq = htonl(seq);
- th->cwr = 0;
+
+ th->cwr &= ecn_cwr_mask;
}
/* Following permits TCP Small Queues to work well with GSO :
@@ -325,7 +332,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
th2 = tcp_hdr(p);
flush = (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
- ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ ~(TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
flush |= *(u32 *)((u8 *)th + i) ^
@@ -352,6 +359,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
flush |= skb->ip_summed != p->ip_summed;
flush |= skb->csum_level != p->csum_level;
flush |= NAPI_GRO_CB(p)->count >= 64;
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
if (flush || skb_gro_receive_list(p, skb))
mss = 1;
@@ -401,7 +409,7 @@ void tcp_gro_complete(struct sk_buff *skb)
shinfo->gso_segs = NAPI_GRO_CB(skb)->count;
if (th->cwr)
- shinfo->gso_type |= SKB_GSO_TCP_ECN;
+ shinfo->gso_type |= SKB_GSO_TCP_ACCECN;
}
EXPORT_SYMBOL(tcp_gro_complete);
@@ -425,14 +433,14 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet_get_iif_sdif(skb, &iif, &sdif);
iph = skb_gro_network_header(skb);
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
if (sk)
- sock_put(sk);
+ sock_gen_put(sk);
}
INDIRECT_CALLABLE_SCOPE
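
tcp_gso_tstamp() above now carries the whole set of timestamp tx_flags from the original GSO skb instead of hard-coding SKBTX_SW_TSTAMP; the loop itself still moves the request to the one segment whose sequence range covers tskey. A standalone sketch of that walk, with an array of fake segments and a made-up flag value; before() is the usual signed-difference sequence compare:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	#define TSTAMP_FLAG 0x1u

	struct seg {
		uint32_t seq;
		uint32_t flags;
		uint32_t tskey;
	};

	static bool before(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0;
	}

	static void gso_tstamp(struct seg *segs, int nr, uint32_t ts_seq,
			       uint32_t seq, uint32_t mss, uint32_t flags)
	{
		for (int i = 0; i < nr; i++, seq += mss) {
			if (before(ts_seq, seq + mss)) {
				/* this segment contains ts_seq: tag it */
				segs[i].flags |= flags;
				segs[i].tskey = ts_seq;
				return;
			}
		}
	}

	int main(void)
	{
		struct seg segs[4] = { 0 };

		/* tskey 3100 lands in the third 1000-byte segment [3000, 4000) */
		gso_tstamp(segs, 4, 3100, 1000, 1000, TSTAMP_FLAG);
		for (int i = 0; i < 4; i++)
			printf("seg %d: flags=%#x tskey=%u\n",
			       i, segs[i].flags, segs[i].tskey);
		return 0;
	}
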
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5485a70b5fe5..caf11920a878 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -250,7 +250,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
WRITE_ONCE(*__window_clamp,
min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
-EXPORT_SYMBOL(tcp_select_initial_window);
+EXPORT_IPV6_MOD(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
@@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk)
u32 cur_win, new_win;
/* Make the window 0 if we failed to queue the data because we
- * are out of memory. The window is temporary, so we don't store
- * it on the socket.
+ * are out of memory.
*/
- if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
+ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
+ tp->pred_flags = 0;
+ tp->rcv_wnd = 0;
+ tp->rcv_wup = tp->rcv_nxt;
return 0;
+ }
cur_win = tcp_receive_window(tp);
new_win = __tcp_select_window(sk);
@@ -322,7 +325,7 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
const struct tcp_sock *tp = tcp_sk(sk);
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
- if (!(tp->ecn_flags & TCP_ECN_OK))
+ if (tcp_ecn_disabled(tp))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
else if (tcp_ca_needs_ecn(sk) ||
tcp_bpf_ca_needs_ecn(sk))
@@ -348,7 +351,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
- tp->ecn_flags = TCP_ECN_OK;
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
}
@@ -378,7 +381,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ecn_flags & TCP_ECN_OK) {
+ if (tcp_ecn_mode_rfc3168(tp)) {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
@@ -400,7 +403,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -522,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
}
@@ -567,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
}
@@ -883,8 +888,10 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
unsigned int size;
if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
- opts->options |= OPTION_MPTCP;
- remaining -= size;
+ if (remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
}
}
@@ -936,7 +943,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
tcp_rsk(req)->ts_off;
- opts->tsecr = READ_ONCE(req->ts_recent);
+ if (!tcp_rsk(req)->snt_tsval_first) {
+ if (!opts->tsval)
+ opts->tsval = ~0U;
+ tcp_rsk(req)->snt_tsval_first = opts->tsval;
+ }
+ WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval);
+ opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
@@ -1053,15 +1066,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
* needs to be reallocated in a driver.
* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
*
- * Since transmit from skb destructor is forbidden, we use a tasklet
+ * Since transmit from skb destructor is forbidden, we use a BH work item
* to process all sockets that eventually need to send more skbs.
- * We use one tasklet per cpu, with its own queue of sockets.
+ * We use one work item per cpu, with its own queue of sockets.
*/
-struct tsq_tasklet {
- struct tasklet_struct tasklet;
+struct tsq_work {
+ struct work_struct work;
struct list_head head; /* queue of tcp sockets */
};
-static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+static DEFINE_PER_CPU(struct tsq_work, tsq_work);
static void tcp_tsq_write(struct sock *sk)
{
@@ -1091,14 +1104,14 @@ static void tcp_tsq_handler(struct sock *sk)
bh_unlock_sock(sk);
}
/*
- * One tasklet per cpu tries to send more skbs.
- * We run in tasklet context but need to disable irqs when
+ * One work item per cpu tries to send more skbs.
+ * We run in BH context but need to disable irqs when
* transferring tsq->head because tcp_wfree() might
* interrupt us (non NAPI drivers)
*/
-static void tcp_tasklet_func(struct tasklet_struct *t)
+static void tcp_tsq_workfn(struct work_struct *work)
{
- struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
+ struct tsq_work *tsq = container_of(work, struct tsq_work, work);
LIST_HEAD(list);
unsigned long flags;
struct list_head *q, *n;
@@ -1166,17 +1179,17 @@ void tcp_release_cb(struct sock *sk)
if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
tcp_send_ack(sk);
}
-EXPORT_SYMBOL(tcp_release_cb);
+EXPORT_IPV6_MOD(tcp_release_cb);
-void __init tcp_tasklet_init(void)
+void __init tcp_tsq_work_init(void)
{
int i;
for_each_possible_cpu(i) {
- struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+ struct tsq_work *tsq = &per_cpu(tsq_work, i);
INIT_LIST_HEAD(&tsq->head);
- tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
+ INIT_WORK(&tsq->work, tcp_tsq_workfn);
}
}
@@ -1190,11 +1203,11 @@ void tcp_wfree(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
- struct tsq_tasklet *tsq;
+ struct tsq_work *tsq;
bool empty;
/* Keep one reference on sk_wmem_alloc.
- * Will be released by sk_free() from here or tcp_tasklet_func()
+ * Will be released by sk_free() from here or tcp_tsq_workfn()
*/
WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
@@ -1216,13 +1229,13 @@ void tcp_wfree(struct sk_buff *skb)
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
- /* queue this socket to tasklet queue */
+ /* queue this socket to BH workqueue */
local_irq_save(flags);
- tsq = this_cpu_ptr(&tsq_tasklet);
+ tsq = this_cpu_ptr(&tsq_work);
empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
if (empty)
- tasklet_schedule(&tsq->tasklet);
+ queue_work(system_bh_wq, &tsq->work);
local_irq_restore(flags);
return;
out:
@@ -1382,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
- tcb->tcp_flags);
+ (tcb->tcp_flags & TCPHDR_FLAGS_MASK));
th->check = 0;
th->urg_ptr = 0;
@@ -1541,11 +1554,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
if (tcp_is_reno(tp) && decr > 0)
tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
- if (tp->lost_skb_hint &&
- before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- tp->lost_cnt_hint -= decr;
-
tcp_verify_left_out(tp);
}
@@ -1603,8 +1611,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
struct sk_buff *buff;
int old_factor;
long limit;
+ u16 flags;
int nlen;
- u8 flags;
if (WARN_ON(len > skb->len))
return -EINVAL;
@@ -1778,7 +1786,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
return __tcp_mtu_to_mss(sk, pmtu) -
(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
-EXPORT_SYMBOL(tcp_mtu_to_mss);
+EXPORT_IPV6_MOD(tcp_mtu_to_mss);
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
@@ -1808,7 +1816,6 @@ void tcp_mtup_init(struct sock *sk)
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
-EXPORT_SYMBOL(tcp_mtup_init);
/* This function synchronize snd mss to current pmtu/exthdr set.
@@ -1852,7 +1859,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
return mss_now;
}
-EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_IPV6_MOD(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
@@ -2159,7 +2166,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
{
int nlen = skb->len - len;
struct sk_buff *buff;
- u8 flags;
+ u16 flags;
/* All of a TSO frame must be composed of paged data. */
DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
@@ -2607,9 +2614,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
- if (sk->sk_pacing_status == SK_PACING_NONE)
- limit = min_t(unsigned long, limit,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
+ limit = min_t(unsigned long, limit,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
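
The tcp_small_queue_check() hunk above makes the per-netns byte cap apply even when pacing is enabled, rather than only for unpaced sockets. A standalone sketch of the resulting limit computation with made-up inputs; the pacing shift of 10 matches the usual default, but treat all numbers as illustrative:

	#include <stdio.h>

	int main(void)
	{
		unsigned long truesize = 65536;			/* skb->truesize        */
		unsigned long pacing_rate = 12500000;		/* ~100 Mbit/s in B/s   */
		unsigned long pacing_shift = 10;		/* rate / 1024          */
		unsigned long sysctl_limit = 4UL << 20;		/* 4 MB default above   */
		unsigned long factor = 0;
		unsigned long limit;

		limit = 2 * truesize;
		if (pacing_rate >> pacing_shift > limit)
			limit = pacing_rate >> pacing_shift;
		limit = limit < sysctl_limit ? limit : sysctl_limit;	/* new: always capped */
		limit <<= factor;

		printf("allowed unacked bytes in qdisc/device: %lu\n", limit);
		return 0;
	}
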
@@ -2628,7 +2634,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send skb if rtx queue is empty or has one skb.
* No need to wait for TX completion to call us back,
- * after softirq/tasklet schedule.
+ * after softirq schedule.
* This helps when TX completions are delayed too much.
*/
if (tcp_rtx_queue_empty_or_single_skb(sk))
@@ -2906,7 +2912,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
- tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true);
return true;
}
@@ -3241,7 +3247,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* changed transmit queue under us so clear hints */
- tcp_clear_retrans_hints_partial(tp);
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
@@ -3325,8 +3330,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- if (skb_still_in_host_queue(sk, skb))
- return -EBUSY;
+ if (skb_still_in_host_queue(sk, skb)) {
+ err = -EBUSY;
+ goto out;
+ }
start:
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
@@ -3337,14 +3344,19 @@ start:
}
if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
WARN_ON_ONCE(1);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
+ }
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) {
+ err = -ENOMEM;
+ goto out;
}
- if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
- return -ENOMEM;
}
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
- return -EHOSTUNREACH; /* Routing failure or similar. */
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH; /* Routing failure or similar. */
+ goto out;
+ }
cur_mss = tcp_current_mss(sk);
avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
@@ -3355,8 +3367,10 @@ start:
* our retransmit of one segment serves as a zero window probe.
*/
if (avail_wnd <= 0) {
- if (TCP_SKB_CB(skb)->seq != tp->snd_una)
- return -EAGAIN;
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una) {
+ err = -EAGAIN;
+ goto out;
+ }
avail_wnd = cur_mss;
}
@@ -3368,11 +3382,15 @@ start:
}
if (skb->len > len) {
if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
- cur_mss, GFP_ATOMIC))
- return -ENOMEM; /* We'll try again later. */
+ cur_mss, GFP_ATOMIC)) {
+ err = -ENOMEM; /* We'll try again later. */
+ goto out;
+ }
} else {
- if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
- return -ENOMEM;
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) {
+ err = -ENOMEM;
+ goto out;
+ }
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
@@ -3426,17 +3444,16 @@ start:
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
- if (likely(!err)) {
- trace_tcp_retransmit_skb(sk, skb);
- } else if (err != -EBUSY) {
+ if (unlikely(err) && err != -EBUSY)
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
- }
/* To avoid taking spuriously low RTT samples based on a timestamp
* for a transmit that never happened, always mark EVER_RETRANS
*/
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+out:
+ trace_tcp_retransmit_skb(sk, skb, err);
return err;
}
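
The __tcp_retransmit_skb() changes above convert every early return into "err = ...; goto out;" so that the single exit point can record the outcome, and the retransmit tracepoint now carries err as well. A standalone sketch of that single-exit error-path idiom, with a printf standing in for the tracepoint and the failure conditions reduced to two booleans:

	#include <stdio.h>
	#include <errno.h>

	static void trace_retransmit(int err)
	{
		printf("retransmit attempt, err=%d\n", err);
	}

	static int retransmit(int busy, int routable)
	{
		int err = 0;

		if (busy) {
			err = -EBUSY;
			goto out;
		}
		if (!routable) {
			err = -EHOSTUNREACH;
			goto out;
		}
		/* ... the actual transmit would happen here ... */
	out:
		trace_retransmit(err);	/* every path reports its outcome */
		return err;
	}

	int main(void)
	{
		retransmit(1, 1);	/* err=-EBUSY        */
		retransmit(0, 0);	/* err=-EHOSTUNREACH */
		retransmit(0, 1);	/* err=0             */
		return 0;
	}
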
@@ -3540,8 +3557,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
}
if (rearm_timer)
tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ inet_csk(sk)->icsk_rto, true);
}
/* We allow to exceed memory limits for FIN packets to expedite
@@ -3848,7 +3864,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
return skb;
}
-EXPORT_SYMBOL(tcp_make_synack);
+EXPORT_IPV6_MOD(tcp_make_synack);
static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
@@ -4158,8 +4174,8 @@ int tcp_connect(struct sock *sk)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, false);
return 0;
}
EXPORT_SYMBOL(tcp_connect);
@@ -4168,7 +4184,7 @@ u32 tcp_delack_max(const struct sock *sk)
{
u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1;
- return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min);
+ return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min);
}
/* Send out a delayed ack, the caller does the policy checking
@@ -4214,22 +4230,21 @@ void tcp_send_delayed_ack(struct sock *sk)
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer is about to expire, send ACK now. */
- if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, icsk->icsk_ack.timeout))
- timeout = icsk->icsk_ack.timeout;
+ if (!time_before(timeout, icsk_delack_timeout(icsk)))
+ timeout = icsk_delack_timeout(icsk);
}
smp_store_release(&icsk->icsk_ack.pending,
icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
- icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
-void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
{
struct sk_buff *buff;
@@ -4248,17 +4263,17 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
unsigned long delay;
delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
- if (delay < TCP_RTO_MAX)
+ if (delay < tcp_rto_max(sk))
icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
@@ -4273,7 +4288,7 @@ EXPORT_SYMBOL_GPL(__tcp_send_ack);
void tcp_send_ack(struct sock *sk)
{
- __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0);
}
/* This routine sends a packet with an out of date sequence
@@ -4388,7 +4403,7 @@ void tcp_send_probe0(struct sock *sk)
if (err <= 0) {
if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
- timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
+ timeout = tcp_probe0_when(sk, tcp_rto_max(sk));
} else {
/* If packet was not sent due to local congestion,
* Let senders fight for local resources conservatively.
@@ -4397,7 +4412,7 @@ void tcp_send_probe0(struct sock *sk)
}
timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
- tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
@@ -4422,7 +4437,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
tcp_sk_rw(sk)->total_retrans++;
}
trace_tcp_retransmit_synack(sk, req);
+ req->num_retrans++;
}
return res;
}
-EXPORT_SYMBOL(tcp_rtx_synack);
+EXPORT_IPV6_MOD(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index bba10110fbbc..c52fd3254b6e 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -35,7 +35,7 @@ s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}
-/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+/* RACK loss detection (IETF RFC8985):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b412ed88ccd9..a207877270fb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset)
shift++;
/* If some dubious ICMP arrived, penalize even more. */
@@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk,
{
unsigned int linear_backoff_thresh, timeout;
- linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
+ linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base);
if (boundary <= linear_backoff_thresh)
timeout = ((2 << boundary) - 1) * rto_base;
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
- (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ (boundary - linear_backoff_thresh) * tcp_rto_max(sk);
return jiffies_to_msecs(timeout);
}
/**
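
tcp_model_timeout() above now caps the exponential part of the backoff with the per-socket tcp_rto_max(sk) instead of the compile-time constant; beyond that threshold the modelled timeout grows linearly. A standalone sketch of the formula in arbitrary tick units, with ilog2() open-coded for the sketch:

	#include <stdio.h>

	static unsigned int ilog2(unsigned int v)
	{
		unsigned int r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	static unsigned int model_timeout(unsigned int boundary,
					  unsigned int rto_base,
					  unsigned int rto_max)
	{
		unsigned int thresh = ilog2(rto_max / rto_base);

		if (boundary <= thresh)
			return ((2 << boundary) - 1) * rto_base;	/* pure doubling */
		return ((2 << thresh) - 1) * rto_base +
		       (boundary - thresh) * rto_max;			/* then linear   */
	}

	int main(void)
	{
		/* e.g. rto_base = 1 tick, rto_max = 120 ticks (the classic 120 s cap) */
		for (unsigned int n = 1; n <= 8; n++)
			printf("%u retransmissions -> %u ticks\n",
			       n, model_timeout(n, 1, 120));
		return 0;
	}
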
@@ -268,7 +268,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < tcp_rto_max(sk);
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -322,8 +322,9 @@ void tcp_delack_timer_handler(struct sock *sk)
if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
return;
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+ if (time_after(icsk_delack_timeout(icsk), jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk_delack_timeout(icsk));
return;
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
@@ -358,7 +359,7 @@ void tcp_delack_timer_handler(struct sock *sk)
static void tcp_delack_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_delack_timer);
+ timer_container_of(icsk, t, icsk_delack_timer);
struct sock *sk = &icsk->icsk_inet.sk;
/* Avoid taking socket spinlock if there is no ACK to send.
@@ -416,7 +417,8 @@ static void tcp_probe_timer(struct sock *sk)
}
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ unsigned int rto_max = tcp_rto_max(sk);
+ const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
@@ -476,13 +478,13 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
* regular retransmit because if the child socket has been accepted
* it's not good to give up too easily.
*/
- inet_rtx_syn_ack(sk, req);
+ tcp_rtx_synack(sk, req);
req->num_timeout++;
tcp_update_rto_stats(sk);
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_time_stamp_ts(tp);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- req->timeout << req->num_timeout, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ req->timeout << req->num_timeout, false);
}
static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
@@ -492,7 +494,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
const struct tcp_sock *tp = tcp_sk(sk);
- int timeout = TCP_RTO_MAX * 2;
+ int timeout = tcp_rto_max(sk) * 2;
s32 rcv_delta;
if (user_timeout) {
@@ -508,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
* and tp->rcv_tstamp might very well have been written recently.
* rcv_delta can thus be negative.
*/
- rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp;
+ rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp;
if (rcv_delta <= timeout)
return false;
@@ -626,9 +628,9 @@ void tcp_retransmit_timer(struct sock *sk)
/* Retransmission failed because of local congestion,
* Let senders fight for local resources conservatively.
*/
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- TCP_RESOURCE_PROBE_INTERVAL,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_RESOURCE_PROBE_INTERVAL,
+ false);
goto out;
}
@@ -665,7 +667,7 @@ out_reset_timer:
icsk->icsk_backoff = 0;
icsk->icsk_rto = clamp(__tcp_set_rto(tp),
tcp_rto_min(sk),
- TCP_RTO_MAX);
+ tcp_rto_max(sk));
} else if (sk->sk_state != TCP_SYN_SENT ||
tp->total_rto >
READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
@@ -673,10 +675,10 @@ out_reset_timer:
* activated.
*/
icsk->icsk_backoff++;
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk));
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), false);
if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
__sk_dst_reset(sk);
@@ -684,7 +686,8 @@ out:;
}
/* Called with bottom-half processing disabled.
- Called by tcp_write_timer() */
+ * Called by tcp_write_timer() and tcp_release_cb().
+ */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -694,11 +697,11 @@ void tcp_write_timer_handler(struct sock *sk)
!icsk->icsk_pending)
return;
- if (time_after(icsk->icsk_timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+ if (time_after(icsk_timeout(icsk), jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ icsk_timeout(icsk));
return;
}
-
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;
@@ -723,7 +726,7 @@ void tcp_write_timer_handler(struct sock *sk)
static void tcp_write_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_retransmit_timer);
+ timer_container_of(icsk, t, icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
/* Avoid locking the socket when there is no pending event. */
@@ -749,7 +752,17 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
-EXPORT_SYMBOL(tcp_syn_ack_timeout);
+EXPORT_IPV6_MOD(tcp_syn_ack_timeout);
+
+void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+static void tcp_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
void tcp_set_keepalive(struct sock *sk, int val)
{
@@ -757,16 +770,15 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- inet_csk_delete_keepalive_timer(sk);
+ tcp_delete_keepalive_timer(sk);
}
-EXPORT_SYMBOL_GPL(tcp_set_keepalive);
-
+EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
-static void tcp_keepalive_timer (struct timer_list *t)
+static void tcp_keepalive_timer(struct timer_list *t)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
+ struct sock *sk = timer_container_of(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
@@ -775,7 +787,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- inet_csk_reset_keepalive_timer (sk, HZ/20);
+ tcp_reset_keepalive_timer(sk, HZ/20);
goto out;
}
@@ -841,7 +853,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
}
resched:
- inet_csk_reset_keepalive_timer (sk, elapsed);
+ tcp_reset_keepalive_timer(sk, elapsed);
goto out;
death:
@@ -884,11 +896,9 @@ void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
- hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED_SOFT);
- tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
+ hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
- hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED_SOFT);
- tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
+ hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6a01905d379f..cc3ce0f762ec 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -93,6 +93,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
+#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -119,15 +120,13 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
+#include <net/rps.h>
struct udp_table udp_table __read_mostly;
-EXPORT_SYMBOL(udp_table);
long sysctl_udp_mem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_mem);
+EXPORT_IPV6_MOD(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
-EXPORT_SYMBOL(udp_memory_allocated);
DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
@@ -144,8 +143,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned long *bitmap,
struct sock *sk, unsigned int log)
{
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
@@ -157,7 +156,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
inet_rcv_saddr_equal(sk, sk2, true)) {
if (sk2->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(uid, sock_i_uid(sk2))) {
+ uid_eq(uid, sk_uid(sk2))) {
if (!bitmap)
return 0;
} else {
@@ -179,8 +178,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
struct sock *sk)
{
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
@@ -194,7 +193,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
inet_rcv_saddr_equal(sk, sk2, true)) {
if (sk2->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(uid, sock_i_uid(sk2))) {
+ uid_eq(uid, sk_uid(sk2))) {
res = 0;
} else {
res = 1;
@@ -209,7 +208,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
struct net *net = sock_net(sk);
- kuid_t uid = sock_i_uid(sk);
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
sk_for_each(sk2, &hslot->head) {
@@ -219,7 +218,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
@@ -352,7 +351,7 @@ fail_unlock:
fail:
return error;
}
-EXPORT_SYMBOL(udp_lib_get_port);
+EXPORT_IPV6_MOD(udp_lib_get_port);
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
@@ -418,7 +417,50 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
return __inet_ehashfn(laddr, lport, faddr, fport,
udp_ehash_secret + net_hash_mix(net));
}
-EXPORT_SYMBOL(udp_ehashfn);
+EXPORT_IPV6_MOD(udp_ehashfn);
+
+/**
+ * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
+ */
+static struct sock *udp4_lib_lookup1(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ const struct udp_table *udptable)
+{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
+ struct sock *sk, *result = NULL;
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(const struct net *net,
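
udp4_lib_lookup1() above is a plain "best score wins" scan of the primary-hash (port-only) bucket. A standalone sketch of that pattern; the socket array and scoring function here are illustrative stand-ins, while the real compute_score() weighs address, port, interface and family matches and returns a negative score on mismatch:

	#include <stdio.h>

	struct fake_sk {
		const char *name;
		int bound_addr_match;	/* 1 if sk_rcv_saddr matches daddr */
		int bound_dev_match;	/* 1 if bound to the right ifindex */
	};

	static int compute_score(const struct fake_sk *sk)
	{
		int score = 1;		/* port already matched via the bucket */

		if (sk->bound_addr_match)
			score += 4;
		if (sk->bound_dev_match)
			score += 2;
		return score;
	}

	int main(void)
	{
		struct fake_sk bucket[] = {
			{ "wildcard",   0, 0 },
			{ "addr-bound", 1, 0 },
			{ "dev-bound",  0, 1 },
		};
		const struct fake_sk *result = NULL;
		int badness = 0;

		for (unsigned int i = 0; i < 3; i++) {
			int score = compute_score(&bucket[i]);

			if (score > badness) {
				result = &bucket[i];
				badness = score;
			}
		}
		printf("best match: %s (score %d)\n", result->name, badness);
		return 0;
	}
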
@@ -533,7 +575,7 @@ begin:
return NULL;
}
-/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */
+/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */
static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
u16 newhash4)
{
@@ -582,15 +624,13 @@ void udp_lib_hash4(struct sock *sk, u16 hash)
struct net *net = sock_net(sk);
struct udp_table *udptable;
- /* Connected udp socket can re-connect to another remote address,
- * so rehash4 is needed.
+ /* Connected udp socket can re-connect to another remote address, which
+ * will be handled by rehash. Thus no need to redo hash4 here.
*/
- udptable = net->ipv4.udp_table;
- if (udp_hashed4(sk)) {
- udp_rehash4(udptable, sk, hash);
+ if (udp_hashed4(sk))
return;
- }
+ udptable = net->ipv4.udp_table;
hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
hslot4 = udp_hashslot4(udptable, hash);
@@ -612,7 +652,7 @@ void udp_lib_hash4(struct sock *sk, u16 hash)
spin_unlock_bh(&hslot->lock);
}
-EXPORT_SYMBOL(udp_lib_hash4);
+EXPORT_IPV6_MOD(udp_lib_hash4);
/* call with sock lock */
void udp4_hash4(struct sock *sk)
@@ -628,7 +668,7 @@ void udp4_hash4(struct sock *sk)
udp_lib_hash4(sk, hash);
}
-EXPORT_SYMBOL(udp4_hash4);
+EXPORT_IPV6_MOD(udp4_hash4);
#endif /* CONFIG_BASE_SMALL */
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
@@ -683,6 +723,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Primary hash (destination port) lookup as fallback for this race:
+ * 1. __ip4_datagram_connect() sets sk_rcv_saddr
+ * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
+ * 3. rehash operation updating _secondary and four-tuple_ hashes
+ * The primary hash doesn't need an update after 1., so, thanks to this
+ * further step, 1. and 3. don't need to be atomic against the lookup.
+ */
+ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
done:
if (IS_ERR(result))
return NULL;
@@ -755,11 +808,11 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
}
DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-EXPORT_SYMBOL(udp_encap_needed_key);
+EXPORT_IPV6_MOD(udp_encap_needed_key);
#if IS_ENABLED(CONFIG_IPV6)
DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-EXPORT_SYMBOL(udpv6_encap_needed_key);
+EXPORT_IPV6_MOD(udpv6_encap_needed_key);
#endif
void udp_encap_enable(void)
@@ -987,7 +1040,7 @@ void udp_flush_pending_frames(struct sock *sk)
ip_flush_pending_frames(sk);
}
}
-EXPORT_SYMBOL(udp_flush_pending_frames);
+EXPORT_IPV6_MOD(udp_flush_pending_frames);
/**
* udp4_hwcsum - handle outgoing HW checksumming
@@ -1087,9 +1140,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
- if (hlen + cork->gso_size > cork->fragsize) {
+ if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
kfree_skb(skb);
- return -EINVAL;
+ return -EMSGSIZE;
}
if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
kfree_skb(skb);
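
The udp_send_skb() hunk above relaxes the UDP_SEGMENT sanity check: a final datagram shorter than gso_size no longer trips it, and the failure is reported as -EMSGSIZE instead of -EINVAL. A standalone sketch of the check with illustrative sizes:

	#include <stdio.h>
	#include <errno.h>

	static int check_gso(unsigned int hlen, unsigned int datalen,
			     unsigned int gso_size, unsigned int fragsize)
	{
		unsigned int seg = datalen < gso_size ? datalen : gso_size;

		if (hlen + seg > fragsize)
			return -EMSGSIZE;
		return 0;
	}

	int main(void)
	{
		unsigned int hlen = 20 + 8;		/* IPv4 + UDP headers */

		/* 1400-byte segments fit into a 1500-byte fragment size */
		printf("%d\n", check_gso(hlen, 10000, 1400, 1500));	/* 0         */
		/* 1480-byte segments do not: 28 + 1480 > 1500 */
		printf("%d\n", check_gso(hlen, 10000, 1480, 1500));	/* -EMSGSIZE */
		/* a short final datagram smaller than gso_size is fine */
		printf("%d\n", check_gso(hlen, 100, 1480, 1500));	/* 0         */
		return 0;
	}
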
@@ -1175,7 +1228,7 @@ out:
WRITE_ONCE(up->pending, 0);
return err;
}
-EXPORT_SYMBOL(udp_push_pending_frames);
+EXPORT_IPV6_MOD(udp_push_pending_frames);
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
@@ -1212,7 +1265,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
return need_ip;
}
-EXPORT_SYMBOL_GPL(udp_cmsg_send);
+EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -1227,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int free = 0;
int connected = 0;
__be32 daddr, faddr, saddr;
- u8 tos, scope;
+ u8 scope;
__be16 dport;
int err, is_udplite = IS_UDPLITE(sk);
int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
@@ -1351,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = get_rttos(&ipc, inet);
scope = ip_sendmsg_scope(inet, &ipc, msg);
if (scope == RT_SCOPE_LINK)
connected = 0;
@@ -1388,9 +1440,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope,
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
sk->sk_protocol, flow_flags, faddr, saddr,
- dport, inet->inet_sport, sk->sk_uid);
+ dport, inet->inet_sport,
+ sk_uid(sk));
security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1507,7 +1561,7 @@ void udp_splice_eof(struct socket *sock)
udp_push_pending_frames(sk);
release_sock(sk);
}
-EXPORT_SYMBOL_GPL(udp_splice_eof);
+EXPORT_IPV6_MOD_GPL(udp_splice_eof);
#define UDP_SKB_IS_STATELESS 0x80000000
@@ -1572,12 +1626,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb)
}
/* fully reclaim rmem/fwd memory allocated for skb */
-static void udp_rmem_release(struct sock *sk, int size, int partial,
- bool rx_queue_lock_held)
+static void udp_rmem_release(struct sock *sk, unsigned int size,
+ int partial, bool rx_queue_lock_held)
{
struct udp_sock *up = udp_sk(sk);
struct sk_buff_head *sk_queue;
- int amt;
+ unsigned int amt;
if (likely(partial)) {
up->forward_deficit += size;
@@ -1597,10 +1651,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (!rx_queue_lock_held)
spin_lock(&sk_queue->lock);
-
- sk_forward_alloc_add(sk, size);
- amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
- sk_forward_alloc_add(sk, -amt);
+ amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+ sk_forward_alloc_add(sk, size - amt);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
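
The udp_rmem_release() hunk above computes the page-aligned amount to return to the memory accounting in one step, instead of first crediting the full size to sk_forward_alloc and then carving the aligned part back off. A standalone sketch of the arithmetic with made-up values:

	#include <stdio.h>

	#define PAGE_SIZE 4096u

	int main(void)
	{
		unsigned int fwd_alloc = 1000;	/* current sk_forward_alloc    */
		unsigned int size = 10000;	/* bytes being released        */
		unsigned int partial = 1;	/* keep a little slack around  */
		unsigned int amt;

		/* whole pages that can go back to the global accounting */
		amt = (size + fwd_alloc - partial) & ~(PAGE_SIZE - 1);
		/* only the remainder stays as per-socket forward allocation */
		fwd_alloc += size - amt;

		printf("returned to memory accounting: %u bytes (%u pages)\n",
		       amt, amt / PAGE_SIZE);
		printf("kept as forward allocation:    %u bytes\n", fwd_alloc);
		return 0;
	}
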
@@ -1624,7 +1676,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
prefetch(&skb->data);
udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
-EXPORT_SYMBOL(udp_skb_destructor);
+EXPORT_IPV6_MOD(udp_skb_destructor);
/* as above, but the caller held the rx queue lock, too */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
@@ -1672,18 +1724,25 @@ static int udp_rmem_schedule(struct sock *sk, int size)
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
- int rmem, err = -ENOMEM;
+ unsigned int rmem, rcvbuf;
spinlock_t *busy = NULL;
- bool becomes_readable;
- int size, rcvbuf;
+ int size, err = -ENOMEM;
- /* Immediately drop when the receive queue is full.
- * Always allow at least one packet.
- */
rmem = atomic_read(&sk->sk_rmem_alloc);
rcvbuf = READ_ONCE(sk->sk_rcvbuf);
- if (rmem > rcvbuf)
- goto drop;
+ size = skb->truesize;
+
+ /* Immediately drop when the receive queue is full.
+ * Cast to unsigned int performs the boundary check for INT_MAX.
+ */
+ if (rmem + size > rcvbuf) {
+ if (rcvbuf > INT_MAX >> 1)
+ goto drop;
+
+ /* Always allow at least one packet for small buffer. */
+ if (rmem > rcvbuf)
+ goto drop;
+ }
/* Under mem pressure, it might be helpful to help udp_recvmsg()
* having linear skbs :
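
The admission check in __udp_enqueue_schedule_skb() above now counts the incoming skb's truesize before comparing against sk_rcvbuf, dropping when the queue plus this packet would overflow, while still letting a small buffer hold at least one packet. A standalone sketch of that decision, mirroring the unsigned arithmetic of the patch:

	#include <stdio.h>
	#include <stdbool.h>
	#include <limits.h>

	static bool should_drop(unsigned int rmem, unsigned int size,
				unsigned int rcvbuf)
	{
		if (rmem + size > rcvbuf) {
			if (rcvbuf > INT_MAX >> 1)
				return true;
			/* small buffer: always allow at least one packet */
			if (rmem > rcvbuf)
				return true;
		}
		return false;
	}

	int main(void)
	{
		/* 0: first packet into a small buffer is still accepted */
		printf("%d\n", should_drop(0, 3000, 2048));
		/* 1: queue already over rcvbuf */
		printf("%d\n", should_drop(3000, 3000, 2048));
		/* 0: would overshoot, but the queue is not over rcvbuf yet */
		printf("%d\n", should_drop(200000, 70000, 212992));
		return 0;
	}
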
@@ -1693,10 +1752,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
*/
if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
-
+ size = skb->truesize;
busy = busylock_acquire(sk);
}
- size = skb->truesize;
+
udp_set_dev_scratch(skb);
atomic_add(size, &sk->sk_rmem_alloc);
@@ -1715,19 +1774,12 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
*/
sock_skb_set_dropcount(sk, skb);
- becomes_readable = skb_queue_empty(list);
__skb_queue_tail(list, skb);
spin_unlock(&list->lock);
- if (!sock_flag(sk, SOCK_DEAD)) {
- if (becomes_readable ||
- sk->sk_data_ready != sock_def_readable ||
- READ_ONCE(sk->sk_peek_off) >= 0)
- INDIRECT_CALL_1(sk->sk_data_ready,
- sock_def_readable, sk);
- else
- sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
- }
+ if (!sock_flag(sk, SOCK_DEAD))
+ INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
+
busylock_release(busy);
return 0;
@@ -1739,7 +1791,7 @@ drop:
busylock_release(busy);
return err;
}
-EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
void udp_destruct_common(struct sock *sk)
{
@@ -1755,7 +1807,7 @@ void udp_destruct_common(struct sock *sk)
}
udp_rmem_release(sk, total, 0, true);
}
-EXPORT_SYMBOL_GPL(udp_destruct_common);
+EXPORT_IPV6_MOD_GPL(udp_destruct_common);
static void udp_destruct_sock(struct sock *sk)
{
@@ -1786,11 +1838,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
skb_release_head_state(skb);
__consume_stateless_skb(skb);
}
-EXPORT_SYMBOL_GPL(skb_consume_udp);
+EXPORT_IPV6_MOD_GPL(skb_consume_udp);
static struct sk_buff *__first_packet_length(struct sock *sk,
struct sk_buff_head *rcvq,
- int *total)
+ unsigned int *total)
{
struct sk_buff *skb;
@@ -1803,7 +1855,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
*total += skb->truesize;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
} else {
udp_skb_csum_unnecessary_set(skb);
break;
@@ -1823,8 +1875,8 @@ static int first_packet_length(struct sock *sk)
{
struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+ unsigned int total = 0;
struct sk_buff *skb;
- int total = 0;
int res;
spin_lock_bh(&rcvq->lock);
@@ -1868,7 +1920,7 @@ int udp_ioctl(struct sock *sk, int cmd, int *karg)
return 0;
}
-EXPORT_SYMBOL(udp_ioctl);
+EXPORT_IPV6_MOD(udp_ioctl);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
int *off, int *err)
@@ -1891,8 +1943,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
error = -EAGAIN;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb) {
if (!(flags & MSG_PEEK))
udp_skb_destructor(sk, skb);
@@ -1913,8 +1965,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb && !(flags & MSG_PEEK))
udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
@@ -1957,14 +2009,14 @@ try_again:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
__UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
goto try_again;
}
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
return recv_actor(sk, skb);
}
-EXPORT_SYMBOL(udp_read_skb);
+EXPORT_IPV6_MOD(udp_read_skb);
/*
* This should be easy, if there is something there we
@@ -2072,7 +2124,7 @@ csum_copy_err:
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -2091,7 +2143,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
}
-EXPORT_SYMBOL(udp_pre_connect);
+EXPORT_IPV6_MOD(udp_pre_connect);
static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -2140,7 +2192,7 @@ int udp_disconnect(struct sock *sk, int flags)
release_sock(sk);
return 0;
}
-EXPORT_SYMBOL(udp_disconnect);
+EXPORT_IPV6_MOD(udp_disconnect);
void udp_lib_unhash(struct sock *sk)
{
@@ -2148,6 +2200,7 @@ void udp_lib_unhash(struct sock *sk)
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
+ sock_rps_delete_flow(sk);
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -2170,7 +2223,7 @@ void udp_lib_unhash(struct sock *sk)
spin_unlock_bh(&hslot->lock);
}
}
-EXPORT_SYMBOL(udp_lib_unhash);
+EXPORT_IPV6_MOD(udp_lib_unhash);
/*
* inet_rcv_saddr was changed, we must rehash secondary hash
@@ -2181,14 +2234,14 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2;
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
nhslot2 = udp_hashslot2(udptable, newhash);
udp_sk(sk)->udp_portaddr_hash = newhash;
if (hslot2 != nhslot2 ||
rcu_access_pointer(sk->sk_reuseport_cb)) {
- hslot = udp_hashslot(udptable, sock_net(sk),
- udp_sk(sk)->udp_port_hash);
/* we must lock primary chain too */
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
@@ -2207,24 +2260,34 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
spin_unlock(&nhslot2->lock);
}
- if (udp_hashed4(sk)) {
- udp_rehash4(udptable, sk, newhash4);
+ spin_unlock_bh(&hslot->lock);
+ }
- if (hslot2 != nhslot2) {
- spin_lock(&hslot2->lock);
- udp_hash4_dec(hslot2);
- spin_unlock(&hslot2->lock);
+ /* Now process hash4 if necessary:
+ * (1) update hslot4;
+ * (2) update hslot2->hash4_cnt.
+ * Note that hslot2/hslot4 should be checked separately, as
+ * either of them may change with the other unchanged.
+ */
+ if (udp_hashed4(sk)) {
+ spin_lock_bh(&hslot->lock);
- spin_lock(&nhslot2->lock);
- udp_hash4_inc(nhslot2);
- spin_unlock(&nhslot2->lock);
- }
+ udp_rehash4(udptable, sk, newhash4);
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ udp_hash4_inc(nhslot2);
+ spin_unlock(&nhslot2->lock);
}
+
spin_unlock_bh(&hslot->lock);
}
}
}
-EXPORT_SYMBOL(udp_lib_rehash);
+EXPORT_IPV6_MOD(udp_lib_rehash);
void udp_v4_rehash(struct sock *sk)
{
@@ -2284,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
- int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
@@ -2373,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
- drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
goto drop;
- }
udp_csum_pull_header(skb);
@@ -2429,7 +2490,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
}
return false;
}
-EXPORT_SYMBOL(udp_sk_rx_dst_set);
+EXPORT_IPV6_MOD(udp_sk_rx_dst_set);
/*
* Multicasts and broadcasts go to each listener.
@@ -2836,20 +2897,40 @@ void udp_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (udp_test_bit(ENCAP_ENABLED, sk))
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
static_branch_dec(&udp_encap_needed_key);
+ udp_tunnel_cleanup_gro(sk);
+ }
}
}
+typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
struct sock *sk)
{
#ifdef CONFIG_XFRM
+ udp_gro_receive_t new_gro_receive;
+
if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
- if (family == AF_INET)
- WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv);
- else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
- WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv);
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv;
+ else
+ new_gro_receive = xfrm4_gro_udp_encap_rcv;
+
+ if (udp_sk(sk)->gro_receive != new_gro_receive) {
+ /*
+ * With IPV6_ADDRFORM the gro callback could change
+			 * after being set; unregister the old one, if valid.
+ */
+ if (udp_sk(sk)->gro_receive)
+ udp_tunnel_update_gro_rcv(sk, false);
+
+ WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive);
+ udp_tunnel_update_gro_rcv(sk, true);
+ }
}
#endif
}
@@ -2899,6 +2980,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_ENCAP:
+ sockopt_lock_sock(sk);
switch (val) {
case 0:
#ifdef CONFIG_XFRM
@@ -2922,6 +3004,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
err = -ENOPROTOOPT;
break;
}
+ sockopt_release_sock(sk);
break;
case UDP_NO_CHECK6_TX:
@@ -2939,13 +3022,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_GRO:
-
+ sockopt_lock_sock(sk);
/* when enabling GRO, accept the related GSO packet type */
if (valbool)
udp_tunnel_encap_enable(sk);
udp_assign_bit(GRO_ENABLED, sk, valbool);
udp_assign_bit(ACCEPT_L4, sk, valbool);
set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
+ sockopt_release_sock(sk);
break;
/*
@@ -2985,7 +3069,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
return err;
}
-EXPORT_SYMBOL(udp_lib_setsockopt);
+EXPORT_IPV6_MOD(udp_lib_setsockopt);
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
@@ -3056,7 +3140,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(udp_lib_getsockopt);
+EXPORT_IPV6_MOD(udp_lib_getsockopt);
int udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
@@ -3098,7 +3182,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
return mask;
}
-EXPORT_SYMBOL(udp_poll);
+EXPORT_IPV6_MOD(udp_poll);
int udp_abort(struct sock *sk, int err)
{
@@ -3121,7 +3205,7 @@ out:
return 0;
}
-EXPORT_SYMBOL_GPL(udp_abort);
+EXPORT_IPV6_MOD_GPL(udp_abort);
struct proto udp_prot = {
.name = "UDP",
@@ -3147,7 +3231,7 @@ struct proto udp_prot = {
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
- .memory_allocated = &udp_memory_allocated,
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
.per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
.sysctl_mem = sysctl_udp_mem,
@@ -3255,7 +3339,7 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos)
return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL(udp_seq_start);
+EXPORT_IPV6_MOD(udp_seq_start);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
@@ -3269,7 +3353,7 @@ void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL(udp_seq_next);
+EXPORT_IPV6_MOD(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
@@ -3281,7 +3365,7 @@ void udp_seq_stop(struct seq_file *seq, void *v)
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
}
-EXPORT_SYMBOL(udp_seq_stop);
+EXPORT_IPV6_MOD(udp_seq_stop);
/* ------------------------------------------------------------------------ */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
@@ -3299,7 +3383,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
sk_wmem_alloc_get(sp),
udp_rqueue_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
atomic_read(&sp->sk_drops));
@@ -3329,34 +3413,55 @@ struct bpf_iter__udp {
int bucket __aligned(8);
};
+union bpf_udp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
struct bpf_udp_iter_state {
struct udp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
- int offset;
- struct sock **batch;
- bool st_bucket_done;
+ union bpf_udp_iter_batch_item *batch;
};
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz);
+ unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+ union bpf_udp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct sock *sk = NULL;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ udp_portaddr_for_each_entry_from(sk)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ goto done;
+ }
+done:
+ return sk;
+}
+
static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
+ unsigned int find_cookie, end_cookie;
struct net *net = seq_file_net(seq);
- int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
- bool resized = false;
+ int resume_bucket;
+ int resizes = 0;
struct sock *sk;
+ int err = 0;
resume_bucket = state->bucket;
- resume_offset = iter->offset;
/* The current batch is done, so advance the bucket. */
- if (iter->st_bucket_done)
+ if (iter->cur_sk == iter->end_sk)
state->bucket++;
udptable = udp_get_table_seq(seq, net);
@@ -3369,62 +3474,89 @@ again:
* before releasing the bucket lock. This allows BPF programs that are
* called in seq_show to acquire the bucket lock if needed.
*/
+ find_cookie = iter->cur_sk;
+ end_cookie = iter->end_sk;
iter->cur_sk = 0;
iter->end_sk = 0;
- iter->st_bucket_done = false;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
- continue;
+ goto next_bucket;
- iter->offset = 0;
spin_lock_bh(&hslot2->lock);
- udp_portaddr_for_each_entry(sk, &hslot2->head) {
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+ /* Resume from the first (in iteration order) unseen socket from
+ * the last batch that still exists in resume_bucket. Most of
+ * the time this will just be where the last iteration left off
+ * in resume_bucket unless that socket disappeared between
+ * reads.
+ */
+ if (state->bucket == resume_bucket)
+ sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
if (seq_sk_match(seq, sk)) {
- /* Resume from the last iterated socket at the
- * offset in the bucket before iterator was stopped.
- */
- if (state->bucket == resume_bucket &&
- iter->offset < resume_offset) {
- ++iter->offset;
- continue;
- }
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
}
batch_sks++;
}
}
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1].sk;
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
+next_bucket:
+ resizes = 0;
}
- /* All done: no batch made. */
- if (!iter->end_sk)
- return NULL;
-
- if (iter->end_sk == batch_sks) {
- /* Batching is done for the current bucket; return the first
- * socket to be iterated from the batch.
- */
- iter->st_bucket_done = true;
- goto done;
- }
- if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
- resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- goto again;
- }
-done:
- return iter->batch[0];
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0].sk : NULL;
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3435,16 +3567,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* Whenever seq_next() is called, the iter->cur_sk is
* done with seq_show(), so unref the iter->cur_sk.
*/
- if (iter->cur_sk < iter->end_sk) {
- sock_put(iter->batch[iter->cur_sk++]);
- ++iter->offset;
- }
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++].sk);
/* After updating iter->cur_sk, check if there are more sockets
* available in the current bucket batch.
*/
if (iter->cur_sk < iter->end_sk)
- sk = iter->batch[iter->cur_sk];
+ sk = iter->batch[iter->cur_sk].sk;
else
/* Prepare a new batch. */
sk = bpf_iter_udp_batch(seq);
@@ -3496,7 +3626,7 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
goto unlock;
}
- uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
@@ -3508,8 +3638,19 @@ unlock:
static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
- while (iter->cur_sk < iter->end_sk)
- sock_put(iter->batch[iter->cur_sk++]);
+ union bpf_udp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_put(item->sk);
+ item->cookie = cookie;
+ }
}
static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
@@ -3525,10 +3666,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
}
- if (iter->cur_sk < iter->end_sk) {
+ if (iter->cur_sk < iter->end_sk)
bpf_iter_udp_put_batch(iter);
- iter->st_bucket_done = false;
- }
}
static const struct seq_operations bpf_iter_udp_seq_ops = {
@@ -3560,7 +3699,7 @@ const struct seq_operations udp_seq_ops = {
.stop = udp_seq_stop,
.show = udp4_seq_show,
};
-EXPORT_SYMBOL(udp_seq_ops);
+EXPORT_IPV6_MOD(udp_seq_ops);
static struct udp_seq_afinfo udp4_seq_afinfo = {
.family = AF_INET,
@@ -3749,6 +3888,15 @@ fallback:
static int __net_init udp_pernet_init(struct net *net)
{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ int i;
+
+ /* No tunnel is configured */
+ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
+ INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
+ RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
+ }
+#endif
udp_sysctl_init(net);
udp_set_table(net);
@@ -3770,16 +3918,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz)
+ unsigned int new_batch_sz, gfp_t flags)
{
- struct sock **new_batch;
+ union bpf_udp_iter_batch_item *new_batch;
new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
- GFP_USER | __GFP_NOWARN);
+ flags | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
- bpf_iter_udp_put_batch(iter);
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
@@ -3798,10 +3949,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
if (ret)
return ret;
- ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
if (ret)
bpf_iter_fini_seq_net(priv_data);
+ iter->state.bucket = -1;
+
return ret;
}
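
The bpf_iter batching rework above stops tracking a numeric offset into the bucket and instead stores, in the batch array itself, the cookies of the sockets that were not yet shown; on the next read it resumes at the first remembered cookie still present in the bucket, which stays correct when sockets are added or removed between reads. The userspace sketch below illustrates the resume-by-cookie idea on a plain singly linked list; the types and names are invented for the example and are not kernel APIs:

#include <stdint.h>
#include <stdio.h>

struct node {
	uint64_t cookie;	/* stable identity, like sock_gen_cookie() */
	struct node *next;
};

/* Resume iteration at the first remembered cookie that still exists
 * in the list; return NULL if every remembered element vanished.
 */
static struct node *resume_from_cookies(struct node *head,
					const uint64_t *cookies, int n_cookies)
{
	for (int i = 0; i < n_cookies; i++)
		for (struct node *n = head; n; n = n->next)
			if (n->cookie == cookies[i])
				return n;
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	uint64_t saved[] = { 7, 2 };	/* cookie 7 vanished; resume at 2 */
	struct node *hit = resume_from_cookies(&a, saved, 2);

	printf("resume at cookie %llu\n",
	       hit ? (unsigned long long)hit->cookie : 0ULL);
	return 0;
}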
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e1ff3a375996..c7142213fc21 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP4_IMPL_H
#define _UDP4_IMPL_H
+#include <net/aligned_data.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/protocol.h>
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a5be6e4ed326..b1f3fd302e9d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -12,6 +12,162 @@
#include <net/udp.h>
#include <net/protocol.h>
#include <net/inet_common.h>
+#include <net/udp_tunnel.h>
+
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+
+/*
+ * Dummy GRO tunnel callback; it exists mainly to avoid dangling/NULL
+ * values for the udp tunnel static call.
+ */
+static struct sk_buff *dummy_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+struct udp_tunnel_type_entry {
+ udp_tunnel_gro_rcv_t gro_receive;
+ refcount_t count;
+};
+
+#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \
+ IS_ENABLED(CONFIG_VXLAN) * 2 + \
+ IS_ENABLED(CONFIG_NET_FOU) * 2 + \
+ IS_ENABLED(CONFIG_XFRM) * 2)
+
+DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);
+static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call);
+static DEFINE_MUTEX(udp_tunnel_gro_type_lock);
+static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES];
+static unsigned int udp_tunnel_gro_type_nr;
+static DEFINE_SPINLOCK(udp_tunnel_gro_lock);
+
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ struct udp_sock *tup, *up = udp_sk(sk);
+ struct udp_tunnel_gro *udp_tunnel_gro;
+
+ spin_lock(&udp_tunnel_gro_lock);
+ udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6];
+ if (add)
+ hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list);
+ else if (up->tunnel_list.pprev)
+ hlist_del_init(&up->tunnel_list);
+
+ if (udp_tunnel_gro->list.first &&
+ !udp_tunnel_gro->list.first->next) {
+ tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock,
+ tunnel_list);
+
+ rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup);
+ } else {
+ RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL);
+ }
+
+ spin_unlock(&udp_tunnel_gro_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup);
+
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add)
+{
+ struct udp_tunnel_type_entry *cur = NULL;
+ struct udp_sock *up = udp_sk(sk);
+ int i, old_gro_type_nr;
+
+ if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive)
+ return;
+
+ mutex_lock(&udp_tunnel_gro_type_lock);
+
+ /* Check if the static call is permanently disabled. */
+ if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES)
+ goto out;
+
+ for (i = 0; i < udp_tunnel_gro_type_nr; i++)
+ if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive)
+ cur = &udp_tunnel_gro_types[i];
+
+ old_gro_type_nr = udp_tunnel_gro_type_nr;
+ if (add) {
+ /*
+ * Update the matching entry, if found, or add a new one
+ * if needed
+ */
+ if (cur) {
+ refcount_inc(&cur->count);
+ goto out;
+ }
+
+ if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) {
+ pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n");
+ /* Ensure static call will never be enabled */
+ udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1;
+ } else {
+ cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++];
+ refcount_set(&cur->count, 1);
+ cur->gro_receive = up->gro_receive;
+ }
+ } else {
+ /*
+	 * The stack only cleans up tunnels it successfully added,
+	 * so the lookup on removal should never fail.
+ */
+ if (WARN_ON_ONCE(!cur))
+ goto out;
+
+ if (!refcount_dec_and_test(&cur->count))
+ goto out;
+
+		/* Avoid gaps, so that the enabled tunnel always has id 0 */
+ *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr];
+ }
+
+ if (udp_tunnel_gro_type_nr == 1) {
+ static_call_update(udp_tunnel_gro_rcv,
+ udp_tunnel_gro_types[0].gro_receive);
+ static_branch_enable(&udp_tunnel_static_call);
+ } else if (old_gro_type_nr == 1) {
+ static_branch_disable(&udp_tunnel_static_call);
+ static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv);
+ }
+
+out:
+ mutex_unlock(&udp_tunnel_gro_type_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv);
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ if (static_branch_likely(&udp_tunnel_static_call)) {
+ if (unlikely(gro_recursion_inc_test(skb))) {
+ NAPI_GRO_CB(skb)->flush |= 1;
+ return NULL;
+ }
+ return static_call(udp_tunnel_gro_rcv)(sk, head, skb);
+ }
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#else
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#endif
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
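
The new udp_tunnel_update_gro_rcv()/udp_tunnel_gro_rcv() pair above optimizes the common case of a single registered tunnel type: a static key gates a static call that is patched to the one gro_receive handler, and everything else falls back to the per-socket indirect call. A minimal kernel-style sketch of that pattern under demo_* names; it mirrors the shape of the code above rather than reproducing it:

#include <linux/static_call.h>
#include <linux/jump_label.h>

typedef int (*demo_handler_t)(int arg);

static int demo_default_handler(int arg)
{
	return -1;	/* no tunnel type registered */
}

DEFINE_STATIC_CALL(demo_call, demo_default_handler);
static DEFINE_STATIC_KEY_FALSE(demo_single_type);

/* Called with a registration lock held: enable the direct call only
 * while exactly one handler is registered, otherwise disable it.
 */
static void demo_update(demo_handler_t handler, unsigned int nr_types,
			unsigned int old_nr_types)
{
	if (nr_types == 1) {
		static_call_update(demo_call, handler);
		static_branch_enable(&demo_single_type);
	} else if (old_nr_types == 1) {
		static_branch_disable(&demo_single_type);
		static_call_update(demo_call, demo_default_handler);
	}
}

static int demo_dispatch(int arg, demo_handler_t per_socket_cb)
{
	if (static_branch_likely(&demo_single_type))
		return static_call(demo_call)(arg);
	return per_socket_cb(arg);	/* slow path: indirect call */
}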
@@ -61,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
!need_ipsec &&
@@ -247,6 +403,62 @@ static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
return segs;
}
+static void __udpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct udphdr *uh = udp_hdr(seg);
+
+ if (ipv6_addr_equal(oldip, newip) && *oldport == newport)
+ return;
+
+ if (uh->check) {
+ inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32,
+ newip->s6_addr32, true);
+
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, newport,
+ false);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+
+ *oldip = *newip;
+ *oldport = newport;
+}
+
+static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct ipv6hdr *iph;
+ const struct udphdr *uh;
+ struct ipv6hdr *iph2;
+ struct sk_buff *seg;
+ struct udphdr *uh2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ipv6_hdr(seg);
+ uh2 = udp_hdr(seg->next);
+ iph2 = ipv6_hdr(seg->next);
+
+ if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) &&
+ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ipv6_hdr(seg);
+
+ __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &uh2->source, uh->source);
+ __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &uh2->dest, uh->dest);
+ }
+
+ return segs;
+}
+
static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
netdev_features_t features,
bool is_ipv6)
@@ -259,7 +471,10 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
- return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb);
+ if (is_ipv6)
+ return __udpv6_gso_segment_list_csum(skb);
+ else
+ return __udpv4_gso_segment_list_csum(skb);
}
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
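
__udpv4_gso_segment_list_csum() and the new IPv6 counterpart above rewrite addresses and ports in place and repair the UDP checksum incrementally with inet_proto_csum_replace16()/inet_proto_csum_replace2() instead of recomputing it over the payload. The arithmetic behind those helpers is the RFC 1624 ones'-complement update; a self-contained userspace sketch for a single 16-bit field (the values are arbitrary, and real headers hold these fields in network byte order):

#include <stdint.h>
#include <stdio.h>

/* RFC 1624: given the old checksum and a 16-bit field changing from
 * old_val to new_val, compute the new checksum without touching the
 * rest of the packet: HC' = ~(~HC + ~m + m').
 */
static uint16_t demo_csum_replace(uint16_t check, uint16_t old_val,
				  uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* e.g. rewriting a UDP port from 4789 to 6081 */
	uint16_t check = 0x1c46;

	printf("0x%04x\n", demo_csum_replace(check, 4789, 6081));
	return 0;
}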
@@ -273,6 +488,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
bool copy_dtor;
__sum16 check;
__be16 newlen;
+ int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
@@ -301,6 +517,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size)
return __udp_gso_segment_list(gso_skb, features, is_ipv6);
+ ret = __skb_linearize(gso_skb);
+ if (ret)
+ return ERR_PTR(ret);
+
/* Setup csum, as fraglist skips this in udp4_gro_receive. */
gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head;
gso_skb->csum_offset = offsetof(struct udphdr, check);
@@ -321,13 +541,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
/* clear destructor to avoid skb_segment assigning it to tail */
copy_dtor = gso_skb->destructor == sock_wfree;
- if (copy_dtor)
+ if (copy_dtor) {
gso_skb->destructor = NULL;
+ gso_skb->sk = NULL;
+ }
segs = skb_segment(gso_skb, features);
if (IS_ERR_OR_NULL(segs)) {
- if (copy_dtor)
+ if (copy_dtor) {
gso_skb->destructor = sock_wfree;
+ gso_skb->sk = sk;
+ }
return segs;
}
@@ -536,6 +760,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
ret = skb_gro_receive_list(p, skb);
} else {
skb_gro_postpull_rcsum(skb, uh,
@@ -618,7 +843,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+ pp = udp_tunnel_gro_rcv(sk, head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
@@ -630,9 +855,14 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport)
{
const struct iphdr *iph = skb_gro_network_header(skb);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
int iif, sdif;
+ sk = udp_tunnel_sk(net, false);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
inet_get_iif_sdif(skb, &iif, &sdif);
return __udp4_lib_lookup(net, iph->saddr, sport,
@@ -763,5 +993,6 @@ int __init udpv4_offload_init(void)
.gro_complete = udp4_gro_complete,
},
};
+
return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
}
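
udp_tunnel_update_gro_lookup() keeps a per-netns cached socket pointer that is only published while exactly one wildcard tunnel socket is registered, and udp4_gro_lookup_skb() uses it to skip the full socket lookup when the destination port matches. The sketch below shows that invariant, publishing the fast-path pointer only when the list has exactly one entry, on a plain list in userspace C; the types and names are invented for the example:

#include <stddef.h>

struct demo_sock { int port; struct demo_sock *next; };

struct demo_registry {
	struct demo_sock *list;		/* all registered tunnel sockets */
	struct demo_sock *fast_sk;	/* valid only when list has one entry */
};

/* Re-derive the fast-path pointer after every add/remove: it may only
 * be used when it unambiguously identifies the single tunnel socket.
 */
static void demo_update_fast_path(struct demo_registry *reg)
{
	if (reg->list && !reg->list->next)
		reg->fast_sk = reg->list;
	else
		reg->fast_sk = NULL;
}

static struct demo_sock *demo_lookup(struct demo_registry *reg, int dport)
{
	struct demo_sock *sk = reg->fast_sk;

	if (sk && sk->port == dport)
		return sk;		/* fast path: the only tunnel socket */
	for (sk = reg->list; sk; sk = sk->next)
		if (sk->port == dport)
			return sk;	/* slow path: full scan */
	return NULL;
}

int main(void)
{
	struct demo_sock s = { .port = 4789, .next = NULL };
	struct demo_registry reg = { .list = &s, .fast_sk = NULL };

	demo_update_fast_path(&reg);
	return demo_lookup(&reg, 4789) == &s ? 0 : 1;
}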
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 619a53eb672d..fce945f23069 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -58,6 +58,15 @@ error:
}
EXPORT_SYMBOL(udp_sock_create4);
+static bool sk_saddr_any(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#else
+ return !sk->sk_rcv_saddr;
+#endif
+}
+
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
struct udp_tunnel_sock_cfg *cfg)
{
@@ -80,6 +89,12 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->gro_complete = cfg->gro_complete;
udp_tunnel_encap_enable(sk);
+
+ udp_tunnel_update_gro_rcv(sk, true);
+
+ if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) &&
+ sk->sk_kern_sock)
+ udp_tunnel_update_gro_lookup(net, sk, true);
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
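
setup_udp_tunnel_sock() now also wires the socket into the shared GRO receive dispatch and, for unconnected kernel sockets bound to the wildcard address with no bound device, into the GRO lookup fast path. A hedged sketch of how a tunnel driver typically reaches this path; the field names follow the udp_tunnel API as commonly used, and the port number and callbacks are placeholders:

#include <net/udp_tunnel.h>

/* Hypothetical encap receive hook for the example: consume the skb. */
static int demo_encap_rcv(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static int demo_open_tunnel(struct net *net, struct socket **sockp)
{
	struct udp_port_cfg port_cfg = {
		.family		= AF_INET,
		.local_udp_port	= htons(4789),	/* arbitrary example port */
	};
	struct udp_tunnel_sock_cfg tnl_cfg = {
		.encap_type	= 1,		/* as commonly passed by tunnel drivers */
		.encap_rcv	= demo_encap_rcv,
	};
	int err;

	err = udp_sock_create(net, &port_cfg, sockp);
	if (err)
		return err;

	/* Registers the encap callbacks and, after this patch, the GRO
	 * receive/lookup state for the new socket.
	 */
	setup_udp_tunnel_sock(net, *sockp, &tnl_cfg);
	return 0;
}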
@@ -119,15 +134,17 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
struct udp_tunnel_info ti;
struct net_device *dev;
+ ASSERT_RTNL();
+
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
udp_tunnel_nic_add_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
}
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
@@ -139,22 +156,24 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
struct udp_tunnel_info ti;
struct net_device *dev;
+ ASSERT_RTNL();
+
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
udp_tunnel_nic_del_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
}
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck)
+ bool xnet, bool nocheck, u16 ipcb_flags)
{
struct udphdr *uh;
@@ -170,7 +189,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
udp_set_csum(nocheck, skb, src, dst, skb->len);
- iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
+ iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet,
+ ipcb_flags);
}
EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index b6d2d16189c0..ff66db48453c 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -29,6 +29,7 @@ struct udp_tunnel_nic_table_entry {
* struct udp_tunnel_nic - UDP tunnel port offload state
* @work: async work for talking to hardware from process context
* @dev: netdev pointer
+ * @lock: protects all fields
* @need_sync: at least one port start changed
* @need_replay: space was freed, we need a replay of all ports
* @work_pending: @work is currently scheduled
@@ -41,6 +42,8 @@ struct udp_tunnel_nic {
struct net_device *dev;
+ struct mutex lock;
+
u8 need_sync:1;
u8 need_replay:1;
u8 work_pending:1;
@@ -298,22 +301,11 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
static void
udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
{
- const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
- bool may_sleep;
-
if (!utn->need_sync)
return;
- /* Drivers which sleep in the callback need to update from
- * the workqueue, if we come from the tunnel driver's notification.
- */
- may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
- if (!may_sleep)
- __udp_tunnel_nic_device_sync(dev, utn);
- if (may_sleep || utn->need_replay) {
- queue_work(udp_tunnel_nic_workqueue, &utn->work);
- utn->work_pending = 1;
- }
+ queue_work(udp_tunnel_nic_workqueue, &utn->work);
+ utn->work_pending = 1;
}
static bool
@@ -554,12 +546,12 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
struct udp_tunnel_nic *utn;
unsigned int i, j;
- ASSERT_RTNL();
-
utn = dev->udp_tunnel_nic;
if (!utn)
return;
+ mutex_lock(&utn->lock);
+
utn->need_sync = false;
for (i = 0; i < utn->n_tables; i++)
for (j = 0; j < info->tables[i].n_entries; j++) {
@@ -569,7 +561,7 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
- /* We don't release rtnl across ops */
+ /* We don't release utn lock across ops */
WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
if (!entry->use_cnt)
continue;
@@ -579,6 +571,8 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
}
__udp_tunnel_nic_device_sync(dev, utn);
+
+ mutex_unlock(&utn->lock);
}
static size_t
@@ -643,6 +637,33 @@ err_cancel:
return -EMSGSIZE;
}
+static void __udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ lockdep_assert_held(&utn->lock);
+}
+
+static void __udp_tunnel_nic_lock(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ mutex_lock(&utn->lock);
+}
+
+static void __udp_tunnel_nic_unlock(struct net_device *dev)
+{
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ if (utn)
+ mutex_unlock(&utn->lock);
+}
+
static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
.get_port = __udp_tunnel_nic_get_port,
.set_port_priv = __udp_tunnel_nic_set_port_priv,
@@ -651,6 +672,9 @@ static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
.reset_ntf = __udp_tunnel_nic_reset_ntf,
.dump_size = __udp_tunnel_nic_dump_size,
.dump_write = __udp_tunnel_nic_dump_write,
+ .assert_locked = __udp_tunnel_nic_assert_locked,
+ .lock = __udp_tunnel_nic_lock,
+ .unlock = __udp_tunnel_nic_unlock,
};
static void
@@ -710,11 +734,15 @@ static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
container_of(work, struct udp_tunnel_nic, work);
rtnl_lock();
+ mutex_lock(&utn->lock);
+
utn->work_pending = 0;
__udp_tunnel_nic_device_sync(utn->dev, utn);
if (utn->need_replay)
udp_tunnel_nic_replay(utn->dev, utn);
+
+ mutex_unlock(&utn->lock);
rtnl_unlock();
}
@@ -730,6 +758,7 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
return NULL;
utn->n_tables = n_tables;
INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);
+ mutex_init(&utn->lock);
for (i = 0; i < n_tables; i++) {
utn->entries[i] = kcalloc(info->tables[i].n_entries,
@@ -821,8 +850,11 @@ static int udp_tunnel_nic_register(struct net_device *dev)
dev_hold(dev);
dev->udp_tunnel_nic = utn;
- if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) {
+ udp_tunnel_nic_lock(dev);
udp_tunnel_get_rx_info(dev);
+ udp_tunnel_nic_unlock(dev);
+ }
return 0;
}
@@ -832,6 +864,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
{
const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ udp_tunnel_nic_lock(dev);
+
/* For a shared table remove this dev from the list of sharing devices
* and if there are other devices just detach.
*/
@@ -841,8 +875,10 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
list_for_each_entry(node, &info->shared->devices, list)
if (node->dev == dev)
break;
- if (list_entry_is_head(node, &info->shared->devices, list))
+ if (list_entry_is_head(node, &info->shared->devices, list)) {
+ udp_tunnel_nic_unlock(dev);
return;
+ }
list_del(&node->list);
kfree(node);
@@ -852,6 +888,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
if (first) {
udp_tunnel_drop_rx_info(dev);
utn->dev = first->dev;
+ udp_tunnel_nic_unlock(dev);
goto release_dev;
}
@@ -862,6 +899,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
* from the work which we will boot immediately.
*/
udp_tunnel_nic_flush(dev, utn);
+ udp_tunnel_nic_unlock(dev);
/* Wait for the work to be done using the state, netdev core will
* retry unregister until we give up our reference on this device.
@@ -910,12 +948,16 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
return NOTIFY_DONE;
if (event == NETDEV_UP) {
+ udp_tunnel_nic_lock(dev);
WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
udp_tunnel_get_rx_info(dev);
+ udp_tunnel_nic_unlock(dev);
return NOTIFY_OK;
}
if (event == NETDEV_GOING_DOWN) {
+ udp_tunnel_nic_lock(dev);
udp_tunnel_nic_flush(dev, utn);
+ udp_tunnel_nic_unlock(dev);
return NOTIFY_OK;
}
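
udp_tunnel_nic state is now protected by a per-device mutex surfaced through the .lock/.unlock/.assert_locked ops rather than by the RTNL, so callers such as the NETDEV_UP and NETDEV_GOING_DOWN notifiers bracket udp_tunnel_get_rx_info() and udp_tunnel_nic_flush() with the device lock. A minimal kernel-style sketch of the guarded-state pattern, with demo_* names standing in for the real structures:

#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/types.h>

struct demo_offload_state {
	struct mutex lock;	/* protects dirty and the port tables */
	bool dirty;
};

/* The lock wrappers tolerate a missing state pointer, like the
 * __udp_tunnel_nic_lock()/unlock() helpers above.
 */
static void demo_state_lock(struct demo_offload_state *st)
{
	if (st)
		mutex_lock(&st->lock);
}

static void demo_state_unlock(struct demo_offload_state *st)
{
	if (st)
		mutex_unlock(&st->lock);
}

static void demo_mark_for_sync(struct demo_offload_state *st)
{
	lockdep_assert_held(&st->lock);	/* mirrors .assert_locked */
	st->dirty = true;
}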
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index af37af3ab727..d3e621a11a1a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -60,7 +60,7 @@ struct proto udplite_prot = {
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
- .memory_allocated = &udp_memory_allocated,
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
.per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
.sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index b5b06323cfd9..f28cfd88eaf5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -182,11 +182,15 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
int offset = skb_gro_offset(skb);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
- int ret;
-
- offset = offset - sizeof(struct udphdr);
+ int len, dlen;
+ __u8 *udpdata;
+ __be32 *udpdata32;
- if (!pskb_pull(skb, offset))
+ len = skb->len - offset;
+ dlen = offset + min(len, 8);
+ udpdata = skb_gro_header(skb, dlen, offset);
+ udpdata32 = (__be32 *)udpdata;
+ if (unlikely(!udpdata))
return NULL;
rcu_read_lock();
@@ -194,11 +198,13 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
if (!ops || !ops->callbacks.gro_receive)
goto out;
- ret = __xfrm4_udp_encap_rcv(sk, skb, false);
- if (ret)
+ /* check if it is a keepalive or IKE packet */
+ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
goto out;
- skb_push(skb, offset);
+ /* set the transport header to ESP */
+ skb_set_transport_header(skb, offset);
+
NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
@@ -208,7 +214,6 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
out:
rcu_read_unlock();
- skb_push(skb, offset);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 1;
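
The rewritten xfrm4_gro_udp_encap_rcv() no longer pulls the UDP header; it peeks at up to 8 bytes of payload with skb_gro_header() and gives up on NAT-T keepalives and IKE packets (payload no larger than an ESP header, or a zero non-ESP marker), only then pointing the transport header at ESP for GRO. A userspace sketch of that classification, assuming RFC 3948 framing (a 1-byte 0xff keepalive, a 4-byte zero non-ESP marker for IKE, otherwise ESP with the SPI first):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

enum demo_encap_kind { DEMO_KEEPALIVE, DEMO_IKE, DEMO_ESP, DEMO_RUNT };

/* Classify the payload following the UDP header of a UDP-encapsulated
 * IPsec datagram. ESP needs at least SPI + sequence number (8 bytes)
 * before GRO is worthwhile.
 */
static enum demo_encap_kind demo_classify(const uint8_t *data, size_t len)
{
	uint32_t marker;

	if (len == 1 && data[0] == 0xff)
		return DEMO_KEEPALIVE;		/* NAT-keepalive */
	if (len <= 8)
		return DEMO_RUNT;		/* too short for ESP */
	memcpy(&marker, data, sizeof(marker));
	if (marker == 0)
		return DEMO_IKE;		/* non-ESP marker */
	return DEMO_ESP;			/* first 4 bytes are the SPI */
}

int main(void)
{
	const uint8_t ike[] = { 0, 0, 0, 0, 1, 2, 3, 4, 5 };

	return demo_classify(ike, sizeof(ike)) == DEMO_IKE ? 0 : 1;
}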
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 3cff51ba72bb..0ae67d537499 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -31,7 +31,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, skb->dev, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst_dev(skb),
__xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}