path: root/net/ipv4
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c  12
-rw-r--r--  net/ipv4/bpf_tcp_ca.c  41
-rw-r--r--  net/ipv4/devinet.c  21
-rw-r--r--  net/ipv4/esp4.c  4
-rw-r--r--  net/ipv4/fib_semantics.c  12
-rw-r--r--  net/ipv4/fib_trie.c  4
-rw-r--r--  net/ipv4/fou.c  10
-rw-r--r--  net/ipv4/icmp.c  3
-rw-r--r--  net/ipv4/igmp.c  30
-rw-r--r--  net/ipv4/inet_connection_sock.c  3
-rw-r--r--  net/ipv4/ip_gre.c  2
-rw-r--r--  net/ipv4/ip_output.c  39
-rw-r--r--  net/ipv4/ip_sockglue.c  24
-rw-r--r--  net/ipv4/ip_tunnel.c  9
-rw-r--r--  net/ipv4/ip_vti.c  2
-rw-r--r--  net/ipv4/ipip.c  2
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c  23
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c  56
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c  24
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c  19
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c  20
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c  21
-rw-r--r--  net/ipv4/netfilter/iptable_security.c  23
-rw-r--r--  net/ipv4/route.c  79
-rw-r--r--  net/ipv4/tcp.c  5
-rw-r--r--  net/ipv4/tcp_fastopen.c  20
-rw-r--r--  net/ipv4/tcp_input.c  54
-rw-r--r--  net/ipv4/tcp_ipv4.c  411
-rw-r--r--  net/ipv4/tcp_output.c  3
-rw-r--r--  net/ipv4/tcp_recovery.c  3
-rw-r--r--  net/ipv4/udp.c  2
-rw-r--r--  net/ipv4/udp_bpf.c  1
-rw-r--r--  net/ipv4/udp_offload.c  2
33 files changed, 644 insertions, 340 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 54648181dd56..1d816a5fd3eb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
* changes context in a wrong way it will be caught.
*/
err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
- BPF_CGROUP_INET4_BIND, &flags);
+ CGROUP_INET4_BIND, &flags);
if (err)
return err;
@@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
- BPF_CGROUP_INET4_GETPEERNAME,
+ CGROUP_INET4_GETPEERNAME,
NULL);
} else {
__be32 addr = inet->inet_rcv_saddr;
@@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
- BPF_CGROUP_INET4_GETSOCKNAME,
+ CGROUP_INET4_GETSOCKNAME,
NULL);
}
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -953,10 +953,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFPFLAGS:
- if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+ if (get_user_ifreq(&ifr, NULL, p))
return -EFAULT;
err = devinet_ioctl(net, cmd, &ifr);
- if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq)))
+ if (!err && put_user_ifreq(&ifr, p))
err = -EFAULT;
break;
@@ -966,7 +966,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFDSTADDR:
case SIOCSIFPFLAGS:
case SIOCSIFFLAGS:
- if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+ if (get_user_ifreq(&ifr, NULL, p))
return -EFAULT;
err = devinet_ioctl(net, cmd, &ifr);
break;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 9e41eff4a685..0dcee9df1326 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -10,6 +10,9 @@
#include <net/tcp.h>
#include <net/bpf_sk_storage.h>
+/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
offsetof(struct tcp_congestion_ops, release),
@@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
.arg2_type = ARG_ANYTHING,
};
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+ const struct btf_member *m;
+ const struct btf_type *t;
+ u32 midx;
+
+ midx = prog->expected_attach_type;
+ t = bpf_tcp_congestion_ops.type;
+ m = &btf_type_member(t)[midx];
+
+ return btf_member_bit_offset(t, m) / 8;
+}
+
static const struct bpf_func_proto *
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
const struct bpf_prog *prog)
@@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ /* Does not allow release() to call setsockopt.
+ * release() is called when the current bpf-tcp-cc
+ * is retiring. It is not allowed to call
+ * setsockopt() to make further changes which
+ * may potentially allocate new resources.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ /* Since get/setsockopt is usually expected to
+ * be available together, disable getsockopt for
+ * release also to avoid usage surprise.
+ * The bpf-tcp-cc already has a more powerful way
+ * to read tcp_sock from the PTR_TO_BTF_ID.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
default:
return bpf_base_func_proto(func_id);
}
@@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
tcp_unregister_congestion_control(kdata);
}
-/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
-
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 73721a4448bd..f4468980b675 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -215,7 +215,7 @@ static void devinet_sysctl_unregister(struct in_device *idev)
static struct in_ifaddr *inet_alloc_ifa(void)
{
- return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+ return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT);
}
static void inet_rcu_free_ifa(struct rcu_head *head)
@@ -1243,7 +1243,7 @@ out:
return ret;
}
-static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
struct in_device *in_dev = __in_dev_get_rtnl(dev);
const struct in_ifaddr *ifa;
@@ -1950,7 +1950,8 @@ static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
};
static int inet_validate_link_af(const struct net_device *dev,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int err, rem;
@@ -1959,7 +1960,7 @@ static int inet_validate_link_af(const struct net_device *dev,
return -EAFNOSUPPORT;
err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
- inet_af_policy, NULL);
+ inet_af_policy, extack);
if (err < 0)
return err;
@@ -2424,11 +2425,15 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
int *valp = ctl->data;
int val = *valp;
loff_t pos = *ppos;
- int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
+ int ret;
- if (write && *valp != val) {
- struct net *net = ctl->extra2;
+ if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ if (write && *valp != val) {
if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
if (!rtnl_trylock()) {
/* Restore the original values before restarting */
@@ -2762,8 +2767,6 @@ void __init devinet_init(void)
INIT_HLIST_HEAD(&inet_addr_lst[i]);
register_pernet_subsys(&devinet_ops);
-
- register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index a09e36c4a413..851f542928a3 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -97,7 +97,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
{
- struct esp_output_extra *extra = esp_tmp_extra(tmp);
struct crypto_aead *aead = x->data;
int extralen = 0;
u8 *iv;
@@ -105,9 +104,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
struct scatterlist *sg;
if (x->props.flags & XFRM_STATE_ESN)
- extralen += sizeof(*extra);
+ extralen += sizeof(struct esp_output_extra);
- extra = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4c0c33e4710d..b42c429cebbe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,9 +208,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
void fib_nh_common_release(struct fib_nh_common *nhc)
{
- if (nhc->nhc_dev)
- dev_put(nhc->nhc_dev);
-
+ dev_put(nhc->nhc_dev);
lwtstate_put(nhc->nhc_lwtstate);
rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
rt_fibinfo_free(&nhc->nhc_rth_input);
@@ -260,7 +258,7 @@ EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
spin_lock_bh(&fib_info_lock);
- if (fi && --fi->fib_treeref == 0) {
+ if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
@@ -1373,7 +1371,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
if (!cfg->fc_mx) {
fi = fib_find_info_nh(net, cfg);
if (fi) {
- fi->fib_treeref++;
+ refcount_inc(&fi->fib_treeref);
return fi;
}
}
@@ -1547,11 +1545,11 @@ link_it:
if (ofi) {
fi->fib_dead = 1;
free_fib_info(fi);
- ofi->fib_treeref++;
+ refcount_inc(&ofi->fib_treeref);
return ofi;
}
- fi->fib_treeref++;
+ refcount_set(&fi->fib_treeref, 1);
refcount_set(&fi->fib_clntref, 1);
spin_lock_bh(&fib_info_lock);
hlist_add_head(&fi->fib_hash,
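
Side note, not part of the diff: the fib_semantics.c hunks above convert fi->fib_treeref from a plain integer into a refcount_t, so overflow and underflow saturate and warn instead of silently wrapping. A minimal sketch of the same conversion pattern on a hypothetical object (demo_obj and its helpers are invented for illustration):

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t ref;
};

static struct demo_obj *demo_alloc(void)
{
	struct demo_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->ref, 1);	/* was: o->ref = 1;    */
	return o;
}

static void demo_get(struct demo_obj *o)
{
	refcount_inc(&o->ref);			/* was: o->ref++;      */
}

static void demo_put(struct demo_obj *o)
{
	if (refcount_dec_and_test(&o->ref))	/* was: --o->ref == 0  */
		kfree(o);
}
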
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 25cf387cca5b..8060524f4256 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2380,11 +2380,11 @@ void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
LEAF_SIZE,
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index e5f69b0bf3df..8fcbc6258ec5 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -230,8 +230,8 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
+ const struct net_offload __rcu **offloads;
u8 proto = fou_from_sock(sk)->protocol;
- const struct net_offload **offloads;
const struct net_offload *ops;
struct sk_buff *pp = NULL;
@@ -263,10 +263,10 @@ out_unlock:
static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
int nhoff)
{
- const struct net_offload *ops;
+ const struct net_offload __rcu **offloads;
u8 proto = fou_from_sock(sk)->protocol;
+ const struct net_offload *ops;
int err = -ENOSYS;
- const struct net_offload **offloads;
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
@@ -311,7 +311,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload **offloads;
+ const struct net_offload __rcu **offloads;
const struct net_offload *ops;
struct sk_buff *pp = NULL;
struct sk_buff *p;
@@ -457,8 +457,8 @@ out:
static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
- const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+ const struct net_offload __rcu **offloads;
const struct net_offload *ops;
unsigned int guehlen = 0;
u8 proto;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c695d294a5df..8b30cadff708 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1095,8 +1095,7 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
sizeof(struct in6_addr))
goto send_mal_query;
dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
break;
#endif
default:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 00576bae183d..d2e2b3d18c66 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2240,7 +2240,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
iml->sfmode, psf->sl_count, psf->sl_addr, 0);
RCU_INIT_POINTER(iml->sflist, NULL);
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psf, rcu);
return err;
}
@@ -2389,7 +2389,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -2400,7 +2401,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
kfree_rcu(psl, rcu);
}
rcu_assign_pointer(pmc->sflist, newpsl);
@@ -2475,19 +2477,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
goto done;
}
if (msf->imsf_numsrc) {
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
- GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ msf->imsf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
- memcpy(newpsl->sl_addr, msf->imsf_slist,
- msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
+ flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ sock_kfree_s(sk, newpsl,
+ struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
} else {
@@ -2500,7 +2505,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
kfree_rcu(psl, rcu);
} else
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
@@ -2558,14 +2564,14 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
count = psl->sl_count;
}
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
- len = copycount * sizeof(psl->sl_addr[0]);
+ len = flex_array_size(psl, sl_addr, copycount);
msf->imsf_numsrc = count;
if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
- copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len))
return -EFAULT;
return 0;
done:
@@ -2720,6 +2726,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
rv = 1;
} else if (im) {
if (src_addr) {
+ spin_lock_bh(&im->lock);
for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
@@ -2730,6 +2737,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
im->sfcount[MCAST_EXCLUDE];
else
rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ spin_unlock_bh(&im->lock);
} else
rv = 1; /* unspecified source; tentatively allow */
}
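
Side note, not part of the diff: the igmp.c hunks above replace the open-coded IP_SFLSIZE() size calculation with the overflow-checked struct_size() and flex_array_size() helpers, matching the switch of the source lists to flexible array members (sl_addr, imsf_slist_flex). A hedged sketch of what these helpers compute, on a made-up demo_filter structure:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

struct demo_filter {
	unsigned int	count;
	__be32		addr[];		/* flexible array member */
};

static struct demo_filter *demo_filter_alloc(unsigned int n)
{
	/* struct_size(f, addr, n) == sizeof(*f) + n * sizeof(f->addr[0]),
	 * saturating instead of wrapping on overflow.
	 */
	struct demo_filter *f = kmalloc(struct_size(f, addr, n), GFP_KERNEL);

	if (f) {
		f->count = n;
		/* flex_array_size(f, addr, n) == n * sizeof(f->addr[0]),
		 * also overflow-checked.
		 */
		memset(f->addr, 0, flex_array_size(f, addr, n));
	}
	return f;
}
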
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 754013fa393b..f25d02ad4a8a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -534,7 +534,8 @@ out:
atomic_read(&newsk->sk_rmem_alloc));
mem_cgroup_sk_alloc(newsk);
if (newsk->sk_memcg && amt)
- mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
+ mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
+ GFP_KERNEL | __GFP_NOFAIL);
release_sock(newsk);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 95419b7adf5c..177d26d8fb9c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -925,7 +925,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_stop = ipgre_close,
#endif
.ndo_start_xmit = ipgre_xmit,
- .ndo_do_ioctl = ip_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d8a8da3ae7e..9bca57ef8b83 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,19 +198,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
- /* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
- struct sk_buff *skb2;
-
- skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
- if (!skb2) {
- kfree_skb(skb);
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
@@ -446,8 +437,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
- memcpy(&iph->saddr, &fl4->saddr,
- sizeof(fl4->saddr) + sizeof(fl4->daddr));
+
+ iph->saddr = fl4->saddr;
+ iph->daddr = fl4->daddr;
}
/* Note: skb->sk can be different from sk, in case of tunnels */
@@ -614,18 +606,6 @@ void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
}
EXPORT_SYMBOL(ip_fraglist_init);
-static void ip_fraglist_ipcb_prepare(struct sk_buff *skb,
- struct ip_fraglist_iter *iter)
-{
- struct sk_buff *to = iter->frag;
-
- /* Copy the flags to each fragment. */
- IPCB(to)->flags = IPCB(skb)->flags;
-
- if (iter->offset == 0)
- ip_options_fragment(to);
-}
-
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
{
unsigned int hlen = iter->hlen;
@@ -671,7 +651,7 @@ void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
EXPORT_SYMBOL(ip_frag_init);
static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
- bool first_frag, struct ip_frag_state *state)
+ bool first_frag)
{
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
@@ -846,11 +826,14 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
/* Everything is OK. Generate! */
ip_fraglist_init(skb, iph, hlen, &iter);
+ if (iter.frag)
+ ip_options_fragment(iter.frag);
+
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
if (iter.frag) {
- ip_fraglist_ipcb_prepare(skb, &iter);
+ IPCB(iter.frag)->flags = IPCB(skb)->flags;
ip_fraglist_prepare(skb, &iter);
}
@@ -905,7 +888,7 @@ slow_path:
err = PTR_ERR(skb2);
goto fail;
}
- ip_frag_ipcb(skb, skb2, first_frag, &state);
+ ip_frag_ipcb(skb, skb2, first_frag);
/*
* Put this fragment into the sending queue.
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ec6036713e2c..b297bb28556e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -663,12 +663,11 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex,
struct sockaddr_storage *group,
struct sockaddr_storage *list)
{
- int msize = IP_MSFILTER_SIZE(numsrc);
struct ip_msfilter *msf;
struct sockaddr_in *psin;
int err, i;
- msf = kmalloc(msize, GFP_KERNEL);
+ msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL);
if (!msf)
return -ENOBUFS;
@@ -684,7 +683,7 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex,
if (psin->sin_family != AF_INET)
goto Eaddrnotavail;
- msf->imsf_slist[i] = psin->sin_addr.s_addr;
+ msf->imsf_slist_flex[i] = psin->sin_addr.s_addr;
}
err = ip_mc_msfilter(sk, msf, ifindex);
kfree(msf);
@@ -791,7 +790,8 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
goto out_free_gsf;
err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
- gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist);
+ gsf->gf_fmode, &gsf->gf_group,
+ gsf->gf_slist_flex);
out_free_gsf:
kfree(gsf);
return err;
@@ -800,7 +800,7 @@ out_free_gsf:
static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
int optlen)
{
- const int size0 = offsetof(struct compat_group_filter, gf_slist);
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
struct compat_group_filter *gf32;
unsigned int n;
void *p;
@@ -814,7 +814,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
p = kmalloc(optlen + 4, GFP_KERNEL);
if (!p)
return -ENOMEM;
- gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
err = -EFAULT;
if (copy_from_sockptr(gf32, optval, optlen))
@@ -827,7 +827,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
goto out_free_gsf;
err = -EINVAL;
- if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen)
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
goto out_free_gsf;
/* numsrc >= (4G-140)/128 overflow in 32 bits */
@@ -835,7 +835,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf)
goto out_free_gsf;
err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
- &gf32->gf_group, gf32->gf_slist);
+ &gf32->gf_group, gf32->gf_slist_flex);
out_free_gsf:
kfree(p);
return err;
@@ -1456,7 +1456,7 @@ static bool getsockopt_needs_rtnl(int optname)
static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
int __user *optlen, int len)
{
- const int size0 = offsetof(struct group_filter, gf_slist);
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
struct group_filter __user *p = optval;
struct group_filter gsf;
int num;
@@ -1468,7 +1468,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
return -EFAULT;
num = gsf.gf_numsrc;
- err = ip_mc_gsfget(sk, &gsf, p->gf_slist);
+ err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex);
if (err)
return err;
if (gsf.gf_numsrc < num)
@@ -1482,7 +1482,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
int __user *optlen, int len)
{
- const int size0 = offsetof(struct compat_group_filter, gf_slist);
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
struct compat_group_filter __user *p = optval;
struct compat_group_filter gf32;
struct group_filter gf;
@@ -1499,7 +1499,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval,
num = gf.gf_numsrc = gf32.gf_numsrc;
gf.gf_group = gf32.gf_group;
- err = ip_mc_gsfget(sk, &gf, p->gf_slist);
+ err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex);
if (err)
return err;
if (gf.gf_numsrc < num)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index be75b409445c..fe9101d3d69e 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -958,19 +958,20 @@ done:
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
-int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
{
struct ip_tunnel_parm p;
int err;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
return -EFAULT;
err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
- if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (!err && copy_to_user(data, &p, sizeof(p)))
return -EFAULT;
return err;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index eb560eecee08..efe25a0172e6 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -405,7 +405,7 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_init = vti_tunnel_init,
.ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = vti_tunnel_xmit,
- .ndo_do_ioctl = ip_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 266c65577ba6..3aa78ccbec3e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_init = ipip_tunnel_init,
.ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
- .ndo_do_ioctl = ip_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
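
Side note, not part of the diff: ip_gre.c, ip_vti.c and ipip.c above all move their tunnel configuration ioctls from .ndo_do_ioctl to the newer .ndo_siocdevprivate callback, with ip_tunnel.c providing the shared ip_tunnel_siocdevprivate() handler; the user pointer now arrives as an explicit "data" argument instead of being fished out of ifr->ifr_ifru.ifru_data. A hypothetical driver-side sketch of the same callback shape (demo_params and demo_siocdevprivate are invented for illustration):

#include <linux/netdevice.h>
#include <linux/sockios.h>
#include <linux/uaccess.h>

struct demo_params {
	u32 flags;
};

static int demo_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			       void __user *data, int cmd)
{
	struct demo_params p;

	if (cmd != SIOCDEVPRIVATE)
		return -EOPNOTSUPP;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;

	/* ... apply p to the device here ... */

	if (copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}

static const struct net_device_ops demo_netdev_ops = {
	.ndo_siocdevprivate	= demo_siocdevprivate,
};
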
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 6922612df456..3de78416ec76 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -18,15 +18,12 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
-static int __net_init arptable_filter_table_init(struct net *net);
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
.priority = NF_IP_PRI_FILTER,
- .table_init = arptable_filter_table_init,
};
/* The work comes in here from netfilter.c */
@@ -39,7 +36,7 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *arpfilter_ops __read_mostly;
-static int __net_init arptable_filter_table_init(struct net *net)
+static int arptable_filter_table_init(struct net *net)
{
struct arpt_replace *repl;
int err;
@@ -69,30 +66,32 @@ static struct pernet_operations arptable_filter_net_ops = {
static int __init arptable_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ arptable_filter_table_init);
+
+ if (ret < 0)
+ return ret;
arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
- if (IS_ERR(arpfilter_ops))
+ if (IS_ERR(arpfilter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(arpfilter_ops);
+ }
ret = register_pernet_subsys(&arptable_filter_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(arpfilter_ops);
return ret;
}
- ret = arptable_filter_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&arptable_filter_net_ops);
- kfree(arpfilter_ops);
- }
-
return ret;
}
static void __exit arptable_filter_fini(void)
{
unregister_pernet_subsys(&arptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(arpfilter_ops);
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 8f7ca67475b7..8fd1aba8af31 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -66,11 +66,22 @@ struct clusterip_net {
/* lock protects the configs list */
spinlock_t lock;
+ bool clusterip_deprecated_warning;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *procdir;
/* mutex protects the config->pde*/
struct mutex mutex;
#endif
+ unsigned int hook_users;
+};
+
+static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state);
+
+static const struct nf_hook_ops cip_arp_ops = {
+ .hook = clusterip_arp_mangle,
+ .pf = NFPROTO_ARP,
+ .hooknum = NF_ARP_OUT,
+ .priority = -1
};
static unsigned int clusterip_net_id __read_mostly;
@@ -458,6 +469,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int clusterip_tg_check(const struct xt_tgchk_param *par)
{
struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ struct clusterip_net *cn = clusterip_pernet(par->net);
const struct ipt_entry *e = par->entryinfo;
struct clusterip_config *config;
int ret, i;
@@ -467,6 +479,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EOPNOTSUPP;
}
+ if (cn->hook_users == UINT_MAX)
+ return -EOVERFLOW;
+
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
@@ -517,10 +532,23 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return ret;
}
- if (!par->net->xt.clusterip_deprecated_warning) {
+ if (cn->hook_users == 0) {
+ ret = nf_register_net_hook(par->net, &cip_arp_ops);
+
+ if (ret < 0) {
+ clusterip_config_entry_put(config);
+ clusterip_config_put(config);
+ nf_ct_netns_put(par->net, par->family);
+ return ret;
+ }
+ }
+
+ cn->hook_users++;
+
+ if (!cn->clusterip_deprecated_warning) {
pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
"use xt_cluster instead\n");
- par->net->xt.clusterip_deprecated_warning = true;
+ cn->clusterip_deprecated_warning = true;
}
cipinfo->config = config;
@@ -531,6 +559,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ struct clusterip_net *cn = clusterip_pernet(par->net);
/* if no more entries are referencing the config, remove it
* from the list and destroy the proc entry */
@@ -539,6 +568,10 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
nf_ct_netns_put(par->net, par->family);
+ cn->hook_users--;
+
+ if (cn->hook_users == 0)
+ nf_unregister_net_hook(par->net, &cip_arp_ops);
}
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
@@ -602,9 +635,8 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
+clusterip_arp_mangle(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
@@ -654,13 +686,6 @@ arp_mangle(void *priv,
return NF_ACCEPT;
}
-static const struct nf_hook_ops cip_arp_ops = {
- .hook = arp_mangle,
- .pf = NFPROTO_ARP,
- .hooknum = NF_ARP_OUT,
- .priority = -1
-};
-
/***********************************************************************
* PROC DIR HANDLING
***********************************************************************/
@@ -817,20 +842,14 @@ static const struct proc_ops clusterip_proc_ops = {
static int clusterip_net_init(struct net *net)
{
struct clusterip_net *cn = clusterip_pernet(net);
- int ret;
INIT_LIST_HEAD(&cn->configs);
spin_lock_init(&cn->lock);
- ret = nf_register_net_hook(net, &cip_arp_ops);
- if (ret < 0)
- return ret;
-
#ifdef CONFIG_PROC_FS
cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
if (!cn->procdir) {
- nf_unregister_net_hook(net, &cip_arp_ops);
pr_err("Unable to proc dir entry\n");
return -ENOMEM;
}
@@ -850,7 +869,6 @@ static void clusterip_net_exit(struct net *net)
cn->procdir = NULL;
mutex_unlock(&cn->mutex);
#endif
- nf_unregister_net_hook(net, &cip_arp_ops);
}
static struct pernet_operations clusterip_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 8272df7c6ad5..0eb0e2ab9bfc 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -19,7 +19,6 @@ MODULE_DESCRIPTION("iptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
-static int __net_init iptable_filter_table_init(struct net *net);
static const struct xt_table packet_filter = {
.name = "filter",
@@ -27,7 +26,6 @@ static const struct xt_table packet_filter = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
- .table_init = iptable_filter_table_init,
};
static unsigned int
@@ -43,7 +41,7 @@ static struct nf_hook_ops *filter_ops __read_mostly;
static bool forward __read_mostly = true;
module_param(forward, bool, 0000);
-static int __net_init iptable_filter_table_init(struct net *net)
+static int iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
int err;
@@ -62,7 +60,7 @@ static int __net_init iptable_filter_table_init(struct net *net)
static int __net_init iptable_filter_net_init(struct net *net)
{
- if (net == &init_net || !forward)
+ if (!forward)
return iptable_filter_table_init(net);
return 0;
@@ -86,22 +84,32 @@ static struct pernet_operations iptable_filter_net_ops = {
static int __init iptable_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ iptable_filter_table_init);
+
+ if (ret < 0)
+ return ret;
filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
- if (IS_ERR(filter_ops))
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(filter_ops);
+ }
ret = register_pernet_subsys(&iptable_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
+ return ret;
+ }
- return ret;
+ return 0;
}
static void __exit iptable_filter_fini(void)
{
unregister_pernet_subsys(&iptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 2abc3836f391..40417a3f930b 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
-static int __net_init iptable_mangle_table_init(struct net *net);
-
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_MANGLE,
- .table_init = iptable_mangle_table_init,
};
static unsigned int
@@ -83,7 +80,7 @@ iptable_mangle_hook(void *priv,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init iptable_mangle_table_init(struct net *net)
+static int iptable_mangle_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
@@ -113,32 +110,32 @@ static struct pernet_operations iptable_mangle_net_ops = {
static int __init iptable_mangle_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_mangler,
+ iptable_mangle_table_init);
+ if (ret < 0)
+ return ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
ret = PTR_ERR(mangle_ops);
return ret;
}
ret = register_pernet_subsys(&iptable_mangle_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
return ret;
}
- ret = iptable_mangle_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_mangle_net_ops);
- kfree(mangle_ops);
- }
-
return ret;
}
static void __exit iptable_mangle_fini(void)
{
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a9913842ef18..45d7e072e6a5 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -17,8 +17,6 @@ struct iptable_nat_pernet {
struct nf_hook_ops *nf_nat_ops;
};
-static int __net_init iptable_nat_table_init(struct net *net);
-
static unsigned int iptable_nat_net_id __read_mostly;
static const struct xt_table nf_nat_ipv4_table = {
@@ -29,7 +27,6 @@ static const struct xt_table nf_nat_ipv4_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
- .table_init = iptable_nat_table_init,
};
static unsigned int iptable_nat_do_chain(void *priv,
@@ -113,7 +110,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
kfree(ops);
}
-static int __net_init iptable_nat_table_init(struct net *net)
+static int iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
@@ -155,20 +152,25 @@ static struct pernet_operations iptable_nat_net_ops = {
static int __init iptable_nat_init(void)
{
- int ret = register_pernet_subsys(&iptable_nat_net_ops);
+ int ret = xt_register_template(&nf_nat_ipv4_table,
+ iptable_nat_table_init);
+
+ if (ret < 0)
+ return ret;
- if (ret)
+ ret = register_pernet_subsys(&iptable_nat_net_ops);
+ if (ret < 0) {
+ xt_unregister_template(&nf_nat_ipv4_table);
return ret;
+ }
- ret = iptable_nat_table_init(&init_net);
- if (ret)
- unregister_pernet_subsys(&iptable_nat_net_ops);
return ret;
}
static void __exit iptable_nat_exit(void)
{
unregister_pernet_subsys(&iptable_nat_net_ops);
+ xt_unregister_template(&nf_nat_ipv4_table);
}
module_init(iptable_nat_init);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index ceef397c1f5f..b88e0f36cd05 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -12,8 +12,6 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static int __net_init iptable_raw_table_init(struct net *net);
-
static bool raw_before_defrag __read_mostly;
MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
module_param(raw_before_defrag, bool, 0000);
@@ -24,7 +22,6 @@ static const struct xt_table packet_raw = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW,
- .table_init = iptable_raw_table_init,
};
static const struct xt_table packet_raw_before_defrag = {
@@ -33,7 +30,6 @@ static const struct xt_table packet_raw_before_defrag = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
- .table_init = iptable_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -89,22 +85,24 @@ static int __init iptable_raw_init(void)
pr_info("Enabling raw table before defrag\n");
}
+ ret = xt_register_template(table,
+ iptable_raw_table_init);
+ if (ret < 0)
+ return ret;
+
rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook);
- if (IS_ERR(rawtable_ops))
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
return PTR_ERR(rawtable_ops);
+ }
ret = register_pernet_subsys(&iptable_raw_net_ops);
if (ret < 0) {
+ xt_unregister_template(table);
kfree(rawtable_ops);
return ret;
}
- ret = iptable_raw_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_raw_net_ops);
- kfree(rawtable_ops);
- }
-
return ret;
}
@@ -112,6 +110,7 @@ static void __exit iptable_raw_fini(void)
{
unregister_pernet_subsys(&iptable_raw_net_ops);
kfree(rawtable_ops);
+ xt_unregister_template(&packet_raw);
}
module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 77973f5fd8f6..f519162a2fa5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -25,15 +25,12 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
-static int __net_init iptable_security_table_init(struct net *net);
-
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_SECURITY,
- .table_init = iptable_security_table_init,
};
static unsigned int
@@ -45,7 +42,7 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init iptable_security_table_init(struct net *net)
+static int iptable_security_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
@@ -75,24 +72,25 @@ static struct pernet_operations iptable_security_net_ops = {
static int __init iptable_security_init(void)
{
- int ret;
+ int ret = xt_register_template(&security_table,
+ iptable_security_table_init);
+
+ if (ret < 0)
+ return ret;
sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
- if (IS_ERR(sectbl_ops))
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
return PTR_ERR(sectbl_ops);
+ }
ret = register_pernet_subsys(&iptable_security_net_ops);
if (ret < 0) {
+ xt_unregister_template(&security_table);
kfree(sectbl_ops);
return ret;
}
- ret = iptable_security_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_security_net_ops);
- kfree(sectbl_ops);
- }
-
return ret;
}
@@ -100,6 +98,7 @@ static void __exit iptable_security_fini(void)
{
unregister_pernet_subsys(&iptable_security_net_ops);
kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a6f20ee35335..d6899ab5fb39 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -276,12 +276,13 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
struct rt_cache_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
return 0;
}
- seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
- " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x\n",
dst_entries_get_slow(&ipv4_dst_ops),
0, /* st->in_hit */
st->in_slow_tot,
@@ -586,18 +587,25 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
}
}
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
- struct fib_nh_exception *fnhe, *oldest;
+ struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
+ struct fib_nh_exception *fnhe, *oldest = NULL;
- oldest = rcu_dereference(hash->chain);
- for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
- fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
+ fnhe = rcu_dereference_protected(*fnhe_p,
+ lockdep_is_held(&fnhe_lock));
+ if (!fnhe)
+ break;
+ if (!oldest ||
+ time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
oldest = fnhe;
+ oldest_p = fnhe_p;
+ }
}
fnhe_flush_routes(oldest);
- return oldest;
+ *oldest_p = oldest->fnhe_next;
+ kfree_rcu(oldest, rcu);
}
static u32 fnhe_hashfun(__be32 daddr)
@@ -676,16 +684,21 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
if (rt)
fill_route_from_fnhe(rt, fnhe);
} else {
- if (depth > FNHE_RECLAIM_DEPTH)
- fnhe = fnhe_oldest(hash);
- else {
- fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
- if (!fnhe)
- goto out_unlock;
-
- fnhe->fnhe_next = hash->chain;
- rcu_assign_pointer(hash->chain, fnhe);
+ /* Randomize max depth to avoid some side channels attacks. */
+ int max_depth = FNHE_RECLAIM_DEPTH +
+ prandom_u32_max(FNHE_RECLAIM_DEPTH);
+
+ while (depth > max_depth) {
+ fnhe_remove_oldest(hash);
+ depth--;
}
+
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+
fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
@@ -693,6 +706,8 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
fnhe->fnhe_mtu_locked = lock;
fnhe->fnhe_expires = max(1UL, expires);
+ rcu_assign_pointer(hash->chain, fnhe);
+
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
* applies to them.
@@ -1299,26 +1314,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- const struct rtable *rt = (const struct rtable *)dst;
- unsigned int mtu = rt->rt_pmtu;
-
- if (!mtu || time_after_eq(jiffies, rt->dst.expires))
- mtu = dst_metric_raw(dst, RTAX_MTU);
-
- if (mtu)
- goto out;
-
- mtu = READ_ONCE(dst->dev->mtu);
-
- if (unlikely(ip_mtu_locked(dst))) {
- if (rt->rt_uses_gateway && mtu > 576)
- mtu = 576;
- }
-
-out:
- mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
-
- return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+ return ip_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
@@ -2831,8 +2827,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->output = dst_discard_out;
new->dev = net->loopback_dev;
- if (new->dev)
- dev_hold(new->dev);
+ dev_hold(new->dev);
rt->rt_is_input = ort->rt_is_input;
rt->rt_iif = ort->rt_iif;
@@ -3170,7 +3165,7 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
udph = skb_put_zero(skb, sizeof(struct udphdr));
udph->source = sport;
udph->dest = dport;
- udph->len = sizeof(struct udphdr);
+ udph->len = htons(sizeof(struct udphdr));
udph->check = 0;
break;
}
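
Side note, not part of the diff: in the route.c hunk above, the fixed FNHE_RECLAIM_DEPTH chain limit is replaced by a limit randomized with prandom_u32_max(), so an attacker cannot predict exactly when exception entries start being evicted. A small sketch of the idiom (demo names are made up; prandom_u32_max(n) returns a value in [0, n)):

#include <linux/prandom.h>
#include <linux/types.h>

#define DEMO_RECLAIM_DEPTH	5

static bool demo_should_evict(unsigned int depth)
{
	/* Effective limit varies between DEMO_RECLAIM_DEPTH and
	 * 2 * DEMO_RECLAIM_DEPTH - 1 on every call.
	 */
	unsigned int max_depth = DEMO_RECLAIM_DEPTH +
				 prandom_u32_max(DEMO_RECLAIM_DEPTH);

	return depth > max_depth;
}
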
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8cb44040ec68..e8b48df73c85 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3338,6 +3338,7 @@ int tcp_set_window_clamp(struct sock *sk, int val)
} else {
tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
SOCK_MIN_RCVBUF / 2 : val;
+ tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
}
return 0;
}
@@ -4512,7 +4513,9 @@ void __init tcp_init(void)
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 25fa4c01a17f..59412d6354a0 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -55,12 +55,7 @@ void tcp_fastopen_ctx_destroy(struct net *net)
{
struct tcp_fastopen_context *ctxt;
- spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
-
- ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
- spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+ ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL);
if (ctxt)
call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
@@ -89,18 +84,12 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
ctx->num = 1;
}
- spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
if (sk) {
q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- octx = rcu_dereference_protected(q->ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(q->ctx, ctx);
+ octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx);
} else {
- octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+ octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx);
}
- spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
@@ -379,8 +368,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
return NULL;
}
- if (syn_data &&
- tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
+ if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
if (foc->len == 0) {
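
Side note, not part of the diff: the tcp_fastopen.c hunks above drop the tcp_fastopen_ctx_lock spinlock. Since the lock only serialized a pointer replacement, an atomic xchg() can publish the new RCU-protected context and hand back the old one for deferred freeing. A conceptual sketch of that lock-free swap on a made-up demo_ctx (the patch itself uses call_rcu() with a dedicated free callback; kfree_rcu() is the simplified equivalent here):

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_ctx {
	struct rcu_head	rcu;
	u32		key[4];
};

static struct demo_ctx __rcu *demo_current;

static void demo_install(struct demo_ctx *new_ctx)
{
	struct demo_ctx *old_ctx;

	/* Atomically publish new_ctx and retrieve the previous context. */
	old_ctx = xchg((__force struct demo_ctx **)&demo_current, new_ctx);
	if (old_ctx)
		kfree_rcu(old_ctx, rcu);	/* freed after a grace period */
}
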
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 149ceb5c94ff..3f7bd7ae7d7a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
+#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -454,11 +455,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int skbtruesize)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
@@ -471,7 +473,27 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
return 0;
}
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
+ * can play nice with us, as sk_buff and skb->head might be either
+ * freed or shared with up to MAX_SKB_FRAGS segments.
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
+ * and if no payload was pulled in skb->head before reaching us.
+ */
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
+{
+ u32 truesize = skb->truesize;
+
+ if (adjust && !skb_headlen(skb)) {
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
+ /* paranoid check, some drivers might be buggy */
+ if (unlikely((int)truesize < (int)skb->len))
+ truesize = skb->truesize;
+ }
+ return truesize;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
+ bool adjust)
{
struct tcp_sock *tp = tcp_sk(sk);
int room;
@@ -480,15 +502,16 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (room > 0 && !tcp_under_memory_pressure(sk)) {
+ unsigned int truesize = truesize_adjust(adjust, skb);
int incr;
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
incr = 2 * tp->advmss;
else
- incr = __tcp_grow_window(sk, skb);
+ incr = __tcp_grow_window(sk, skb, truesize);
if (incr) {
incr = max_t(int, incr, 2 * skb->len);
@@ -782,7 +805,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
tcp_ecn_check_ce(sk, skb);
if (skb->len >= 128)
- tcp_grow_window(sk, skb);
+ tcp_grow_window(sk, skb, true);
}
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -969,6 +992,8 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
return 0;
if (seq_len > tp->mss_cache)
dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+ else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
+ state->flag |= FLAG_DSACK_TLP;
tp->dsack_dups += dup_segs;
/* Skip the DSACK if dup segs weren't retransmitted by sender */
@@ -976,7 +1001,14 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
return 0;
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
- tp->rack.dsack_seen = 1;
+ /* We increase the RACK ordering window in rounds where we receive
+ * DSACKs that may have been due to reordering causing RACK to trigger
+ * a spurious fast recovery. Thus RACK ignores DSACKs that happen
+ * without having seen reordering, or that match TLP probes (TLP
+ * is timer-driven, not triggered by RACK).
+ */
+ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
+ tp->rack.dsack_seen = 1;
state->flag |= FLAG_DSACKING_ACK;
/* A spurious retransmission is delivered */
@@ -3628,7 +3660,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
if (!tp->tlp_retrans) {
/* TLP of new data has been acknowledged */
tp->tlp_high_seq = 0;
- } else if (flag & FLAG_DSACKING_ACK) {
+ } else if (flag & FLAG_DSACK_TLP) {
/* This DSACK means original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
} else if (after(ack, tp->tlp_high_seq)) {
@@ -4769,7 +4801,7 @@ coalesce_done:
* and trigger fast retransmit.
*/
if (tcp_is_sack(tp))
- tcp_grow_window(sk, skb);
+ tcp_grow_window(sk, skb, true);
kfree_skb_partial(skb, fragstolen);
skb = NULL;
goto add_sack;
@@ -4857,7 +4889,7 @@ end:
* and trigger fast retransmit.
*/
if (tcp_is_sack(tp))
- tcp_grow_window(sk, skb);
+ tcp_grow_window(sk, skb, false);
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
@@ -5383,7 +5415,7 @@ static void tcp_new_space(struct sock *sk)
tp->snd_cwnd_stamp = tcp_jiffies32;
}
- sk->sk_write_space(sk);
+ INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
static void tcp_check_space(struct sock *sk)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a692626c19e4..2e62e0d6373a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2277,51 +2277,72 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-/*
- * Get next listener socket follow cur. If cur is NULL, get first socket
- * starting from bucket given in st->bucket; when st->bucket is zero the
- * very first socket in the hash table is returned.
+static unsigned short seq_file_family(const struct seq_file *seq);
+
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+/* Find a non empty bucket (starting from st->bucket)
+ * and return the first sk from it.
*/
-static void *listening_get_next(struct seq_file *seq, void *cur)
+static void *listening_get_first(struct seq_file *seq)
{
- struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
- struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
- struct sock *sk = cur;
- if (st->bpf_seq_afinfo)
- afinfo = st->bpf_seq_afinfo;
- else
- afinfo = PDE_DATA(file_inode(seq->file));
+ st->offset = 0;
+ for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
+ struct inet_listen_hashbucket *ilb2;
+ struct inet_connection_sock *icsk;
+ struct sock *sk;
- if (!sk) {
-get_head:
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock(&ilb->lock);
- sk = sk_nulls_head(&ilb->nulls_head);
- st->offset = 0;
- goto get_sk;
+ ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+ if (hlist_empty(&ilb2->head))
+ continue;
+
+ spin_lock(&ilb2->lock);
+ inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ if (seq_sk_match(seq, sk))
+ return sk;
+ }
+ spin_unlock(&ilb2->lock);
}
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
+
+ return NULL;
+}
+
+/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
+ * If "cur" is the last one in the st->bucket,
+ * call listening_get_first() to return the first sk of the next
+ * non empty bucket.
+ */
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+ struct tcp_iter_state *st = seq->private;
+ struct inet_listen_hashbucket *ilb2;
+ struct inet_connection_sock *icsk;
+ struct sock *sk = cur;
+
++st->num;
++st->offset;
- sk = sk_nulls_next(sk);
-get_sk:
- sk_nulls_for_each_from(sk, node) {
- if (!net_eq(sock_net(sk), net))
- continue;
- if (afinfo->family == AF_UNSPEC ||
- sk->sk_family == afinfo->family)
+ icsk = inet_csk(sk);
+ inet_lhash2_for_each_icsk_continue(icsk) {
+ sk = (struct sock *)icsk;
+ if (seq_sk_match(seq, sk))
return sk;
}
- spin_unlock(&ilb->lock);
- st->offset = 0;
- if (++st->bucket < INET_LHTABLE_SIZE)
- goto get_head;
- return NULL;
+
+ ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+ spin_unlock(&ilb2->lock);
+ ++st->bucket;
+ return listening_get_first(seq);
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
@@ -2331,7 +2352,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
st->bucket = 0;
st->offset = 0;
- rc = listening_get_next(seq, NULL);
+ rc = listening_get_first(seq);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
@@ -2351,15 +2372,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
*/
static void *established_get_first(struct seq_file *seq)
{
- struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
- void *rc = NULL;
-
- if (st->bpf_seq_afinfo)
- afinfo = st->bpf_seq_afinfo;
- else
- afinfo = PDE_DATA(file_inode(seq->file));
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
@@ -2373,32 +2386,20 @@ static void *established_get_first(struct seq_file *seq)
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
- if ((afinfo->family != AF_UNSPEC &&
- sk->sk_family != afinfo->family) ||
- !net_eq(sock_net(sk), net)) {
- continue;
- }
- rc = sk;
- goto out;
+ if (seq_sk_match(seq, sk))
+ return sk;
}
spin_unlock_bh(lock);
}
-out:
- return rc;
+
+ return NULL;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_seq_afinfo *afinfo;
struct sock *sk = cur;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
-
- if (st->bpf_seq_afinfo)
- afinfo = st->bpf_seq_afinfo;
- else
- afinfo = PDE_DATA(file_inode(seq->file));
++st->num;
++st->offset;
@@ -2406,9 +2407,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
- if ((afinfo->family == AF_UNSPEC ||
- sk->sk_family == afinfo->family) &&
- net_eq(sock_net(sk), net))
+ if (seq_sk_match(seq, sk))
return sk;
}
@@ -2451,17 +2450,18 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
static void *tcp_seek_last_pos(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
+ int bucket = st->bucket;
int offset = st->offset;
int orig_num = st->num;
void *rc = NULL;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
- if (st->bucket >= INET_LHTABLE_SIZE)
+ if (st->bucket > tcp_hashinfo.lhash2_mask)
break;
st->state = TCP_SEQ_STATE_LISTENING;
- rc = listening_get_next(seq, NULL);
- while (offset-- && rc)
+ rc = listening_get_first(seq);
+ while (offset-- && rc && bucket == st->bucket)
rc = listening_get_next(seq, rc);
if (rc)
break;
@@ -2472,7 +2472,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
- while (offset-- && rc)
+ while (offset-- && rc && bucket == st->bucket)
rc = established_get_next(seq, rc);
}
@@ -2542,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
@@ -2687,6 +2687,15 @@ out:
}
#ifdef CONFIG_BPF_SYSCALL
+struct bpf_tcp_iter_state {
+	struct tcp_iter_state state;
+	unsigned int cur_sk;		/* next batch entry to hand to seq_show() */
+	unsigned int end_sk;		/* number of sockets held in batch[] */
+	unsigned int max_sk;		/* allocated size of batch[] */
+	struct sock **batch;		/* refcounted sockets of the current bucket */
+	bool st_bucket_done;		/* true if the whole bucket fit into batch[] */
+};
+
struct bpf_iter__tcp {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct sock_common *, sk_common);
@@ -2705,16 +2714,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
return bpf_iter_run_prog(prog, &ctx);
}
+static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
+{
+ while (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++]);
+}
+
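+/* Grow the batch array to hold new_batch_sz sockets.  References still
+ * held on sockets in the old batch are dropped before the old array is
+ * freed.
+ */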
+static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
+ unsigned int new_batch_sz)
+{
+ struct sock **new_batch;
+
+ new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+ GFP_USER | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ bpf_iter_tcp_put_batch(iter);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
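+/* Take a reference on start_sk and on every later sk in the same listening
+ * bucket that matches the iterated family and netns, as long as the batch
+ * has room.  The bucket lock is released before returning.  The return
+ * value is the total number of matching sockets, which may exceed
+ * iter->max_sk when the batch is too small.
+ */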
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock *start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct inet_connection_sock *icsk;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(start_sk);
+ iter->batch[iter->end_sk++] = start_sk;
+
+ icsk = inet_csk(start_sk);
+ inet_lhash2_for_each_icsk_continue(icsk) {
+ sk = (struct sock *)icsk;
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+ expected++;
+ }
+ }
+ spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
+
+ return expected;
+}
+
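+/* Same as bpf_iter_tcp_listening_batch() but walks an established (ehash)
+ * bucket and releases that bucket's lock before returning.
+ */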
+static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
+ struct sock *start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(start_sk);
+ iter->batch[iter->end_sk++] = start_sk;
+
+ sk = sk_nulls_next(start_sk);
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++] = sk;
+ }
+ expected++;
+ }
+ }
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+
+ return expected;
+}
+
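+/* Batch all matching sockets of one bucket while holding the bucket lock,
+ * taking a reference on each of them.  If the preallocated batch array
+ * turns out to be too small, grow it to 1.5x the required size and re-batch
+ * the bucket once; if it is still too small after that, return a partial
+ * batch with st_bucket_done left false.
+ */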
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int expected;
+ bool resized = false;
+ struct sock *sk;
+
+	/* The st->bucket is done.  Directly advance to the next
+	 * bucket instead of having tcp_seek_last_pos() skip the
+	 * sockets in the current bucket one by one only to find
+	 * out that it has to advance to the next bucket.
+	 */
+ if (iter->st_bucket_done) {
+ st->offset = 0;
+ st->bucket++;
+ if (st->state == TCP_SEQ_STATE_LISTENING &&
+ st->bucket > tcp_hashinfo.lhash2_mask) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ }
+ }
+
+again:
+ /* Get a new batch */
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+ iter->st_bucket_done = false;
+
+ sk = tcp_seek_last_pos(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ expected = bpf_iter_tcp_listening_batch(seq, sk);
+ else
+ expected = bpf_iter_tcp_established_batch(seq, sk);
+
+ if (iter->end_sk == expected) {
+ iter->st_bucket_done = true;
+ return sk;
+ }
+
+ if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
+ resized = true;
+ goto again;
+ }
+
+ return sk;
+}
+
+static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* bpf iter does not support lseek, so it always
+	 * continues from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_tcp_batch(seq);
+
+ return SEQ_START_TOKEN;
+}
+
+static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk;
+
+	/* Whenever seq_next() is called, seq_show() has already been
+	 * called for the sk at iter->cur_sk, so advance to the next
+	 * sk in the batch.
+ */
+ if (iter->cur_sk < iter->end_sk) {
+ /* Keeping st->num consistent in tcp_iter_state.
+ * bpf_iter_tcp does not use st->num.
+ * meta.seq_num is used instead.
+ */
+ st->num++;
+ /* Move st->offset to the next sk in the bucket such that
+ * the future start() will resume at st->offset in
+ * st->bucket. See tcp_seek_last_pos().
+ */
+ st->offset++;
+ sock_put(iter->batch[iter->cur_sk++]);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk];
+ else
+ sk = bpf_iter_tcp_batch(seq);
+
+ ++*pos;
+	/* Keep st->last_pos consistent in tcp_iter_state.
+	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
+ */
+ st->last_pos = *pos;
+ return sk;
+}
+
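+/* Full sockets are locked with lock_sock_fast() before running the bpf
+ * prog so that helpers like bpf_setsockopt() can be used on them safely.
+ * A socket that became unhashed after it was batched is skipped.
+ */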
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
struct sock *sk = v;
+ bool slow;
uid_t uid;
+ int ret;
if (v == SEQ_START_TOKEN)
return 0;
+ if (sk_fullsock(sk))
+ slow = lock_sock_fast(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
if (sk->sk_state == TCP_TIME_WAIT) {
uid = 0;
} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
@@ -2728,11 +2925,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
- return tcp_prog_seq_show(prog, &meta, v, uid);
+ ret = tcp_prog_seq_show(prog, &meta, v, uid);
+
+unlock:
+ if (sk_fullsock(sk))
+ unlock_sock_fast(sk, slow);
+ return ret;
}
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
+ struct bpf_tcp_iter_state *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
@@ -2743,16 +2947,33 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
(void)tcp_prog_seq_show(prog, &meta, v, 0);
}
- tcp_seq_stop(seq, v);
+ if (iter->cur_sk < iter->end_sk) {
+ bpf_iter_tcp_put_batch(iter);
+ iter->st_bucket_done = false;
+ }
}
static const struct seq_operations bpf_iter_tcp_seq_ops = {
.show = bpf_iter_tcp_seq_show,
- .start = tcp_seq_start,
- .next = tcp_seq_next,
+ .start = bpf_iter_tcp_seq_start,
+ .next = bpf_iter_tcp_seq_next,
.stop = bpf_iter_tcp_seq_stop,
};
#endif
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+ const struct tcp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
+ if (seq->op == &bpf_iter_tcp_seq_ops)
+ return AF_UNSPEC;
+#endif
+
+ /* Iterated from proc fs */
+ afinfo = PDE_DATA(file_inode(seq->file));
+ return afinfo->family;
+}
static const struct seq_operations tcp4_seq_ops = {
.show = tcp4_seq_show,
@@ -2964,7 +3185,6 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
- spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
@@ -3003,39 +3223,55 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
struct sock_common *sk_common, uid_t uid)
+#define INIT_BATCH_SZ 16
+
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
- struct tcp_iter_state *st = priv_data;
- struct tcp_seq_afinfo *afinfo;
- int ret;
+ struct bpf_tcp_iter_state *iter = priv_data;
+ int err;
- afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
- if (!afinfo)
- return -ENOMEM;
+ err = bpf_iter_init_seq_net(priv_data, aux);
+ if (err)
+ return err;
- afinfo->family = AF_UNSPEC;
- st->bpf_seq_afinfo = afinfo;
- ret = bpf_iter_init_seq_net(priv_data, aux);
- if (ret)
- kfree(afinfo);
- return ret;
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
+ if (err) {
+ bpf_iter_fini_seq_net(priv_data);
+ return err;
+ }
+
+ return 0;
}
static void bpf_iter_fini_tcp(void *priv_data)
{
- struct tcp_iter_state *st = priv_data;
+ struct bpf_tcp_iter_state *iter = priv_data;
- kfree(st->bpf_seq_afinfo);
bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
}
static const struct bpf_iter_seq_info tcp_seq_info = {
.seq_ops = &bpf_iter_tcp_seq_ops,
.init_seq_private = bpf_iter_init_tcp,
.fini_seq_private = bpf_iter_fini_tcp,
- .seq_priv_size = sizeof(struct tcp_iter_state),
+ .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
};
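+/* Extra helpers available to an iter/tcp bpf prog: it may call
+ * bpf_setsockopt() and bpf_getsockopt() on the sockets it is shown.
+ */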
+static const struct bpf_func_proto *
+bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_sk_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_sk_getsockopt_proto;
+ default:
+ return NULL;
+ }
+}
+
static struct bpf_iter_reg tcp_reg_info = {
.target = "tcp",
.ctx_arg_info_size = 1,
@@ -3043,6 +3279,7 @@ static struct bpf_iter_reg tcp_reg_info = {
{ offsetof(struct bpf_iter__tcp, sk_common),
PTR_TO_BTF_ID_OR_NULL },
},
+ .get_func_proto = bpf_iter_tcp_get_func_proto,
.seq_info = &tcp_seq_info,
};
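The get_func_proto hook registered above is what allows an iter/tcp bpf program to call bpf_setsockopt()/bpf_getsockopt() on the sockets that bpf_iter_tcp_seq_show() hands to it while holding the socket lock. As a rough illustration only (this program is not part of the patch; the program name and the open-coded SOL_TCP/TCP_CONGESTION constants are merely an assumed setup), a minimal iter/tcp program using the new capability could look like this, assuming vmlinux.h (or equivalent kernel type definitions) and libbpf's bpf_helpers.h:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	#define SOL_TCP		6
	#define TCP_CONGESTION	13

	char _license[] SEC("license") = "GPL";

	SEC("iter/tcp")
	int set_cubic(struct bpf_iter__tcp *ctx)
	{
		struct sock_common *skc = ctx->sk_common;
		char cc[] = "cubic";
		struct tcp_sock *tp;

		if (!skc)
			return 0;

		/* NULL for request/timewait sockets; setsockopt is only
		 * meaningful (and only permitted) on full sockets, which
		 * bpf_iter_tcp_seq_show() has locked for us.
		 */
		tp = bpf_skc_to_tcp_sock(skc);
		if (!tp)
			return 0;

		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
		return 0;
	}

Reading a pinned instance of such an iterator (e.g. one created with "bpftool iter pin") walks the listening and established hash tables with the batching above and runs the program once per matched socket.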
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 29553fce8502..6d72f3ea48c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3373,7 +3373,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_charge_skmem(sk->sk_memcg, amt);
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
}
/* Send a FIN. The caller locks the socket for us.
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 6f1b4ac7fe99..fd113f6226ef 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -172,7 +172,8 @@ void tcp_rack_reo_timeout(struct sock *sk)
/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
*
- * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
  * by srtt), since there is a possibility that the spurious retransmission was
* due to reordering delay longer than reo_wnd.
*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1a742b710e54..8851c9463b4b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
- if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
+ if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
(struct sockaddr *)usin, &ipc.addr);
if (err)
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 9f5a5cdc38e6..7a1d5f473878 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -112,7 +112,6 @@ static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
{
*prot = *base;
- prot->unhash = sock_map_unhash;
prot->close = sock_map_close;
prot->recvmsg = udp_bpf_recvmsg;
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 1380a6b6f4ff..86d32a1e62ac 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -152,8 +152,8 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
bool is_ipv6)
{
+ const struct net_offload __rcu **offloads;
__be16 protocol = skb->protocol;
- const struct net_offload **offloads;
const struct net_offload *ops;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,