summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig14
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/fib_trie.c11
-rw-r--r--net/ipv4/fou.c2
-rw-r--r--net/ipv4/inet_diag.c86
-rw-r--r--net/ipv4/inet_fragment.c10
-rw-r--r--net/ipv4/ip_fragment.c2
-rw-r--r--net/ipv4/ip_gre.c8
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ip_tunnel.c8
-rw-r--r--net/ipv4/ip_tunnel_core.c26
-rw-r--r--net/ipv4/ip_vti.c3
-rw-r--r--net/ipv4/ipconfig.c66
-rw-r--r--net/ipv4/ipip.c4
-rw-r--r--net/ipv4/ipmr.c741
-rw-r--r--net/ipv4/netfilter/arp_tables.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c22
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c2
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c2
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c54
-rw-r--r--net/ipv4/tcp.c65
-rw-r--r--net/ipv4/tcp_diag.c19
-rw-r--r--net/ipv4/tcp_input.c17
-rw-r--r--net/ipv4/tcp_ipv4.c50
-rw-r--r--net/ipv4/tcp_memcontrol.c233
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c21
-rw-r--r--net/ipv4/tcp_timer.c3
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c161
-rw-r--r--net/ipv4/udp_diag.c4
-rw-r--r--net/ipv4/udp_offload.c15
-rw-r--r--net/ipv4/udp_tunnel.c11
-rw-r--r--net/ipv4/xfrm4_policy.c46
42 files changed, 897 insertions, 857 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 416dfa004cfb..775824720b6b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -353,6 +353,7 @@ config INET_ESP
select CRYPTO_CBC
select CRYPTO_SHA1
select CRYPTO_DES
+ select CRYPTO_ECHAINIV
---help---
Support for IPsec ESP.
@@ -436,6 +437,19 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_DIAG_DESTROY
+ bool "INET: allow privileged process to administratively close sockets"
+ depends on INET_DIAG
+ default n
+ ---help---
+ Provides a SOCK_DESTROY operation that allows privileged processes
+ (e.g., a connection manager or a network administration tool such as
+ ss) to close sockets opened by other processes. Closing a socket in
+ this way interrupts any blocking read/write/connect operations on
+ the socket and causes future socket calls to behave as if the socket
+ had been disconnected.
+ If unsure, say N.
+
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c29809f765dc..62c049b647e9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 744e5936c10d..d07fc076bea0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->tn_bits <= TNODE_KMALLOC_MAX)
- kfree(n);
else
- vfree(n);
+ kvfree(n);
}
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
@@ -1396,9 +1394,10 @@ found:
struct fib_info *fi = fa->fa_info;
int nhsel, err;
- if ((index >= (1ul << fa->fa_slen)) &&
- ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
- continue;
+ if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
+ if (index >= (1ul << fa->fa_slen))
+ continue;
+ }
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
if (fi->fib_dead)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index bd903fe0f750..976f0dcf6991 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -498,7 +498,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk->sk_allocation = GFP_ATOMIC;
if (cfg->udp_config.family == AF_INET) {
- err = udp_add_offload(&fou->udp_offloads);
+ err = udp_add_offload(net, &fou->udp_offloads);
if (err)
goto error;
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ab9f8a66615d..6029157a19ed 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -350,40 +350,60 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
nlmsg_flags, unlh);
}
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
- const struct inet_diag_req_v2 *req)
+struct sock *inet_diag_find_one_icsk(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const struct inet_diag_req_v2 *req)
{
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
struct sock *sk;
- int err;
- err = -EINVAL;
if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6)
- sk = inet6_lookup(net, hashinfo,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
+ else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+ }
#endif
else
- goto out_nosk;
+ return ERR_PTR(-EINVAL);
- err = -ENOENT;
if (!sk)
- goto out_nosk;
+ return ERR_PTR(-ENOENT);
- err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
- if (err)
- goto out;
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
+
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = inet_diag_find_one_icsk(net, hashinfo, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
if (!rep) {
@@ -409,12 +429,11 @@ out:
if (sk)
sock_gen_put(sk);
-out_nosk:
return err;
}
EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-static int inet_diag_get_exact(struct sk_buff *in_skb,
+static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req)
{
@@ -424,8 +443,12 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
handler = inet_diag_lock_handler(req->sdiag_protocol);
if (IS_ERR(handler))
err = PTR_ERR(handler);
- else
+ else if (cmd == SOCK_DIAG_BY_FAMILY)
err = handler->dump_one(in_skb, nlh, req);
+ else if (cmd == SOCK_DESTROY && handler->destroy)
+ err = handler->destroy(in_skb, req);
+ else
+ err = -EOPNOTSUPP;
inet_diag_unlock_handler(handler);
return err;
@@ -938,7 +961,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- return inet_diag_get_exact(in_skb, nlh, &req);
+ return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
}
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -972,7 +995,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return inet_diag_get_exact_compat(skb, nlh);
}
-static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct inet_diag_req_v2);
struct net *net = sock_net(skb->sk);
@@ -980,7 +1003,8 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
- if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
@@ -999,7 +1023,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
}
- return inet_diag_get_exact(skb, h, nlmsg_data(h));
+ return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
}
static
@@ -1050,14 +1074,16 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
};
int inet_diag_register(const struct inet_diag_handler *h)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index fe144dae7372..3a88b0c73797 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -285,14 +285,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frag_kill);
-static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb)
-{
- if (f->skb_free)
- f->skb_free(skb);
- kfree_skb(skb);
-}
-
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
struct sk_buff *fp;
@@ -309,7 +301,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
- frag_kfree_skb(nf, f, fp);
+ kfree_skb(fp);
fp = xp;
}
sum = sum_truesize + f->qsize;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1fe55ae81781..187c6fcc3027 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -661,6 +661,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
struct ipq *qp;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ skb_orphan(skb);
/* Lookup (or create) queue header */
qp = ip_find(net, ip_hdr(skb), user, vif);
@@ -891,7 +892,6 @@ void __init ipfrag_init(void)
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free;
- ip4_frags.skb_free = NULL;
ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 614521437e30..7c51c4e1661f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -24,7 +24,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
@@ -562,10 +561,9 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
tunnel_id_to_key(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
- key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+
+ iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
return;
err_free_rt:
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b1209b63381f..d77eb0c3b684 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -316,7 +316,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+ if (sysctl_ip_early_demux &&
+ !skb_dst(skb) &&
+ !skb->sk &&
+ !ip_is_fragment(iph)) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4233cbe47052..64878efa045c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -76,7 +76,6 @@
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
@@ -240,6 +239,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* from host network stack.
*/
features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -912,7 +912,7 @@ static int __ip_append_data(struct sock *sk,
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
- rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
!(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -921,7 +921,7 @@ static int __ip_append_data(struct sock *sk,
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index cbb51f3fac06..c7bd72e9b544 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -30,7 +30,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -657,7 +656,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
- int err;
bool connected;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
@@ -795,10 +793,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
return;
}
- err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
- tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
-
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
return;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6cb9009c3d96..859d415c0b2d 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -24,7 +24,6 @@
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -48,12 +47,13 @@
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
-int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 proto,
- __u8 tos, __u8 ttl, __be16 df, bool xnet)
+void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
+ struct net_device *dev = skb->dev;
struct iphdr *iph;
int err;
@@ -82,7 +82,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
- return pkt_len;
+ iptunnel_xmit_stats(dev, pkt_len);
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
@@ -251,7 +251,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
if (tb[LWTUNNEL_IP_ID])
- tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]);
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
if (tb[LWTUNNEL_IP_DST])
tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
@@ -266,7 +266,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]);
+ tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
tun_info->mode = IP_TUNNEL_INFO_TX;
tun_info->options_len = 0;
@@ -281,12 +281,12 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
return -ENOMEM;
return 0;
@@ -346,7 +346,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
if (tb[LWTUNNEL_IP6_ID])
- tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]);
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
if (tb[LWTUNNEL_IP6_DST])
tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
@@ -361,7 +361,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
+ tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
tun_info->options_len = 0;
@@ -376,12 +376,12 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) ||
- nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
return -ENOMEM;
return 0;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 4d8f0b698777..5cf10b777b7e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -30,7 +30,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -200,7 +199,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ iptunnel_xmit_stats(dev, err);
return NETDEV_TX_OK;
tx_error_icmp:
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 0bc7412d9e14..2ed9dd2b5f2f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -65,15 +65,6 @@
#include <net/checksum.h>
#include <asm/processor.h>
-/* Define this to allow debugging output */
-#undef IPCONFIG_DEBUG
-
-#ifdef IPCONFIG_DEBUG
-#define DBG(x) printk x
-#else
-#define DBG(x) do { } while(0)
-#endif
-
#if defined(CONFIG_IP_PNP_DHCP)
#define IPCONFIG_DHCP
#endif
@@ -152,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata;
/* Persistent data: */
+#ifdef IPCONFIG_DYNAMIC
static int ic_proto_used; /* Protocol used, if any */
+#else
+#define ic_proto_used 0
+#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
@@ -227,7 +222,7 @@ static int __init ic_open_devs(void)
if (dev->mtu >= 364)
able |= IC_BOOTP;
else
- pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small\n",
dev->name, dev->mtu);
if (!(dev->flags & IFF_NOARP))
able |= IC_RARP;
@@ -254,8 +249,8 @@ static int __init ic_open_devs(void)
else
d->xid = 0;
ic_proto_have_if |= able;
- DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
- dev->name, able, d->xid));
+ pr_debug("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid);
}
}
@@ -311,7 +306,7 @@ static void __init ic_close_devs(void)
next = d->next;
dev = d->dev;
if (dev != ic_dev && !netdev_uses_dsa(dev)) {
- DBG(("IP-Config: Downing %s\n", dev->name));
+ pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags);
}
kfree(d);
@@ -464,7 +459,8 @@ static int __init ic_defaults(void)
&ic_myaddr);
return -1;
}
- printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+ pr_notice("IP-Config: Guessing netmask %pI4\n",
+ &ic_netmask);
}
return 0;
@@ -675,9 +671,7 @@ ic_dhcp_init_options(u8 *options)
u8 *e = options;
int len;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Sending message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Sending message type %d\n", mt);
memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
e += 4;
@@ -847,7 +841,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
- printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ pr_warn("Unknown ARP type 0x%04x for device %s\n", dev->type,
+ dev->name);
b->htype = dev->type; /* can cause undefined behavior */
}
@@ -904,14 +899,12 @@ static void __init ic_do_bootp_ext(u8 *ext)
int i;
__be16 mtu;
-#ifdef IPCONFIG_DEBUG
u8 *c;
- printk("DHCP/BOOTP: Got extension %d:",*ext);
+ pr_debug("DHCP/BOOTP: Got extension %d:", *ext);
for (c=ext+2; c<ext+2+ext[1]; c++)
- printk(" %02x", *c);
- printk("\n");
-#endif
+ pr_debug(" %02x", *c);
+ pr_debug("\n");
switch (*ext++) {
case 1: /* Subnet mask */
@@ -1080,9 +1073,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
}
}
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Got message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Got message type %d\n", mt);
switch (mt) {
case DHCPOFFER:
@@ -1095,10 +1086,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Let's accept that offer. */
ic_myaddr = b->your_ip;
ic_servaddr = server_id;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Offered address %pI4 by server %pI4\n",
- &ic_myaddr, &b->iph.saddr);
-#endif
+ pr_debug("DHCP: Offered address %pI4 by server %pI4\n",
+ &ic_myaddr, &b->iph.saddr);
/* The DHCP indicated server address takes
* precedence over the bootp header one if
* they are different.
@@ -1295,11 +1284,10 @@ static int __init ic_dynamic(void)
return -1;
}
- printk("IP-Config: Got %s answer from %pI4, ",
+ pr_info("IP-Config: Got %s answer from %pI4, my address is %pI4\n",
((ic_got_reply & IC_RARP) ? "RARP"
- : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- &ic_addrservaddr);
- pr_cont("my address is %pI4\n", &ic_myaddr);
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ &ic_addrservaddr, &ic_myaddr);
return 0;
}
@@ -1426,7 +1414,7 @@ static int __init ip_auto_config(void)
if (!ic_enable)
return 0;
- DBG(("IP-Config: Entered.\n"));
+ pr_debug("IP-Config: Entered.\n");
#ifdef IPCONFIG_DYNAMIC
try_try_again:
#endif
@@ -1542,7 +1530,7 @@ static int __init ip_auto_config(void)
pr_cont(", mtu=%d", ic_dev_mtu);
for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
if (ic_nameservers[i] != NONE) {
- pr_info(" nameserver%u=%pI4",
+ pr_cont(" nameserver%u=%pI4",
i, &ic_nameservers[i]);
break;
}
@@ -1585,7 +1573,7 @@ static int __init ic_proto_name(char *name)
return 1;
*v = 0;
if (kstrtou8(client_id, 0, dhcp_client_identifier))
- DBG("DHCP: Invalid client identifier type\n");
+ pr_debug("DHCP: Invalid client identifier type\n");
strncpy(dhcp_client_identifier + 1, v + 1, 251);
*v = ',';
}
@@ -1644,7 +1632,7 @@ static int __init ip_auto_config_setup(char *addrs)
if ((cp = strchr(ip, ':')))
*cp++ = '\0';
if (strlen(ip) > 0) {
- DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ pr_debug("IP-Config: Parameter #%d: `%s'\n", num, ip);
switch (num) {
case 0:
if ((ic_myaddr = in_aton(ip)) == ANY)
@@ -1716,7 +1704,7 @@ static int __init vendor_class_identifier_setup(char *addrs)
if (strlcpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
- pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
vendor_class_identifier);
return 1;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index f34c31defafe..4044da61e747 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -103,7 +103,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -253,9 +252,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
p.i_key = p.o_key = 0;
p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c3a38353f5dc..395e2814a46d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -66,28 +66,7 @@
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
-
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
-#define CONFIG_IP_PIMSM 1
-#endif
-
-struct mr_table {
- struct list_head list;
- possible_net_t net;
- u32 id;
- struct sock __rcu *mroute_sk;
- struct timer_list ipmr_expire_timer;
- struct list_head mfc_unres_queue;
- struct list_head mfc_cache_array[MFC_LINES];
- struct vif_device vif_table[MAXVIFS];
- int maxvif;
- atomic_t cache_resolve_queue_len;
- bool mroute_do_assert;
- bool mroute_do_pim;
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
- int mroute_reg_vif_num;
-#endif
-};
+#include <net/nexthop.h>
struct ipmr_rule {
struct fib_rule common;
@@ -103,11 +82,7 @@ struct ipmr_result {
static DEFINE_RWLOCK(mrt_lock);
-/*
- * Multicast router control variables
- */
-
-#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
+/* Multicast router control variables */
/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -252,8 +227,8 @@ static int __net_init ipmr_rules_init(struct net *net)
INIT_LIST_HEAD(&net->ipv4.mr_tables);
mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
- if (!mrt) {
- err = -ENOMEM;
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
goto err1;
}
@@ -301,8 +276,13 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
static int __net_init ipmr_rules_init(struct net *net)
{
- net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
- return net->ipv4.mrt ? 0 : -ENOMEM;
+ struct mr_table *mrt;
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv4.mrt = mrt;
+ return 0;
}
static void __net_exit ipmr_rules_exit(struct net *net)
@@ -319,13 +299,17 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
struct mr_table *mrt;
unsigned int i;
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (id != RT_TABLE_DEFAULT && id >= 1000000000)
+ return ERR_PTR(-EINVAL);
+
mrt = ipmr_get_table(net, id);
if (mrt)
return mrt;
mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
if (!mrt)
- return NULL;
+ return ERR_PTR(-ENOMEM);
write_pnet(&mrt->net, net);
mrt->id = id;
@@ -338,9 +322,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
(unsigned long)mrt);
-#ifdef CONFIG_IP_PIMSM
mrt->mroute_reg_vif_num = -1;
-#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
@@ -387,8 +369,24 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
}
}
-static
-struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+/* Initialize ipmr pimreg/tunnel in_device */
+static bool ipmr_init_vif_indev(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return false;
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+ return true;
+}
+
+static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
struct net_device *dev;
@@ -399,7 +397,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
int err;
struct ifreq ifr;
struct ip_tunnel_parm p;
- struct in_device *in_dev;
memset(&p, 0, sizeof(p));
p.iph.daddr = v->vifc_rmt_addr.s_addr;
@@ -424,15 +421,8 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
if (err == 0 &&
(dev = __dev_get_by_name(net, p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
-
- in_dev = __in_dev_get_rtnl(dev);
- if (!in_dev)
+ if (!ipmr_init_vif_indev(dev))
goto failure;
-
- ipv4_devconf_setall(in_dev);
- neigh_parms_data_state_setall(in_dev->arp_parms);
- IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
-
if (dev_open(dev))
goto failure;
dev_hold(dev);
@@ -445,8 +435,7 @@ failure:
return NULL;
}
-#ifdef CONFIG_IP_PIMSM
-
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct net *net = dev_net(dev);
@@ -496,7 +485,6 @@ static void reg_vif_setup(struct net_device *dev)
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
struct net_device *dev;
- struct in_device *in_dev;
char name[IFNAMSIZ];
if (mrt->id == RT_TABLE_DEFAULT)
@@ -516,18 +504,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return NULL;
}
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev) {
- rcu_read_unlock();
+ if (!ipmr_init_vif_indev(dev))
goto failure;
- }
-
- ipv4_devconf_setall(in_dev);
- neigh_parms_data_state_setall(in_dev->arp_parms);
- IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
- rcu_read_unlock();
-
if (dev_open(dev))
goto failure;
@@ -539,13 +517,56 @@ failure:
unregister_netdevice(dev);
return NULL;
}
+
+/* called with rcu_read_lock() */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
+{
+ struct net_device *reg_dev = NULL;
+ struct iphdr *encap;
+
+ encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+ /* Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
+ */
+ if (!ipv4_is_multicast(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + pimlen > skb->len)
+ return 1;
+
+ read_lock(&mrt_lock);
+ if (mrt->mroute_reg_vif_num >= 0)
+ reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+ read_unlock(&mrt_lock);
+
+ if (!reg_dev)
+ return 1;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return NET_RX_SUCCESS;
+}
+#else
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ return NULL;
+}
#endif
/**
* vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
*/
-
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
{
@@ -567,10 +588,8 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
return -EADDRNOTAVAIL;
}
-#ifdef CONFIG_IP_PIMSM
if (vifi == mrt->mroute_reg_vif_num)
mrt->mroute_reg_vif_num = -1;
-#endif
if (vifi + 1 == mrt->maxvif) {
int tmp;
@@ -617,7 +636,6 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
/* Destroy an unresolved cache entry, killing queued skbs
* and reporting error to netlink readers.
*/
-
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
struct net *net = read_pnet(&mrt->net);
@@ -645,9 +663,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
ipmr_cache_free(c);
}
-
/* Timer process for the unresolved queue. */
-
static void ipmr_expire_process(unsigned long arg)
{
struct mr_table *mrt = (struct mr_table *)arg;
@@ -687,7 +703,6 @@ out:
}
/* Fill oifs list. It is called under write locked mrt_lock. */
-
static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
unsigned char *ttls)
{
@@ -723,10 +738,10 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EADDRINUSE;
switch (vifc->vifc_flags) {
-#ifdef CONFIG_IP_PIMSM
case VIFF_REGISTER:
- /*
- * Special Purpose VIF in PIM
+ if (!ipmr_pimsm_enabled())
+ return -EINVAL;
+ /* Special Purpose VIF in PIM
* All the packets will be sent to the daemon
*/
if (mrt->mroute_reg_vif_num >= 0)
@@ -741,7 +756,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return err;
}
break;
-#endif
case VIFF_TUNNEL:
dev = ipmr_new_tunnel(net, vifc);
if (!dev)
@@ -753,7 +767,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return err;
}
break;
-
case VIFF_USE_IFINDEX:
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
@@ -807,10 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
-#ifdef CONFIG_IP_PIMSM
if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
-#endif
if (vifi+1 > mrt->maxvif)
mrt->maxvif = vifi+1;
write_unlock_bh(&mrt_lock);
@@ -875,9 +886,7 @@ skip:
return ipmr_cache_find_any_parent(mrt, vifi);
}
-/*
- * Allocate a multicast cache entry
- */
+/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
@@ -898,10 +907,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
return c;
}
-/*
- * A cache entry has gone into a resolved state from queued
- */
-
+/* A cache entry has gone into a resolved state from queued */
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct mfc_cache *uc, struct mfc_cache *c)
{
@@ -909,7 +915,6 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct nlmsgerr *e;
/* Play the pending entries through our router */
-
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
@@ -933,34 +938,29 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
}
}
-/*
- * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
- * expects the following bizarre scheme.
+/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ * expects the following bizarre scheme.
*
- * Called under mrt_lock.
+ * Called under mrt_lock.
*/
-
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
- struct sk_buff *skb;
const int ihl = ip_hdrlen(pkt);
+ struct sock *mroute_sk;
struct igmphdr *igmp;
struct igmpmsg *msg;
- struct sock *mroute_sk;
+ struct sk_buff *skb;
int ret;
-#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
-#endif
skb = alloc_skb(128, GFP_ATOMIC);
if (!skb)
return -ENOBUFS;
-#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
@@ -978,28 +978,23 @@ static int ipmr_cache_report(struct mr_table *mrt,
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
- } else
-#endif
- {
-
- /* Copy the IP header */
-
- skb_set_network_header(skb, skb->len);
- skb_put(skb, ihl);
- skb_copy_to_linear_data(skb, pkt->data, ihl);
- ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
- msg = (struct igmpmsg *)skb_network_header(skb);
- msg->im_vif = vifi;
- skb_dst_set(skb, dst_clone(skb_dst(pkt)));
-
- /* Add our header */
-
- igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
- igmp->type =
- msg->im_msgtype = assert;
- igmp->code = 0;
- ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
- skb->transport_header = skb->network_header;
+ } else {
+ /* Copy the IP header */
+ skb_set_network_header(skb, skb->len);
+ skb_put(skb, ihl);
+ skb_copy_to_linear_data(skb, pkt->data, ihl);
+ /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0;
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ msg->im_vif = vifi;
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ /* Add our header */
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp->type = assert;
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ skb->transport_header = skb->network_header;
}
rcu_read_lock();
@@ -1011,7 +1006,6 @@ static int ipmr_cache_report(struct mr_table *mrt,
}
/* Deliver to mrouted */
-
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
if (ret < 0) {
@@ -1022,12 +1016,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
return ret;
}
-/*
- * Queue a packet for resolution. It gets locked cache entry!
- */
-
-static int
-ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
+/* Queue a packet for resolution. It gets locked cache entry! */
+static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
+ struct sk_buff *skb)
{
bool found = false;
int err;
@@ -1045,7 +1036,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
if (!found) {
/* Create a new entry if allowable */
-
if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
(c = ipmr_cache_alloc_unres()) == NULL) {
spin_unlock_bh(&mfc_unres_lock);
@@ -1055,13 +1045,11 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
/* Fill in the new cache entry */
-
c->mfc_parent = -1;
c->mfc_origin = iph->saddr;
c->mfc_mcastgrp = iph->daddr;
/* Reflect first query at mrouted. */
-
err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
@@ -1083,7 +1071,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
/* See if we can append the packet */
-
if (c->mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
@@ -1096,9 +1083,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
return err;
}
-/*
- * MFC cache manipulation by user space mroute daemon
- */
+/* MFC cache manipulation by user space mroute daemon */
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
@@ -1169,9 +1154,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
- /*
- * Check to see if we resolved a queued list. If so we
- * need to send on the frames and tidy up.
+ /* Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
*/
found = false;
spin_lock_bh(&mfc_unres_lock);
@@ -1196,10 +1180,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
return 0;
}
-/*
- * Close the multicast socket, and clear the vif tables etc
- */
-
+/* Close the multicast socket, and clear the vif tables etc */
static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
int i;
@@ -1207,7 +1188,6 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
struct mfc_cache *c, *next;
/* Shut down all active vif entries */
-
for (i = 0; i < mrt->maxvif; i++) {
if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
continue;
@@ -1216,7 +1196,6 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
unregister_netdevice_many(&list);
/* Wipe the cache */
-
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
if (!all && (c->mfc_flags & MFC_STATIC))
@@ -1260,44 +1239,52 @@ static void mrtsock_destruct(struct sock *sk)
rtnl_unlock();
}
-/*
- * Socket options and virtual interface manipulation. The whole
- * virtual interface system is a complete heap, but unfortunately
- * that's how BSD mrouted happens to think. Maybe one day with a proper
- * MOSPF/PIM router set up we can clean this up.
+/* Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
*/
-int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
+ unsigned int optlen)
{
- int ret, parent = 0;
- struct vifctl vif;
- struct mfcctl mfc;
struct net *net = sock_net(sk);
+ int val, ret = 0, parent = 0;
struct mr_table *mrt;
+ struct vifctl vif;
+ struct mfcctl mfc;
+ u32 uval;
+ /* There's one exception to the lock - MRT_DONE which needs to unlock */
+ rtnl_lock();
if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_IGMP)
- return -EOPNOTSUPP;
+ inet_sk(sk)->inet_num != IPPROTO_IGMP) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (!mrt)
- return -ENOENT;
-
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
if (optname != MRT_INIT) {
if (sk != rcu_access_pointer(mrt->mroute_sk) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EACCES;
+ !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EACCES;
+ goto out_unlock;
+ }
}
switch (optname) {
case MRT_INIT:
- if (optlen != sizeof(int))
- return -EINVAL;
-
- rtnl_lock();
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
if (rtnl_dereference(mrt->mroute_sk)) {
- rtnl_unlock();
- return -EADDRINUSE;
+ ret = -EADDRINUSE;
+ break;
}
ret = ip_ra_control(sk, 1, mrtsock_destruct);
@@ -1308,129 +1295,133 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
}
- rtnl_unlock();
- return ret;
+ break;
case MRT_DONE:
- if (sk != rcu_access_pointer(mrt->mroute_sk))
- return -EACCES;
- return ip_ra_control(sk, 0, NULL);
+ if (sk != rcu_access_pointer(mrt->mroute_sk)) {
+ ret = -EACCES;
+ } else {
+ /* We need to unlock here because mrtsock_destruct takes
+ * care of rtnl itself and we can't change that due to
+ * the IP_ROUTER_ALERT setsockopt which runs without it.
+ */
+ rtnl_unlock();
+ ret = ip_ra_control(sk, 0, NULL);
+ goto out;
+ }
+ break;
case MRT_ADD_VIF:
case MRT_DEL_VIF:
- if (optlen != sizeof(vif))
- return -EINVAL;
- if (copy_from_user(&vif, optval, sizeof(vif)))
- return -EFAULT;
- if (vif.vifc_vifi >= MAXVIFS)
- return -ENFILE;
- rtnl_lock();
+ if (optlen != sizeof(vif)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&vif, optval, sizeof(vif))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (vif.vifc_vifi >= MAXVIFS) {
+ ret = -ENFILE;
+ break;
+ }
if (optname == MRT_ADD_VIF) {
ret = vif_add(net, mrt, &vif,
sk == rtnl_dereference(mrt->mroute_sk));
} else {
ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
- rtnl_unlock();
- return ret;
-
- /*
- * Manipulate the forwarding caches. These live
- * in a sort of kernel/user symbiosis.
- */
+ break;
+ /* Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
- if (optlen != sizeof(mfc))
- return -EINVAL;
- if (copy_from_user(&mfc, optval, sizeof(mfc)))
- return -EFAULT;
+ if (optlen != sizeof(mfc)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&mfc, optval, sizeof(mfc))) {
+ ret = -EFAULT;
+ break;
+ }
if (parent == 0)
parent = mfc.mfcc_parent;
- rtnl_lock();
if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
ret = ipmr_mfc_delete(mrt, &mfc, parent);
else
ret = ipmr_mfc_add(net, mrt, &mfc,
sk == rtnl_dereference(mrt->mroute_sk),
parent);
- rtnl_unlock();
- return ret;
- /*
- * Control PIM assert.
- */
+ break;
+ /* Control PIM assert. */
case MRT_ASSERT:
- {
- int v;
- if (optlen != sizeof(v))
- return -EINVAL;
- if (get_user(v, (int __user *)optval))
- return -EFAULT;
- mrt->mroute_do_assert = v;
- return 0;
- }
-#ifdef CONFIG_IP_PIMSM
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(val, (int __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
+ mrt->mroute_do_assert = val;
+ break;
case MRT_PIM:
- {
- int v;
-
- if (optlen != sizeof(v))
- return -EINVAL;
- if (get_user(v, (int __user *)optval))
- return -EFAULT;
- v = !!v;
+ if (!ipmr_pimsm_enabled()) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(val, (int __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
- rtnl_lock();
- ret = 0;
- if (v != mrt->mroute_do_pim) {
- mrt->mroute_do_pim = v;
- mrt->mroute_do_assert = v;
+ val = !!val;
+ if (val != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = val;
+ mrt->mroute_do_assert = val;
}
- rtnl_unlock();
- return ret;
- }
-#endif
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ break;
case MRT_TABLE:
- {
- u32 v;
-
- if (optlen != sizeof(u32))
- return -EINVAL;
- if (get_user(v, (u32 __user *)optval))
- return -EFAULT;
-
- /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
- if (v != RT_TABLE_DEFAULT && v >= 1000000000)
- return -EINVAL;
+ if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(uval)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(uval, (u32 __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
- rtnl_lock();
- ret = 0;
if (sk == rtnl_dereference(mrt->mroute_sk)) {
ret = -EBUSY;
} else {
- if (!ipmr_new_table(net, v))
- ret = -ENOMEM;
+ mrt = ipmr_new_table(net, uval);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
else
- raw_sk(sk)->ipmr_table = v;
+ raw_sk(sk)->ipmr_table = uval;
}
- rtnl_unlock();
- return ret;
- }
-#endif
- /*
- * Spurious command, or MRT_VERSION which you cannot
- * set.
- */
+ break;
+ /* Spurious command, or MRT_VERSION which you cannot set. */
default:
- return -ENOPROTOOPT;
+ ret = -ENOPROTOOPT;
}
+out_unlock:
+ rtnl_unlock();
+out:
+ return ret;
}
-/*
- * Getsock opt support for the multicast routing system.
- */
-
+/* Getsock opt support for the multicast routing system. */
int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
int olr;
@@ -1446,39 +1437,35 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
if (!mrt)
return -ENOENT;
- if (optname != MRT_VERSION &&
-#ifdef CONFIG_IP_PIMSM
- optname != MRT_PIM &&
-#endif
- optname != MRT_ASSERT)
+ switch (optname) {
+ case MRT_VERSION:
+ val = 0x0305;
+ break;
+ case MRT_PIM:
+ if (!ipmr_pimsm_enabled())
+ return -ENOPROTOOPT;
+ val = mrt->mroute_do_pim;
+ break;
+ case MRT_ASSERT:
+ val = mrt->mroute_do_assert;
+ break;
+ default:
return -ENOPROTOOPT;
+ }
if (get_user(olr, optlen))
return -EFAULT;
-
olr = min_t(unsigned int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
-
if (put_user(olr, optlen))
return -EFAULT;
- if (optname == MRT_VERSION)
- val = 0x0305;
-#ifdef CONFIG_IP_PIMSM
- else if (optname == MRT_PIM)
- val = mrt->mroute_do_pim;
-#endif
- else
- val = mrt->mroute_do_assert;
if (copy_to_user(optval, &val, olr))
return -EFAULT;
return 0;
}
-/*
- * The IP multicast ioctl support routines.
- */
-
+/* The IP multicast ioctl support routines. */
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
struct sioc_sg_req sr;
@@ -1611,7 +1598,6 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
}
#endif
-
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -1633,17 +1619,14 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
return NOTIFY_DONE;
}
-
static struct notifier_block ip_mr_notifier = {
.notifier_call = ipmr_device_event,
};
-/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
- * This avoids tunnel drivers and other mess and gives us the speed so
- * important for multicast video.
+/* Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
*/
-
static void ip_encap(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr)
{
@@ -1685,9 +1668,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
-/*
- * Processing handlers for ipmr_forward
- */
+/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *c, int vifi)
@@ -1702,7 +1683,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (!vif->dev)
goto out_free;
-#ifdef CONFIG_IP_PIMSM
if (vif->flags & VIFF_REGISTER) {
vif->pkt_out++;
vif->bytes_out += skb->len;
@@ -1711,7 +1691,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
-#endif
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
@@ -1738,7 +1717,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* allow to send ICMP, so that packets will disappear
* to blackhole.
*/
-
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
goto out_free;
@@ -1770,8 +1748,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
IPCB(skb)->flags |= IPSKB_FORWARDED;
- /*
- * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
* interfaces. It is clear, if mrouter runs a multicasting
* program, it should receive packets not depending to what interface
@@ -1802,7 +1779,6 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
}
/* "local" means that we should preserve one skb (for local delivery) */
-
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *cache,
int local)
@@ -1827,9 +1803,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
goto forward;
}
- /*
- * Wrong interface: drop packet and (maybe) send PIM assert.
- */
+ /* Wrong interface: drop packet and (maybe) send PIM assert. */
if (mrt->vif_table[vif].dev != skb->dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
@@ -1868,9 +1842,7 @@ forward:
mrt->vif_table[vif].pkt_in++;
mrt->vif_table[vif].bytes_in += skb->len;
- /*
- * Forward the frame
- */
+ /* Forward the frame */
if (cache->mfc_origin == htonl(INADDR_ANY) &&
cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
if (true_vifi >= 0 &&
@@ -1944,11 +1916,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
return mrt;
}
-/*
- * Multicast packets for forwarding arrive here
- * Called with rcu_read_lock();
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
*/
-
int ip_mr_input(struct sk_buff *skb)
{
struct mfc_cache *cache;
@@ -1999,9 +1969,7 @@ int ip_mr_input(struct sk_buff *skb)
vif);
}
- /*
- * No usable cache entry
- */
+ /* No usable cache entry */
if (!cache) {
int vif;
@@ -2042,53 +2010,8 @@ dont_forward:
return 0;
}
-#ifdef CONFIG_IP_PIMSM
-/* called with rcu_read_lock() */
-static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
- unsigned int pimlen)
-{
- struct net_device *reg_dev = NULL;
- struct iphdr *encap;
-
- encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
- /*
- * Check that:
- * a. packet is really sent to a multicast group
- * b. packet is not a NULL-REGISTER
- * c. packet is not truncated
- */
- if (!ipv4_is_multicast(encap->daddr) ||
- encap->tot_len == 0 ||
- ntohs(encap->tot_len) + pimlen > skb->len)
- return 1;
-
- read_lock(&mrt_lock);
- if (mrt->mroute_reg_vif_num >= 0)
- reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- read_unlock(&mrt_lock);
-
- if (!reg_dev)
- return 1;
-
- skb->mac_header = skb->network_header;
- skb_pull(skb, (u8 *)encap - skb->data);
- skb_reset_network_header(skb);
- skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = CHECKSUM_NONE;
-
- skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
-
- netif_rx(skb);
-
- return NET_RX_SUCCESS;
-}
-#endif
-
#ifdef CONFIG_IP_PIMSM_V1
-/*
- * Handle IGMP messages of PIMv1
- */
-
+/* Handle IGMP messages of PIMv1 */
int pim_rcv_v1(struct sk_buff *skb)
{
struct igmphdr *pim;
@@ -2249,8 +2172,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
}
read_lock(&mrt_lock);
- if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
- cache->mfc_flags |= MFC_NOTIFY;
err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
rcu_read_unlock();
@@ -2412,10 +2333,133 @@ done:
return skb->len;
}
+static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = { .type = NLA_U32 },
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+};
+
+static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
+{
+ switch (rtm_protocol) {
+ case RTPROT_STATIC:
+ case RTPROT_MROUTED:
+ return true;
+ }
+ return false;
+}
+
+static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
+{
+ struct rtnexthop *rtnh = nla_data(nla);
+ int remaining = nla_len(nla), vifi = 0;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
+ if (++vifi == MAXVIFS)
+ break;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return remaining > 0 ? -EINVAL : vifi;
+}
+
+/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
+static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
+ struct mfcctl *mfcc, int *mrtsock,
+ struct mr_table **mrtret)
+{
+ struct net_device *dev = NULL;
+ u32 tblid = RT_TABLE_DEFAULT;
+ struct mr_table *mrt;
+ struct nlattr *attr;
+ struct rtmsg *rtm;
+ int ret, rem;
+
+ ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy);
+ if (ret < 0)
+ goto out;
+ rtm = nlmsg_data(nlh);
+
+ ret = -EINVAL;
+ if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
+ rtm->rtm_type != RTN_MULTICAST ||
+ rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
+ !ipmr_rtm_validate_proto(rtm->rtm_protocol))
+ goto out;
+
+ memset(mfcc, 0, sizeof(*mfcc));
+ mfcc->mfcc_parent = -1;
+ ret = 0;
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
+ switch (nla_type(attr)) {
+ case RTA_SRC:
+ mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_DST:
+ mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_IIF:
+ dev = __dev_get_by_index(net, nla_get_u32(attr));
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ break;
+ case RTA_MULTIPATH:
+ if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case RTA_PREFSRC:
+ ret = 1;
+ break;
+ case RTA_TABLE:
+ tblid = nla_get_u32(attr);
+ break;
+ }
+ }
+ mrt = ipmr_get_table(net, tblid);
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *mrtret = mrt;
+ *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
+ if (dev)
+ mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);
+
+out:
+ return ret;
+}
+
+/* takes care of both newroute and delroute */
+static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ int ret, mrtsock, parent;
+ struct mr_table *tbl;
+ struct mfcctl mfcc;
+
+ mrtsock = 0;
+ tbl = NULL;
+ ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl);
+ if (ret < 0)
+ return ret;
+
+ parent = ret ? mfcc.mfcc_parent : -1;
+ if (nlh->nlmsg_type == RTM_NEWROUTE)
+ return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
+ else
+ return ipmr_mfc_delete(tbl, &mfcc, parent);
+}
+
#ifdef CONFIG_PROC_FS
-/*
- * The /proc interfaces to multicast routing :
- * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+/* The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
@@ -2699,10 +2743,7 @@ static const struct net_protocol pim_protocol = {
};
#endif
-
-/*
- * Setup for IP multicast routing
- */
+/* Setup for IP multicast routing */
static int __net_init ipmr_net_init(struct net *net)
{
int err;
@@ -2752,8 +2793,6 @@ int __init ip_mr_init(void)
sizeof(struct mfc_cache),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
- if (!mrt_cachep)
- return -ENOMEM;
err = register_pernet_subsys(&ipmr_net_ops);
if (err)
@@ -2771,6 +2810,10 @@ int __init ip_mr_init(void)
#endif
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
NULL, ipmr_rtm_dumproute, NULL);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
+ ipmr_rtm_route, NULL, NULL);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
+ ipmr_rtm_route, NULL, NULL);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 11dccba474b7..b488cac9c5ca 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -38,13 +38,13 @@ MODULE_DESCRIPTION("arptables core");
/*#define DEBUG_ARP_TABLES_USER*/
#ifdef DEBUG_ARP_TABLES
-#define dprintf(format, args...) printk(format , ## args)
+#define dprintf(format, args...) pr_debug(format, ## args)
#else
#define dprintf(format, args...)
#endif
#ifdef DEBUG_ARP_TABLES_USER
-#define duprintf(format, args...) printk(format , ## args)
+#define duprintf(format, args...) pr_debug(format, ## args)
#else
#define duprintf(format, args...)
#endif
@@ -1905,7 +1905,7 @@ static int __init arp_tables_init(void)
if (ret < 0)
goto err4;
- printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n");
+ pr_info("arp_tables: (C) 2002 David S. Miller\n");
return 0;
err4:
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 461ca926fd39..e3c46e8e2762 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -451,7 +451,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
- printk(KERN_ERR "Unable to register netfilter socket option\n");
+ pr_err("Unable to register netfilter socket option\n");
return ret;
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 6fb869f646bf..a04dee536b8e 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -27,8 +27,6 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
{
int err;
- skb_orphan(skb);
-
local_bh_disable();
err = ip_defrag(net, skb, user);
local_bh_enable();
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 5075b7ecd26d..61c7cc22ea68 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -132,7 +132,8 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_PARTIAL) {
if (!(rt->rt_flags & RTCF_LOCAL) &&
- (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+ (!skb->dev || skb->dev->features &
+ (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) +
skb_network_offset(skb) +
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ddb894ac1458..c9b52c361da2 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1048,7 +1048,7 @@ static int snmp_parse_mangle(unsigned char *msg,
if (!asn1_uint_decode (&ctx, end, &vers))
return 0;
if (debug > 1)
- printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+ pr_debug("bsalg: snmp version: %u\n", vers + 1);
if (vers > 1)
return 1;
@@ -1064,10 +1064,10 @@ static int snmp_parse_mangle(unsigned char *msg,
if (debug > 1) {
unsigned int i;
- printk(KERN_DEBUG "bsalg: community: ");
+ pr_debug("bsalg: community: ");
for (i = 0; i < comm.len; i++)
- printk("%c", comm.data[i]);
- printk("\n");
+ pr_cont("%c", comm.data[i]);
+ pr_cont("\n");
}
kfree(comm.data);
@@ -1091,9 +1091,9 @@ static int snmp_parse_mangle(unsigned char *msg,
};
if (pdutype > SNMP_PDU_TRAP2)
- printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+ pr_debug("bsalg: bad pdu type %u\n", pdutype);
else
- printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+ pr_debug("bsalg: pdu: %s\n", pdus[pdutype]);
}
if (pdutype != SNMP_PDU_RESPONSE &&
pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
@@ -1119,7 +1119,7 @@ static int snmp_parse_mangle(unsigned char *msg,
return 0;
if (debug > 1)
- printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+ pr_debug("bsalg: request: id=0x%lx error_status=%u "
"error_index=%u\n", req.id, req.error_status,
req.error_index);
}
@@ -1145,13 +1145,13 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (debug > 1) {
- printk(KERN_DEBUG "bsalg: object: ");
+ pr_debug("bsalg: object: ");
for (i = 0; i < obj->id_len; i++) {
if (i > 0)
- printk(".");
- printk("%lu", obj->id[i]);
+ pr_cont(".");
+ pr_cont("%lu", obj->id[i]);
}
- printk(": type=%u\n", obj->type);
+ pr_cont(": type=%u\n", obj->type);
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index c747b2d9eb77..b6ea57ec5e14 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -14,7 +14,6 @@
#include <net/netfilter/ipv4/nf_reject.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-#include <net/netfilter/ipv4/nf_reject.h>
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *_oth, int hook)
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 9d09d4f59545..cd84d4295a20 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -57,7 +57,7 @@ err:
static void nf_tables_arp_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.arp);
+ nft_unregister_afinfo(net, net->nft.arp);
kfree(net->nft.arp);
}
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index ca9dc3c46c4f..e44ba3b12fbb 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -78,7 +78,7 @@ err:
static void nf_tables_ipv4_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.ipv4);
+ nft_unregister_afinfo(net, net->nft.ipv4);
kfree(net->nft.ipv4);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index e89094ab5ddb..c117b21b937d 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1063,6 +1063,7 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
}
void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+ __acquires(ping_table.lock)
{
struct ping_iter_state *state = seq->private;
state->bucket = 0;
@@ -1094,6 +1095,7 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
+ __releases(ping_table.lock)
{
read_unlock_bh(&ping_table.lock);
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 63e5be0abd86..bc35f1842512 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -601,8 +601,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (err < 0)
+ goto done;
+ }
if (!inet->hdrincl) {
rfv.msg = msg;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4cbe9f0a4281..643a86c49020 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack.v64 = 0;
treq->tfo_listener = false;
- ireq->ir_iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a0bd7a55193e..4d367b4139a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,7 +24,6 @@
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
#include <net/ping.h>
-#include <net/tcp_memcontrol.h>
static int zero;
static int one = 1;
@@ -337,27 +336,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_keepalive_time",
- .data = &sysctl_tcp_keepalive_time,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "tcp_keepalive_probes",
- .data = &sysctl_tcp_keepalive_probes,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_keepalive_intvl",
- .data = &sysctl_tcp_keepalive_intvl,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
.procname = "tcp_retries1",
.data = &sysctl_tcp_retries1,
.maxlen = sizeof(int),
@@ -915,6 +893,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "tcp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_mtu_probing",
.data = &init_net.ipv4.sysctl_tcp_mtu_probing,
@@ -950,6 +939,27 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_keepalive_time",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_keepalive_probes",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_intvl",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82cca18c90f..19746b3fcbbe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <asm/unaligned.h>
#include <net/busy_poll.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -422,7 +423,8 @@ void tcp_init_sock(struct sock *sk)
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable();
- sock_update_memcg(sk);
+ if (mem_cgroup_sockets_enabled)
+ sock_update_memcg(sk);
sk_sockets_allocated_inc(sk);
local_bh_enable();
}
@@ -1018,7 +1020,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
ssize_t res;
if (!(sk->sk_route_caps & NETIF_F_SG) ||
- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ !sk_check_csum_caps(sk))
return sock_no_sendpage(sk->sk_socket, page, offset, size,
flags);
@@ -1175,7 +1177,7 @@ new_segment:
/*
* Check whether we can use HW checksum.
*/
- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+ if (sk_check_csum_caps(sk))
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -2637,6 +2639,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
unsigned int start;
+ u64 rate64;
u32 rate;
memset(info, 0, sizeof(*info));
@@ -2702,15 +2705,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
rate = READ_ONCE(sk->sk_pacing_rate);
- info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate);
- info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_max_pacing_rate);
do {
start = u64_stats_fetch_begin_irq(&tp->syncp);
- info->tcpi_bytes_acked = tp->bytes_acked;
- info->tcpi_bytes_received = tp->bytes_received;
+ put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+ put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
@@ -3080,6 +3085,52 @@ void tcp_done(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_done);
+int tcp_abort(struct sock *sk, int err)
+{
+ if (!sk_fullsock(sk)) {
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
+ req);
+ local_bh_enable();
+ return 0;
+ }
+ sock_gen_put(sk);
+ return -EOPNOTSUPP;
+ }
+
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet_csk_listen_stop(sk);
+ }
+
+ /* Don't race with BH socket closes such as inet_csk_listen_stop. */
+ local_bh_disable();
+ bh_lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_err = err;
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ sk->sk_error_report(sk);
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ }
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_abort);
+
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index b31604086edd..4d610934fb39 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -10,6 +10,8 @@
*/
#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/sock_diag.h>
#include <linux/inet_diag.h>
#include <linux/tcp.h>
@@ -46,12 +48,29 @@ static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int tcp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ return sock_diag_destroy(sk, ECONNABORTED);
+}
+#endif
+
static const struct inet_diag_handler tcp_diag_handler = {
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = tcp_diag_destroy,
+#endif
};
static int __init tcp_diag_init(void)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d656eef7f8e..1c2a73406261 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2164,8 +2164,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt, oldcnt;
- int err;
+ int cnt, oldcnt, lost;
unsigned int mss;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
@@ -2205,9 +2204,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
mss = tcp_skb_mss(skb);
- err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
- mss, GFP_ATOMIC);
- if (err < 0)
+ /* If needed, chop off the prefix to mark as lost. */
+ lost = (packets - oldcnt) * mss;
+ if (lost < skb->len &&
+ tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
break;
cnt = packets;
}
@@ -2366,8 +2366,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
tp->snd_ssthresh = tp->prior_ssthresh;
tcp_ecn_withdraw_cwr(tp);
}
- } else {
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->undo_marker = 0;
@@ -2478,6 +2476,9 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
int newly_acked_sacked = prior_unsacked -
(tp->packets_out - tp->sacked_out);
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
+
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
@@ -6204,7 +6205,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init(req, &tmp_opt, skb, sk);
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
- inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
af_ops->init_req(req, sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d8841a2f1569..a4d523709ab3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,7 +73,6 @@
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
-#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
@@ -587,7 +586,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
} rep;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
+ struct tcp_md5sig_key *key = NULL;
const __u8 *hash_location = NULL;
unsigned char newhash[16];
int genhash;
@@ -627,7 +626,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
hash_location = tcp_parse_md5sig_option(th);
- if (!sk && hash_location) {
+ if (sk && sk_fullsock(sk)) {
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ } else if (hash_location) {
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -651,10 +653,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto release_sk1;
- } else {
- key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr,
- AF_INET) : NULL;
}
if (key) {
@@ -675,7 +673,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
- arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+
/* When socket is gone, all binding information is lost.
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
@@ -683,6 +682,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
+ BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
+ offsetof(struct inet_timewait_sock, tw_bound_dev_if));
+
arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -705,7 +707,8 @@ release_sk1:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(struct net *net,
+ struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
@@ -720,7 +723,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb_dst(skb)->dev);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -782,7 +784,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v4_send_ack(sock_net(sk), skb,
+ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@@ -801,8 +804,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
- tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+
+ tcp_v4_send_ack(sock_net(sk), skb, seq,
tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
@@ -1276,6 +1281,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
ireq = inet_rsk(req);
sk_daddr_set(newsk, ireq->ir_rmt_addr);
sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+ newsk->sk_bound_dev_if = ireq->ir_iif;
newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
@@ -1705,7 +1711,9 @@ do_time_wait:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
- goto no_tcp_socket;
+ tcp_v4_send_reset(sk, skb);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
@@ -1812,7 +1820,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
- sock_release_memcg(sk);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -2336,11 +2346,7 @@ struct proto tcp_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_MEMCG_KMEM
- .init_cgroup = tcp_init_cgroup,
- .destroy_cgroup = tcp_destroy_cgroup,
- .proto_cgroup = tcp_proto_cgroup,
-#endif
+ .diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
@@ -2378,6 +2384,10 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+ net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+ net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+
return 0;
fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index 2379c1b4efb2..000000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,233 +0,0 @@
-#include <net/tcp.h>
-#include <net/tcp_memcontrol.h>
-#include <net/sock.h>
-#include <net/ip.h>
-#include <linux/nsproxy.h>
-#include <linux/memcontrol.h>
-#include <linux/module.h>
-
-int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- /*
- * The root cgroup does not use page_counters, but rather,
- * rely on the data already collected by the network
- * subsystem
- */
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct page_counter *counter_parent = NULL;
- struct cg_proto *cg_proto, *parent_cg;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return 0;
-
- cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
- cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
- cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
- cg_proto->memory_pressure = 0;
- cg_proto->memcg = memcg;
-
- parent_cg = tcp_prot.proto_cgroup(parent);
- if (parent_cg)
- counter_parent = &parent_cg->memory_allocated;
-
- page_counter_init(&cg_proto->memory_allocated, counter_parent);
- percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
-
- return 0;
-}
-EXPORT_SYMBOL(tcp_init_cgroup);
-
-void tcp_destroy_cgroup(struct mem_cgroup *memcg)
-{
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return;
-
- percpu_counter_destroy(&cg_proto->sockets_allocated);
-
- if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
- static_key_slow_dec(&memcg_socket_limit_enabled);
-
-}
-EXPORT_SYMBOL(tcp_destroy_cgroup);
-
-static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- struct cg_proto *cg_proto;
- int i;
- int ret;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return -EINVAL;
-
- ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
- if (ret)
- return ret;
-
- for (i = 0; i < 3; i++)
- cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
- sysctl_tcp_mem[i]);
-
- if (nr_pages == PAGE_COUNTER_MAX)
- clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- else {
- /*
- * The active bit needs to be written after the static_key
- * update. This is what guarantees that the socket activation
- * function is the last one to run. See sock_update_memcg() for
- * details, and note that we don't mark any socket as belonging
- * to this memcg until that flag is up.
- *
- * We need to do this, because static_keys will span multiple
- * sites, but we can't control their order. If we mark a socket
- * as accounted, but the accounting functions are not patched in
- * yet, we'll lose accounting.
- *
- * We never race with the readers in sock_update_memcg(),
- * because when this value change, the code to process it is not
- * patched in yet.
- *
- * The activated bit is used to guarantee that no two writers
- * will do the update in the same memcg. Without that, we can't
- * properly shutdown the static key.
- */
- if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
- static_key_slow_inc(&memcg_socket_limit_enabled);
- set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- }
-
- return 0;
-}
-
-enum {
- RES_USAGE,
- RES_LIMIT,
- RES_MAX_USAGE,
- RES_FAILCNT,
-};
-
-static DEFINE_MUTEX(tcp_limit_mutex);
-
-static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
- int ret = 0;
-
- buf = strstrip(buf);
-
- switch (of_cft(of)->private) {
- case RES_LIMIT:
- /* see memcontrol.c */
- ret = page_counter_memparse(buf, "-1", &nr_pages);
- if (ret)
- break;
- mutex_lock(&tcp_limit_mutex);
- ret = tcp_update_limit(memcg, nr_pages);
- mutex_unlock(&tcp_limit_mutex);
- break;
- default:
- ret = -EINVAL;
- break;
- }
- return ret ?: nbytes;
-}
-
-static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
- u64 val;
-
- switch (cft->private) {
- case RES_LIMIT:
- if (!cg_proto)
- return PAGE_COUNTER_MAX;
- val = cg_proto->memory_allocated.limit;
- val *= PAGE_SIZE;
- break;
- case RES_USAGE:
- if (!cg_proto)
- val = atomic_long_read(&tcp_memory_allocated);
- else
- val = page_counter_read(&cg_proto->memory_allocated);
- val *= PAGE_SIZE;
- break;
- case RES_FAILCNT:
- if (!cg_proto)
- return 0;
- val = cg_proto->memory_allocated.failcnt;
- break;
- case RES_MAX_USAGE:
- if (!cg_proto)
- return 0;
- val = cg_proto->memory_allocated.watermark;
- val *= PAGE_SIZE;
- break;
- default:
- BUG();
- }
- return val;
-}
-
-static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg;
- struct cg_proto *cg_proto;
-
- memcg = mem_cgroup_from_css(of_css(of));
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return nbytes;
-
- switch (of_cft(of)->private) {
- case RES_MAX_USAGE:
- page_counter_reset_watermark(&cg_proto->memory_allocated);
- break;
- case RES_FAILCNT:
- cg_proto->memory_allocated.failcnt = 0;
- break;
- }
-
- return nbytes;
-}
-
-static struct cftype tcp_files[] = {
- {
- .name = "kmem.tcp.limit_in_bytes",
- .write = tcp_cgroup_write,
- .read_u64 = tcp_cgroup_read,
- .private = RES_LIMIT,
- },
- {
- .name = "kmem.tcp.usage_in_bytes",
- .read_u64 = tcp_cgroup_read,
- .private = RES_USAGE,
- },
- {
- .name = "kmem.tcp.failcnt",
- .private = RES_FAILCNT,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- {
- .name = "kmem.tcp.max_usage_in_bytes",
- .private = RES_MAX_USAGE,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- { } /* terminate */
-};
-
-static int __init tcp_memcontrol_init(void)
-{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
- return 0;
-}
-__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ac6b1961ffeb..75632a925824 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -131,7 +131,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
goto kill;
if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
- goto kill_with_rst;
+ return TCP_TW_RST;
/* Dup ACK? */
if (!th->ack ||
@@ -145,11 +145,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
-kill_with_rst:
- inet_twsk_deschedule_put(tw);
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
return TCP_TW_RST;
- }
/* FIN arrived, enter true time-wait state. */
tw->tw_substate = TCP_TIME_WAIT;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9bfc39ff2285..fda379cd600d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2296,7 +2296,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
return;
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
- sk_gfp_atomic(sk, GFP_ATOMIC)))
+ sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
@@ -2813,13 +2813,16 @@ begin_fwd:
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
- int amt, status;
+ int amt;
if (size <= sk->sk_forward_alloc)
return;
amt = sk_mem_pages(size);
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- sk_memory_allocated_add(sk, amt, &status);
+ sk_memory_allocated_add(sk, amt);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3353,8 +3356,9 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (!buff) {
+ buff = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+ if (unlikely(!buff)) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3376,7 +3380,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
skb_mstamp_get(&buff->skb_mstamp);
- tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+ tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
}
EXPORT_SYMBOL_GPL(tcp_send_ack);
@@ -3397,7 +3401,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ skb = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (!skb)
return -1;
@@ -3410,7 +3415,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
NET_INC_STATS(sock_net(sk), mib);
- return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+ return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
void tcp_send_window_probe(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 193ba1fa8a9a..a4730a28b220 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -24,9 +24,6 @@
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
-int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
-int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
-int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 17d35662930d..3e6a472e6b88 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return tp->snd_cwnd - reduction;
+ return max_t(int, tp->snd_cwnd - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0c7b0e61b917..be0b21852b13 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -113,6 +113,7 @@
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
+#include <net/sock_reuseport.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -137,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned long *bitmap,
struct sock *sk,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2),
+ const struct sock *sk2,
+ bool match_wildcard),
unsigned int log)
{
struct sock *sk2;
@@ -152,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2)) {
+ saddr_comp(sk, sk2, true)) {
if (!bitmap)
return 1;
__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
@@ -170,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
struct sock *sk,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct sock *sk2;
struct hlist_nulls_node *node;
@@ -186,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2)) {
+ saddr_comp(sk, sk2, true)) {
res = 1;
break;
}
@@ -196,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
return res;
}
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct net *net = sock_net(sk);
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+ struct sock *sk2;
+
+ sk_nulls_for_each(sk2, node, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
+ (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ (*saddr_same)(sk, sk2, false)) {
+ return reuseport_add_sock(sk, sk2);
+ }
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
/**
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
*
@@ -207,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2),
+ const struct sock *sk2,
+ bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -290,6 +325,14 @@ found:
udp_sk(sk)->udp_port_hash = snum;
udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
+ if (sk->sk_reuseport &&
+ udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+ inet_sk(sk)->inet_num = 0;
+ udp_sk(sk)->udp_port_hash = 0;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
+ goto fail_unlock;
+ }
+
sk_nulls_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -309,13 +352,22 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+ bool match_wildcard)
{
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
- return (!ipv6_only_sock(sk2) &&
- (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
- inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+ if (!ipv6_only_sock(sk2)) {
+ if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
+ return 1;
+ if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
}
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
@@ -441,11 +493,13 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2)
+ struct udp_hslot *hslot2, unsigned int slot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
begin:
@@ -461,6 +515,17 @@ begin:
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -478,6 +543,7 @@ begin:
if (get_nulls_value(node) != slot2)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -494,7 +560,7 @@ begin:
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable)
+ int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
@@ -502,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
rcu_read_lock();
@@ -514,7 +581,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
if (!result) {
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
@@ -524,7 +591,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
}
rcu_read_unlock();
return result;
@@ -542,6 +609,17 @@ begin:
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -560,6 +638,7 @@ begin:
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score(result, net, saddr, hnum, sport,
@@ -581,13 +660,14 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable);
+ udptable, skb);
}
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+ return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
+ &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
@@ -635,7 +715,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable);
+ iph->saddr, uh->source, skb->dev->ifindex, udptable,
+ NULL);
if (!sk) {
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
@@ -772,7 +853,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
else if (skb_is_gso(skb))
uh->check = ~udp_v4_check(len, saddr, daddr, 0);
else if (skb_dst(skb) && skb_dst(skb)->dev &&
- (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
+ (skb_dst(skb)->dev->features &
+ (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
@@ -1025,8 +1107,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flow_flags,
faddr, saddr, dport, inet->inet_sport);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (err < 0)
+ goto out;
+ }
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1270,6 +1355,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int peeked, off = 0;
int err;
int is_udplite = IS_UDPLITE(sk);
+ bool checksum_valid = false;
bool slow;
if (flags & MSG_ERRQUEUE)
@@ -1295,11 +1381,12 @@ try_again:
*/
if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
- if (udp_lib_checksum_complete(skb))
+ checksum_valid = !udp_lib_checksum_complete(skb);
+ if (!checksum_valid)
goto csum_copy_err;
}
- if (skb_csum_unnecessary(skb))
+ if (checksum_valid || skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
msg, copied);
else {
@@ -1395,6 +1482,8 @@ void udp_lib_unhash(struct sock *sk)
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
if (sk_nulls_del_node_init_rcu(sk)) {
hslot->count--;
inet_sk(sk)->inet_num = 0;
@@ -1422,22 +1511,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
nhslot2 = udp_hashslot2(udptable, newhash);
udp_sk(sk)->udp_portaddr_hash = newhash;
- if (hslot2 != nhslot2) {
+
+ if (hslot2 != nhslot2 ||
+ rcu_access_pointer(sk->sk_reuseport_cb)) {
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
/* we must lock primary chain too */
spin_lock_bh(&hslot->lock);
-
- spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
- hslot2->count--;
- spin_unlock(&hslot2->lock);
-
- spin_lock(&nhslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
- &nhslot2->head);
- nhslot2->count++;
- spin_unlock(&nhslot2->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+ }
spin_unlock_bh(&hslot->lock);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 6116604bf6e8..df1966f3b6ec 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -44,7 +44,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
sk = __udp6_lib_lookup(net,
@@ -52,7 +52,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#endif
else
goto out_nosk;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f9386160cbee..4c519c1dc161 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,6 +21,7 @@ static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
struct udp_offload_priv {
struct udp_offload *offload;
+ possible_net_t net;
struct rcu_head rcu;
struct udp_offload_priv __rcu *next;
};
@@ -60,8 +61,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
- (skb->dev->features &
- (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM)));
+ ((skb->dev->features & NETIF_F_HW_CSUM) ||
+ (skb->dev->features & (is_ipv6 ?
+ NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM))));
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & features;
@@ -241,13 +243,14 @@ out:
return segs;
}
-int udp_add_offload(struct udp_offload *uo)
+int udp_add_offload(struct net *net, struct udp_offload *uo)
{
struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
if (!new_offload)
return -ENOMEM;
+ write_pnet(&new_offload->net, net);
new_offload->offload = uo;
spin_lock(&udp_offload_lock);
@@ -311,7 +314,8 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (uo_priv->offload->port == uh->dest &&
+ if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
+ uo_priv->offload->port == uh->dest &&
uo_priv->offload->callbacks.gro_receive)
goto unflush;
}
@@ -389,7 +393,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
uo_priv = rcu_dereference(udp_offload_base);
for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (uo_priv->offload->port == uh->dest &&
+ if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
+ uo_priv->offload->port == uh->dest &&
uo_priv->offload->callbacks.gro_complete)
break;
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index aba428626b52..0ec08814f37d 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -74,10 +74,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
-int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 tos, __u8 ttl,
- __be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck)
+void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+ __be16 df, __be16 src_port, __be16 dst_port,
+ bool xnet, bool nocheck)
{
struct udphdr *uh;
@@ -91,8 +91,7 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
udp_set_csum(nocheck, skb, src, dst, skb->len);
- return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
- tos, ttl, df, xnet);
+ iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
}
EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 1e0c3c835a63..7b0edb37a115 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -259,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm4_dst_ops = {
+static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
@@ -273,7 +273,7 @@ static struct dst_ops xfrm4_dst_ops = {
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
+ .dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.decode_session = _decode_session4,
@@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_init(struct net *net)
+static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -323,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_exit(struct net *net)
+static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -335,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
static struct pernet_operations __net_initdata xfrm4_net_ops = {
.init = xfrm4_net_init,
.exit = xfrm4_net_exit,
};
-#endif
static void __init xfrm4_policy_init(void)
{
@@ -349,13 +381,9 @@ static void __init xfrm4_policy_init(void)
void __init xfrm4_init(void)
{
- dst_entries_init(&xfrm4_dst_ops);
-
xfrm4_state_init();
xfrm4_policy_init();
xfrm4_protocol_init();
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
-#endif
}