summary | refs | log | tree | commit | diff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/ax25/ax25_ip.c15
-rw-r--r--net/bridge/br_fdb.c15
-rw-r--r--net/bridge/br_netfilter_hooks.c68
-rw-r--r--net/bridge/br_stp.c11
-rw-r--r--net/ceph/messenger.c15
-rw-r--r--net/ceph/osd_client.c4
-rw-r--r--net/core/filter.c135
-rw-r--r--net/core/flow_dissector.c13
-rw-r--r--net/core/rtnetlink.c6
-rw-r--r--net/core/skbuff.c61
-rw-r--r--net/dsa/dsa.c16
-rw-r--r--net/ipv4/arp.c35
-rw-r--r--net/ipv4/fou.c3
-rw-r--r--net/ipv4/gre_offload.c3
-rw-r--r--net/ipv4/igmp.c3
-rw-r--r--net/ipv4/ip_gre.c10
-rw-r--r--net/ipv4/ip_output.c5
-rw-r--r--net/ipv4/ip_tunnel.c3
-rw-r--r--net/ipv4/netfilter/arp_tables.c66
-rw-r--r--net/ipv4/netfilter/arptable_filter.c40
-rw-r--r--net/ipv4/netfilter/ip_tables.c63
-rw-r--r--net/ipv4/netfilter/iptable_filter.c44
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c41
-rw-r--r--net/ipv4/netfilter/iptable_nat.c41
-rw-r--r--net/ipv4/netfilter/iptable_raw.c38
-rw-r--r--net/ipv4/netfilter/iptable_security.c44
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c7
-rw-r--r--net/ipv4/tcp.c15
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/udp_tunnel.c2
-rw-r--r--net/ipv6/exthdrs_core.c6
-rw-r--r--net/ipv6/ip6_fib.c91
-rw-r--r--net/ipv6/ip6_gre.c2
-rw-r--r--net/ipv6/ip6_tunnel.c2
-rw-r--r--net/ipv6/ip6_udp_tunnel.c6
-rw-r--r--net/ipv6/mcast.c3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c65
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c47
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c46
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c41
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c46
-rw-r--r--net/ipv6/netfilter/ip6table_security.c44
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c7
-rw-r--r--net/ipv6/udp.c6
-rw-r--r--net/kcm/Kconfig10
-rw-r--r--net/kcm/Makefile3
-rw-r--r--net/kcm/kcmproc.c426
-rw-r--r--net/kcm/kcmsock.c2409
-rw-r--r--net/mac80211/agg-rx.c2
-rw-r--r--net/mac80211/ieee80211_i.h2
-rw-r--r--net/mac80211/rc80211_minstrel.c2
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c16
-rw-r--r--net/mac80211/rx.c37
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c17
-rw-r--r--net/netfilter/nft_masq.c51
-rw-r--r--net/netfilter/nft_meta.c11
-rw-r--r--net/netfilter/x_tables.c65
-rw-r--r--net/netfilter/xt_osf.c2
-rw-r--r--net/netlabel/netlabel_domainhash.c4
-rw-r--r--net/netlabel/netlabel_unlabeled.c6
-rw-r--r--net/packet/af_packet.c43
-rw-r--r--net/sched/act_ife.c4
-rw-r--r--net/sched/act_ipt.c2
-rw-r--r--net/sched/cls_flower.c64
-rw-r--r--net/sched/sch_dsmark.c2
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/sctp/proc.c10
-rw-r--r--net/socket.c18
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/cache.c2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c2
-rw-r--r--net/switchdev/switchdev.c5
-rw-r--r--net/tipc/bcast.c5
-rw-r--r--net/tipc/bcast.h1
-rw-r--r--net/tipc/bearer.c18
-rw-r--r--net/tipc/link.c122
-rw-r--r--net/tipc/link.h1
-rw-r--r--net/tipc/name_table.c6
-rw-r--r--net/tipc/net.c7
-rw-r--r--net/tipc/netlink.c69
-rw-r--r--net/tipc/netlink.h11
-rw-r--r--net/tipc/node.c25
-rw-r--r--net/tipc/socket.c42
-rw-r--r--net/tipc/subscr.c3
-rw-r--r--net/tipc/udp_media.c42
-rw-r--r--net/wireless/core.c2
-rw-r--r--net/wireless/nl80211.c2
-rw-r--r--net/wireless/sme.c6
-rw-r--r--net/wireless/wext-core.c52
94 files changed, 4133 insertions, 753 deletions
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
source "net/irda/Kconfig"
source "net/bluetooth/Kconfig"
source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
config FIB_RULES
bool
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/
obj-$(CONFIG_BT) += bluetooth/
obj-$(CONFIG_SUNRPC) += sunrpc/
obj-$(CONFIG_AF_RXRPC) += rxrpc/
+obj-$(CONFIG_AF_KCM) += kcm/
obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_L2TP) += l2tp/
obj-$(CONFIG_DECNET) += decnet/
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index b563a3f5f2a8..2fa3be965101 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -228,8 +228,23 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
}
#endif
+static bool ax25_validate_header(const char *header, unsigned int len)
+{
+ ax25_digi digi;
+
+ if (!len)
+ return false;
+
+ if (header[0])
+ return true;
+
+ return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL,
+ NULL);
+}
+
const struct header_ops ax25_header_ops = {
.create = ax25_hard_header,
+ .validate = ax25_validate_header,
};
EXPORT_SYMBOL(ax25_header_ops);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 82e3e9705017..dcea4f4c62b3 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb,
struct net_bridge_fdb_entry *f;
hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+ int err;
+
if (idx < cb->args[0])
goto skip;
@@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb,
if (!filter_dev && f->dst)
goto skip;
- if (fdb_fill_info(skb, br, f,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI) < 0)
+ err = fdb_fill_info(skb, br, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI);
+ if (err < 0) {
+ cb->args[1] = err;
break;
+ }
skip:
++idx;
}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 7ddbe7ec81d6..44114a94c576 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -37,6 +37,7 @@
#include <net/addrconf.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
+#include <net/netns/generic.h>
#include <asm/uaccess.h>
#include "br_private.h"
@@ -44,6 +45,12 @@
#include <linux/sysctl.h>
#endif
+static int brnf_net_id __read_mostly;
+
+struct brnf_net {
+ bool enabled;
+};
+
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *brnf_sysctl_header;
static int brnf_call_iptables __read_mostly = 1;
@@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
},
};
+static int brnf_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct brnf_net *brnet;
+ struct net *net;
+ int ret;
+
+ if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE))
+ return NOTIFY_DONE;
+
+ ASSERT_RTNL();
+
+ net = dev_net(dev);
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled)
+ return NOTIFY_OK;
+
+ ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ if (ret)
+ return NOTIFY_BAD;
+
+ brnet->enabled = true;
+ return NOTIFY_OK;
+}
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ if (!brnet->enabled)
+ return;
+
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
+
+static struct notifier_block brnf_notifier __read_mostly = {
+ .notifier_call = brnf_device_event,
+};
+
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
@@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void)
{
int ret;
- ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ ret = register_pernet_subsys(&brnf_net_ops);
if (ret < 0)
return ret;
+ ret = register_netdevice_notifier(&brnf_notifier);
+ if (ret < 0) {
+ unregister_pernet_subsys(&brnf_net_ops);
+ return ret;
+ }
+
#ifdef CONFIG_SYSCTL
brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
if (brnf_sysctl_header == NULL) {
printk(KERN_WARNING
"br_netfilter: can't register to sysctl.\n");
- nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ unregister_netdevice_notifier(&brnf_notifier);
+ unregister_pernet_subsys(&brnf_net_ops);
return -ENOMEM;
}
#endif
@@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void)
static void __exit br_netfilter_fini(void)
{
RCU_INIT_POINTER(nf_br_ops, NULL);
- nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ unregister_netdevice_notifier(&brnf_notifier);
+ unregister_pernet_subsys(&brnf_net_ops);
#ifdef CONFIG_SYSCTL
unregister_net_sysctl_table(brnf_sysctl_header);
#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index c22816a0b1b1..e23449094188 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -562,6 +562,14 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
}
+/* Set time interval that dynamic forwarding entries live
+ * For pure software bridge, allow values outside the 802.1
+ * standard specification for special cases:
+ * 0 - entry never ages (all permanent)
+ * 1 - entry disappears (no persistence)
+ *
+ * Offloaded switch entries may be more restrictive
+ */
int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
{
struct switchdev_attr attr = {
@@ -573,9 +581,6 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
unsigned long t = clock_t_to_jiffies(ageing_time);
int err;
- if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
- return -ERANGE;
-
err = switchdev_port_attr_set(br->dev, &attr);
if (err)
return err;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9cfedf565f5b..9382619a405b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1197,6 +1197,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
return new_piece;
}
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
BUG_ON(!msg);
@@ -2335,9 +2342,9 @@ static int read_partial_message(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr.in_addr),
seq, con->in_seq + 1);
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
- return 0;
+ return 1;
} else if ((s64)seq - (s64)con->in_seq > 1) {
pr_err("read_partial_message bad seq %lld expected %lld\n",
seq, con->in_seq + 1);
@@ -2360,10 +2367,10 @@ static int read_partial_message(struct ceph_connection *con)
/* skip this message */
dout("alloc_msg said skip message\n");
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++;
- return 0;
+ return 1;
}
BUG_ON(!con->in_msg);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3534e12683d3..5bc053778fed 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2853,8 +2853,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
mutex_lock(&osdc->request_mutex);
req = __lookup_request(osdc, tid);
if (!req) {
- pr_warn("%s osd%d tid %llu unknown, skipping\n",
- __func__, osd->o_osd, tid);
+ dout("%s osd%d tid %llu unknown, skipping\n", __func__,
+ osd->o_osd, tid);
m = NULL;
*skip = 1;
goto out;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5e2a3b5e5196..6fc3893a6170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
unsigned int len = (unsigned int) r4;
void *ptr;
- if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
/* bpf verifier guarantees that:
@@ -1384,11 +1384,13 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
if (flags & BPF_F_RECOMPUTE_CSUM)
skb_postpush_rcsum(skb, ptr, len);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
return 0;
}
-const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.func = bpf_skb_store_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1419,7 +1421,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.func = bpf_skb_load_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1447,6 +1449,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EFAULT;
switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
case 2:
csum_replace2(ptr, from, to);
break;
@@ -1464,7 +1472,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.func = bpf_l3_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1523,7 +1531,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.func = bpf_l4_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1562,7 +1570,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
return csum_partial(sp->diff, diff_size, seed);
}
-const struct bpf_func_proto bpf_csum_diff_proto = {
+static const struct bpf_func_proto bpf_csum_diff_proto = {
.func = bpf_csum_diff,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1600,7 +1608,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
return dev_queue_xmit(skb2);
}
-const struct bpf_func_proto bpf_clone_redirect_proto = {
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
.func = bpf_clone_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1652,7 +1660,7 @@ int skb_do_redirect(struct sk_buff *skb)
return dev_queue_xmit(skb);
}
-const struct bpf_func_proto bpf_redirect_proto = {
+static const struct bpf_func_proto bpf_redirect_proto = {
.func = bpf_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1762,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return -EPROTO;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ goto set_compat;
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
*/
if (ip_tunnel_info_af(info) != AF_INET)
return -EINVAL;
+set_compat:
to = (struct bpf_tunnel_key *)compat;
break;
default:
@@ -1779,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- if (flags & BPF_F_TUNINFO_IPV6)
+ if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
sizeof(to->remote_ipv6));
- else
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ }
if (unlikely(size != sizeof(struct bpf_tunnel_key)))
memcpy((void *)(long) r2, to, size);
@@ -1791,7 +1804,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return 0;
}
-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.func = bpf_skb_get_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1801,6 +1814,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *to = (u8 *) (long) r2;
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!info ||
+ !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+ return -ENOENT;
+ if (unlikely(size < info->options_len))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_get(to, info);
+
+ return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+ .func = bpf_skb_get_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
static struct metadata_dst __percpu *md_dst;
static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1811,10 +1850,12 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
- if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6)))
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
/* Fixup deprecated structure layouts here, so we have
* a common path later on.
@@ -1827,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
return -EINVAL;
}
}
+ if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+ return -EINVAL;
skb_dst_drop(skb);
dst_hold((struct dst_entry *) md);
@@ -1835,7 +1878,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info = &md->u.tun_info;
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY;
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ if (flags & BPF_F_DONT_FRAGMENT)
+ info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
info->key.ttl = from->tunnel_ttl;
@@ -1844,14 +1890,18 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info->mode |= IP_TUNNEL_INFO_IPV6;
memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
sizeof(from->remote_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ if (flags & BPF_F_ZERO_CSUM_TX)
+ info->key.tun_flags &= ~TUNNEL_CSUM;
}
return 0;
}
-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.func = bpf_skb_set_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1861,17 +1911,58 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+#define BPF_TUNLEN_MAX 255
+
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *from = (u8 *) (long) r2;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+ if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+ return -EINVAL;
+ if (unlikely(size > BPF_TUNLEN_MAX))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_set(info, from, size);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .func = bpf_skb_set_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
if (!md_dst) {
- /* race is not possible, since it's called from
- * verifier that is holding verifier mutex
+ BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
+ options_len) != 1);
+
+ /* Race is not possible, since it's called from verifier
+ * that is holding verifier mutex.
*/
- md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+ GFP_KERNEL);
if (!md_dst)
return NULL;
}
- return &bpf_skb_set_tunnel_key_proto;
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
}
static const struct bpf_func_proto *
@@ -1925,7 +2016,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
- return bpf_get_skb_set_tunnel_key_proto();
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 7c7b8739b8b8..a669dea146c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id)
-{
- return flow_dissector->used_keys & (1 << key_id);
-}
-
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
flow_dissector->used_keys |= (1 << key_id);
}
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id,
- void *target_container)
-{
- return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6128aac01b11..d2d9e5ebf58e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2970,6 +2970,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
+ cb->args[1] = err;
return idx;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -3003,6 +3004,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
+ cb->args[1] = 0;
for_each_netdev(net, dev) {
if (brport_idx && (dev->ifindex != brport_idx))
continue;
@@ -3030,12 +3032,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
idx);
}
+ if (cb->args[1] == -EMSGSIZE)
+ break;
if (dev->netdev_ops->ndo_fdb_dump)
idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
idx);
else
idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (cb->args[1] == -EMSGSIZE)
+ break;
cops = NULL;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7af7ec635d90..51d768e7bc90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1918,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
+ struct sk_buff *iter;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -1944,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return true;
}
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
return false;
}
@@ -1970,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk,
/*
* Map data from the skb to a pipe. Should handle both the linear part,
- * the fragments, and the frag list. It does NOT handle frag lists within
- * the frag list, if such a thing exists. We'd probably need to recurse to
- * handle that cleanly.
+ * the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
@@ -1991,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
.ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
- struct sk_buff *frag_iter;
int ret = 0;
- /*
- * __skb_splice_bits() only fails if the output has no room left,
- * so no point in going over the frag_list for the error case.
- */
- if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
- goto done;
- else if (!tlen)
- goto done;
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
- /*
- * now see if we have a frag_list to map
- */
- skb_walk_frags(skb, frag_iter) {
- if (!tlen)
- break;
- if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
- break;
- }
-
-done:
if (spd.nr_pages)
ret = splice_cb(sk, pipe, &spd);
@@ -3023,6 +3016,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
EXPORT_SYMBOL_GPL(skb_append_pagefrags);
/**
+ * skb_push_rcsum - push skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pushed
+ *
+ * This function performs an skb_push on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_push unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
+{
+ skb_push(skb, len);
+ skb_postpush_rcsum(skb, skb->data, len);
+ return skb->data;
+}
+
+/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
* @len: length of data pulled
@@ -4167,9 +4178,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
if (!pskb_may_pull(skb_chk, offset))
goto err;
- __skb_pull(skb_chk, offset);
+ skb_pull_rcsum(skb_chk, offset);
ret = skb_chkf(skb_chk);
- __skb_push(skb_chk, offset);
+ skb_push_rcsum(skb_chk, offset);
if (ret)
goto err;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index fa4daba8db55..d8fb47fcad05 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -935,6 +935,14 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
{
int i;
+ dst->master_netdev->dsa_ptr = NULL;
+
+ /* If we used a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point get sent
+ * without the tag and go through the regular receive path.
+ */
+ wmb();
+
for (i = 0; i < dst->pd->nr_chips; i++) {
struct dsa_switch *ds = dst->ds[i];
@@ -988,14 +996,6 @@ static int dsa_suspend(struct device *d)
struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
int i, ret = 0;
- dst->master_netdev->dsa_ptr = NULL;
-
- /* If we used a tagging format that doesn't have an ethertype
- * field, make sure that all packets from this point get sent
- * without the tag and go through the regular receive path.
- */
- wmb();
-
for (i = 0; i < dst->pd->nr_chips; i++) {
struct dsa_switch *ds = dst->ds[i];
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c102eb5ac55c..c34c7544d1db 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!in_dev)
- goto out;
+ goto out_free_skb;
arp = arp_hdr(skb);
@@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
@@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
- goto out;
+ goto out_free_skb;
break;
}
@@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
- goto out;
+ goto out_free_skb;
/*
* Extract fields
@@ -733,7 +733,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
- goto out;
+ goto out_free_skb;
/*
* For some 802.11 wireless deployments (and possibly other networks),
@@ -741,7 +741,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
* and thus should not be accepted.
*/
if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
- goto out;
+ goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
@@ -778,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
- goto out;
+ goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
@@ -803,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
}
- goto out;
+ goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
@@ -826,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
in_dev->arp_parms, skb);
goto out_free_dst;
}
- goto out;
+ goto out_consume_skb;
}
}
}
@@ -876,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
-out:
+out_consume_skb:
consume_skb(skb);
+
out_free_dst:
dst_release(reply_dst);
- return 0;
+ return NET_RX_SUCCESS;
+
+out_free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
static void parp_redo(struct sk_buff *skb)
@@ -924,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
consumeskb:
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
freeskb:
kfree_skb(skb);
out_of_mem:
- return 0;
+ return NET_RX_DROP;
}
/*
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 88dab0c1670c..780484243e14 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -319,8 +319,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, hdrlen);
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct guehdr *guehdr2;
@@ -352,6 +350,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 47f4c544c916..540866dbd27d 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -175,8 +175,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct gre_base_hdr *greh2;
@@ -213,6 +211,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
skb_gro_postpull_rcsum(skb, greh, grehlen);
pp = ptype->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2aea9f1a2a31..9b4ca87f70ba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -350,9 +350,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
skb_dst_set(skb, &rt->dst);
skb->dev = dev;
- skb->reserved_tailroom = skb_end_offset(skb) -
- min(mtu, skb_end_offset(skb));
skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 202437d6087b..31936d387cfd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
+ struct rtable *rt = NULL;
struct flowi4 fl;
- struct rtable *rt;
int min_headroom;
int tunnel_hlen;
__be16 df, flags;
+ bool use_cache;
int err;
tun_info = skb_tunnel_info(skb);
@@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
key = &tun_info->key;
- rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) :
- NULL;
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
if (!rt) {
rt = gre_get_rt(skb, dev, &fl, key);
if (IS_ERR(rt))
goto err_free_skb;
- if (!skb->mark)
+ if (use_cache)
dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
fl.saddr);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f734c42acdaf..124bf0a66328 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1233,13 +1233,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (!skb)
return -EINVAL;
- cork->length += size;
if ((size + skb->len > mtu) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) {
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ return -EOPNOTSUPP;
+
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
}
+ cork->length += size;
while (size > 0) {
if (skb_is_gso(skb)) {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index dff8a05739a2..6aad0192443d 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -607,6 +607,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
@@ -706,7 +708,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
dst_link_failure(skb);
} else
tunnel->err_count = 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b488cac9c5ca..bf081927e06b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1780,9 +1780,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-struct xt_table *arpt_register_table(struct net *net,
- const struct xt_table *table,
- const struct arpt_replace *repl)
+static void __arpt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl,
+ const struct nf_hook_ops *ops,
+ struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -1791,10 +1811,8 @@ struct xt_table *arpt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -1809,30 +1827,28 @@ struct xt_table *arpt_register_table(struct net *net,
ret = PTR_ERR(new_table);
goto out_free;
}
- return new_table;
+
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __arpt_unregister_table(new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void arpt_unregister_table(struct xt_table *table)
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct arpt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __arpt_unregister_table(table);
}
/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 1897ee160920..dd8c80dc32a2 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
+static int __net_init arptable_filter_table_init(struct net *net);
+
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
.priority = NF_IP_PRI_FILTER,
+ .table_init = arptable_filter_table_init,
};
/* The work comes in here from netfilter.c */
@@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *arpfilter_ops __read_mostly;
-static int __net_init arptable_filter_net_init(struct net *net)
+static int __net_init arptable_filter_table_init(struct net *net)
{
struct arpt_replace *repl;
-
+ int err;
+
+ if (net->ipv4.arptable_filter)
+ return 0;
+
repl = arpt_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.arptable_filter =
- arpt_register_table(net, &packet_filter, repl);
+ err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
+ &net->ipv4.arptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+ return err;
}
static void __net_exit arptable_filter_net_exit(struct net *net)
{
- arpt_unregister_table(net->ipv4.arptable_filter);
+ if (!net->ipv4.arptable_filter)
+ return;
+ arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+ net->ipv4.arptable_filter = NULL;
}
static struct pernet_operations arptable_filter_net_ops = {
- .init = arptable_filter_net_init,
.exit = arptable_filter_net_exit,
};
@@ -62,26 +71,23 @@ static int __init arptable_filter_init(void)
{
int ret;
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops))
+ return PTR_ERR(arpfilter_ops);
+
ret = register_pernet_subsys(&arptable_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(arpfilter_ops);
return ret;
-
- arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
- if (IS_ERR(arpfilter_ops)) {
- ret = PTR_ERR(arpfilter_ops);
- goto cleanup_table;
}
- return ret;
-cleanup_table:
- unregister_pernet_subsys(&arptable_filter_net_ops);
return ret;
}
static void __exit arptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, arpfilter_ops);
unregister_pernet_subsys(&arptable_filter_net_ops);
+ kfree(arpfilter_ops);
}
module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b99affad6ba1..e53f8d6f326d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -2062,9 +2062,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-struct xt_table *ipt_register_table(struct net *net,
- const struct xt_table *table,
- const struct ipt_replace *repl)
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ipt_register_table(struct net *net, const struct xt_table *table,
+ const struct ipt_replace *repl,
+ const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -2073,10 +2091,8 @@ struct xt_table *ipt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2091,30 +2107,27 @@ struct xt_table *ipt_register_table(struct net *net,
goto out_free;
}
- return new_table;
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __ipt_unregister_table(net, new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void ipt_unregister_table(struct net *net, struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct ipt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter, net);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __ipt_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 397ef2dd133e..7667f223d7f8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_filter_table_init(struct net *net);
static const struct xt_table packet_filter = {
.name = "filter",
@@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
+ .table_init = iptable_filter_table_init,
};
static unsigned int
@@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
-static bool forward = true;
+static bool forward __read_mostly = true;
module_param(forward, bool, 0000);
-static int __net_init iptable_filter_net_init(struct net *net)
+static int __net_init iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int err;
+
+ if (net->ipv4.iptable_filter)
+ return 0;
repl = ipt_alloc_initial_table(&packet_filter);
if (repl == NULL)
@@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net)
((struct ipt_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- net->ipv4.iptable_filter =
- ipt_register_table(net, &packet_filter, repl);
+ err = ipt_register_table(net, &packet_filter, repl, filter_ops,
+ &net->ipv4.iptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+ return err;
+}
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+ if (net == &init_net || !forward)
+ return iptable_filter_table_init(net);
+
+ return 0;
}
static void __net_exit iptable_filter_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_filter);
+ if (!net->ipv4.iptable_filter)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
+ net->ipv4.iptable_filter = NULL;
}
static struct pernet_operations iptable_filter_net_ops = {
@@ -82,24 +99,21 @@ static int __init iptable_filter_init(void)
{
int ret;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops))
+ return PTR_ERR(filter_ops);
+
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
- return ret;
-
- /* Register hooks */
- filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
- if (IS_ERR(filter_ops)) {
- ret = PTR_ERR(filter_ops);
- unregister_pernet_subsys(&iptable_filter_net_ops);
- }
+ kfree(filter_ops);
return ret;
}
static void __exit iptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&iptable_filter_net_ops);
+ kfree(filter_ops);
}
module_init(iptable_filter_init);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index ba5d392a13c4..57fc97cdac70 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
+static int __net_init iptable_mangle_table_init(struct net *net);
+
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_MANGLE,
+ .table_init = iptable_mangle_table_init,
};
static unsigned int
@@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-
-static int __net_init iptable_mangle_net_init(struct net *net)
+static int __net_init iptable_mangle_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_mangle)
+ return 0;
repl = ipt_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_mangle =
- ipt_register_table(net, &packet_mangler, repl);
+ ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
+ &net->ipv4.iptable_mangle);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+ return ret;
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_mangle);
+ if (!net->ipv4.iptable_mangle)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
+ net->ipv4.iptable_mangle = NULL;
}
static struct pernet_operations iptable_mangle_net_ops = {
- .init = iptable_mangle_net_init,
.exit = iptable_mangle_net_exit,
};
@@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void)
{
int ret;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ return ret;
+ }
+
ret = register_pernet_subsys(&iptable_mangle_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(mangle_ops);
return ret;
+ }
- /* Register hooks */
- mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- ret = PTR_ERR(mangle_ops);
+ ret = iptable_mangle_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
return ret;
@@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void)
static void __exit iptable_mangle_fini(void)
{
- xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ae2cd2752046..138a24bc76ad 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,8 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
+static int __net_init iptable_nat_table_init(struct net *net);
+
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .table_init = iptable_nat_table_init,
};
static unsigned int iptable_nat_do_chain(void *priv,
@@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
};
-static int __net_init iptable_nat_net_init(struct net *net)
+static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.nat_table)
+ return 0;
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
+ nf_nat_ipv4_ops, &net->ipv4.nat_table);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+ return ret;
}
static void __net_exit iptable_nat_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.nat_table);
+ if (!net->ipv4.nat_table)
+ return;
+ ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+ net->ipv4.nat_table = NULL;
}
static struct pernet_operations iptable_nat_net_ops = {
- .init = iptable_nat_net_init,
.exit = iptable_nat_net_exit,
};
static int __init iptable_nat_init(void)
{
- int err;
+ int ret = register_pernet_subsys(&iptable_nat_net_ops);
- err = register_pernet_subsys(&iptable_nat_net_ops);
- if (err < 0)
- goto err1;
+ if (ret)
+ return ret;
- err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- unregister_pernet_subsys(&iptable_nat_net_ops);
-err1:
- return err;
+ ret = iptable_nat_table_init(&init_net);
+ if (ret)
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+ return ret;
}
static void __exit iptable_nat_exit(void)
{
- nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
unregister_pernet_subsys(&iptable_nat_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1ba02811acb0..2642ecd2645c 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -10,12 +10,15 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_raw_table_init(struct net *net);
+
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW,
+ .table_init = iptable_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init iptable_raw_net_init(struct net *net)
+static int __net_init iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_raw)
+ return 0;
repl = ipt_alloc_initial_table(&packet_raw);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_raw =
- ipt_register_table(net, &packet_raw, repl);
+ ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+ &net->ipv4.iptable_raw);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+ return ret;
}
static void __net_exit iptable_raw_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_raw);
+ if (!net->ipv4.iptable_raw)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
+ net->ipv4.iptable_raw = NULL;
}
static struct pernet_operations iptable_raw_net_ops = {
- .init = iptable_raw_net_init,
.exit = iptable_raw_net_exit,
};
@@ -61,15 +70,20 @@ static int __init iptable_raw_init(void)
{
int ret;
+ rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops))
+ return PTR_ERR(rawtable_ops);
+
ret = register_pernet_subsys(&iptable_raw_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(rawtable_ops);
return ret;
+ }
- /* Register hooks */
- rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
- if (IS_ERR(rawtable_ops)) {
- ret = PTR_ERR(rawtable_ops);
+ ret = iptable_raw_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
return ret;
@@ -77,8 +91,8 @@ static int __init iptable_raw_init(void)
static void __exit iptable_raw_fini(void)
{
- xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c2e23d5e9cd4..ff226596e4b5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
+static int __net_init iptable_security_table_init(struct net *net);
+
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_SECURITY,
+ .table_init = iptable_security_table_init,
};
static unsigned int
@@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init iptable_security_net_init(struct net *net)
+static int __net_init iptable_security_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_security)
+ return 0;
repl = ipt_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_security =
- ipt_register_table(net, &security_table, repl);
+ ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
+ &net->ipv4.iptable_security);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+ return ret;
}
static void __net_exit iptable_security_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_security);
+ if (!net->ipv4.iptable_security)
+ return;
+
+ ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
+ net->ipv4.iptable_security = NULL;
}
static struct pernet_operations iptable_security_net_ops = {
- .init = iptable_security_net_init,
.exit = iptable_security_net_exit,
};
@@ -78,27 +88,29 @@ static int __init iptable_security_init(void)
{
int ret;
+ sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops))
+ return PTR_ERR(sectbl_ops);
+
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(sectbl_ops);
return ret;
-
- sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
- if (IS_ERR(sectbl_ops)) {
- ret = PTR_ERR(sectbl_ops);
- goto cleanup_table;
}
- return ret;
+ ret = iptable_security_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
+ }
-cleanup_table:
- unregister_pernet_subsys(&iptable_security_net_ops);
return ret;
}
static void __exit iptable_security_fini(void)
{
- xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index a04dee536b8e..d88da36b383c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
err = ip_defrag(net, skb, user);
local_bh_enable();
- if (!err) {
- ip_send_check(ip_hdr(skb));
+ if (!err)
skb->ignore_df = 1;
- }
return err;
}
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
-
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ }
regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f9faadb42485..a265f00b9df9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -EINVAL;
slow = lock_sock_fast(sk);
- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
- answ = 0;
- else if (sock_flag(sk, SOCK_URGINLINE) ||
- !tp->urg_data ||
- before(tp->urg_seq, tp->copied_seq) ||
- !before(tp->urg_seq, tp->rcv_nxt)) {
-
- answ = tp->rcv_nxt - tp->copied_seq;
-
- /* Subtract 1, if FIN was received */
- if (answ && sock_flag(sk, SOCK_DONE))
- answ--;
- } else
- answ = tp->urg_seq - tp->copied_seq;
+ answ = tcp_inq(sk);
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c26241f3057b..7b7eec439906 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -551,7 +551,7 @@ reset:
*/
if (crtt > tp->srtt_us) {
/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
- crtt /= 8 * USEC_PER_MSEC;
+ crtt /= 8 * USEC_PER_SEC / HZ;
inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
} else if (tp->srtt_us == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index fadd8b978951..ae90e4b34bd3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -452,7 +452,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->segs_in = 0;
+ newtp->segs_in = 1;
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -812,6 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int ret = 0;
int state = child->sk_state;
+ tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 0ec08814f37d..96599d1a1318 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
uh->source = src_port;
uh->len = htons(skb->len);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
udp_set_csum(nocheck, skb, src, dst, skb->len);
iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 5c5d23e59da5..9508a20fbf61 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
*fragoff = _frag_off;
return hp->nexthdr;
}
- return -ENOENT;
+ if (!found)
+ return -ENOENT;
+ if (fragoff)
+ *fragoff = _frag_off;
+ break;
}
hdrlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0c7e276c230e..ea071fad67a0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -55,8 +55,6 @@ struct fib6_cleaner {
void *arg;
};
-static DEFINE_RWLOCK(fib6_walker_lock);
-
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
@@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock);
static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);
/*
@@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w);
static void fib6_gc_timer_cb(unsigned long arg);
-static LIST_HEAD(fib6_walkers);
-#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+#define FOR_WALKERS(net, w) \
+ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
-static void fib6_walker_link(struct fib6_walker *w)
+static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
- write_lock_bh(&fib6_walker_lock);
- list_add(&w->lh, &fib6_walkers);
- write_unlock_bh(&fib6_walker_lock);
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
+ list_add(&w->lh, &net->ipv6.fib6_walkers);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
-static void fib6_walker_unlink(struct fib6_walker *w)
+static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
- write_lock_bh(&fib6_walker_lock);
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
list_del(&w->lh);
- write_unlock_bh(&fib6_walker_lock);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
static int fib6_new_sernum(struct net *net)
@@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w)
static void fib6_dump_end(struct netlink_callback *cb)
{
+ struct net *net = sock_net(cb->skb->sk);
struct fib6_walker *w = (void *)cb->args[2];
if (w) {
if (cb->args[4]) {
cb->args[4] = 0;
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
}
cb->args[2] = 0;
kfree(w);
@@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb)
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct fib6_walker *w;
int res;
@@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->skip = 0;
read_lock_bh(&table->tb6_lock);
- res = fib6_walk(w);
+ res = fib6_walk(net, w);
read_unlock_bh(&table->tb6_lock);
if (res > 0) {
cb->args[4] = 1;
@@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
res = fib6_walk_continue(w);
read_unlock_bh(&table->tb6_lock);
if (res <= 0) {
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
cb->args[4] = 0;
}
}
@@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
}
#endif
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
if (!child) {
if (w->root == fn) {
w->root = w->node = NULL;
@@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
}
}
}
- read_unlock(&fib6_walker_lock);
+ read_unlock(&net->ipv6.fib6_walker_lock);
node_free(fn);
if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
@@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
}
/* Adjust walkers */
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
w->leaf = rt->dst.rt6_next;
@@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
w->state = FWS_U;
}
}
- read_unlock(&fib6_walker_lock);
+ read_unlock(&net->ipv6.fib6_walker_lock);
rt->dst.rt6_next = NULL;
@@ -1588,17 +1588,17 @@ skip:
}
}
-static int fib6_walk(struct fib6_walker *w)
+static int fib6_walk(struct net *net, struct fib6_walker *w)
{
int res;
w->state = FWS_INIT;
w->node = w->root;
- fib6_walker_link(w);
+ fib6_walker_link(net, w);
res = fib6_walk_continue(w);
if (res <= 0)
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
return res;
}
@@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.arg = arg;
c.net = net;
- fib6_walk(&c.w);
+ fib6_walk(net, &c.w);
}
static void __fib6_clean_all(struct net *net,
@@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net)
* Garbage collection
*/
-static struct fib6_gc_args
+struct fib6_gc_args
{
int timeout;
int more;
-} gc_args;
+};
static int fib6_age(struct rt6_info *rt, void *arg)
{
+ struct fib6_gc_args *gc_args = arg;
unsigned long now = jiffies;
/*
@@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg)
RT6_TRACE("expiring %p\n", rt);
return -1;
}
- gc_args.more++;
+ gc_args->more++;
} else if (rt->rt6i_flags & RTF_CACHE) {
if (atomic_read(&rt->dst.__refcnt) == 0 &&
- time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
+ time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
} else if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -1769,21 +1770,20 @@ static int fib6_age(struct rt6_info *rt, void *arg)
return -1;
}
}
- gc_args.more++;
+ gc_args->more++;
}
return 0;
}
-static DEFINE_SPINLOCK(fib6_gc_lock);
-
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
+ struct fib6_gc_args gc_args;
unsigned long now;
if (force) {
- spin_lock_bh(&fib6_gc_lock);
- } else if (!spin_trylock_bh(&fib6_gc_lock)) {
+ spin_lock_bh(&net->ipv6.fib6_gc_lock);
+ } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
return;
}
@@ -1792,7 +1792,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
gc_args.more = icmp6_dst_gc();
- fib6_clean_all(net, fib6_age, NULL);
+ fib6_clean_all(net, fib6_age, &gc_args);
now = jiffies;
net->ipv6.ip6_rt_last_gc = now;
@@ -1802,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
+ net->ipv6.sysctl.ip6_rt_gc_interval));
else
del_timer(&net->ipv6.ip6_fib_timer);
- spin_unlock_bh(&fib6_gc_lock);
+ spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
static void fib6_gc_timer_cb(unsigned long arg)
@@ -1814,6 +1814,9 @@ static int __net_init fib6_net_init(struct net *net)
{
size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+ spin_lock_init(&net->ipv6.fib6_gc_lock);
+ rwlock_init(&net->ipv6.fib6_walker_lock);
+ INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
@@ -1974,7 +1977,8 @@ static int ipv6_route_yield(struct fib6_walker *w)
return 0;
}
-static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
+ struct net *net)
{
memset(&iter->w, 0, sizeof(iter->w));
iter->w.func = ipv6_route_yield;
@@ -1984,7 +1988,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
iter->w.args = iter;
iter->sernum = iter->w.root->fn_sernum;
INIT_LIST_HEAD(&iter->w.lh);
- fib6_walker_link(&iter->w);
+ fib6_walker_link(net, &iter->w);
}
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
@@ -2045,16 +2049,16 @@ iter_table:
++*pos;
return iter->w.leaf;
} else if (r < 0) {
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
return NULL;
}
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
if (!iter->tbl)
return NULL;
- ipv6_route_seq_setup_walk(iter);
+ ipv6_route_seq_setup_walk(iter, net);
goto iter_table;
}
@@ -2069,7 +2073,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
iter->skip = *pos;
if (iter->tbl) {
- ipv6_route_seq_setup_walk(iter);
+ ipv6_route_seq_setup_walk(iter, net);
return ipv6_route_seq_next(seq, NULL, pos);
} else {
return NULL;
@@ -2085,10 +2089,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
__releases(RCU_BH)
{
+ struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
if (ipv6_route_iter_active(iter))
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
rcu_read_unlock_bh();
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index f7c9560b75fa..4e636e60a360 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
__u32 mtu;
int err;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3f3aabd2f07b..eb2ac4bb09ce 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1089,6 +1089,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
u8 tproto;
int err;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
tproto = ACCESS_ONCE(t->parms.proto);
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 14dacf1df529..a7520528ecd2 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr,
- __u8 prio, __u8 ttl, __be16 src_port,
- __be16 dst_port, bool nocheck)
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck)
{
struct udphdr *uh;
struct ipv6hdr *ip6h;
@@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
__skb_push(skb, sizeof(*ip6h));
skb_reset_network_header(skb);
ip6h = ipv6_hdr(skb);
- ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6_flow_hdr(ip6h, prio, label);
ip6h->payload_len = htons(skb->len);
ip6h->nexthdr = IPPROTO_UDP;
ip6h->hop_limit = ttl;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 5ee56d0a8699..d64ee7e83664 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
return NULL;
skb->priority = TC_PRIO_CONTROL;
- skb->reserved_tailroom = skb_end_offset(skb) -
- min(mtu, skb_end_offset(skb));
skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 99425cf2819b..84f9baf7aee8 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -2071,9 +2071,28 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-struct xt_table *ip6t_register_table(struct net *net,
- const struct xt_table *table,
- const struct ip6t_replace *repl)
+/* Unregister an ip6tables table from xtables and release its ruleset:
+ * every entry is handed to cleanup_entry() and the table info is freed.
+ * This helper does not touch the netfilter hooks.
+ */
+static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ /* Owner is captured before xt_unregister_table() is called.
+  * NOTE(review): presumably @table must not be dereferenced afterwards - confirm.
+  */
+ struct module *table_owner = table->me;
+ struct ip6t_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ /* Drop the owner reference only when the table holds more than its initial entries */
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+ const struct ip6t_replace *repl,
+ const struct nf_hook_ops *ops,
+ struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -2082,10 +2101,8 @@ struct xt_table *ip6t_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2099,30 +2116,28 @@ struct xt_table *ip6t_register_table(struct net *net,
ret = PTR_ERR(new_table);
goto out_free;
}
- return new_table;
+
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __ip6t_unregister_table(net, new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void ip6t_unregister_table(struct net *net, struct xt_table *table)
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct ip6t_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter, net);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __ip6t_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 8b277b983ca5..1343077dde93 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
+static int __net_init ip6table_filter_table_init(struct net *net);
+
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_FILTER,
+ .table_init = ip6table_filter_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -44,9 +47,13 @@ static struct nf_hook_ops *filter_ops __read_mostly;
static bool forward = true;
module_param(forward, bool, 0000);
-static int __net_init ip6table_filter_net_init(struct net *net)
+static int __net_init ip6table_filter_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int err;
+
+ if (net->ipv6.ip6table_filter)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_filter);
if (repl == NULL)
@@ -55,15 +62,26 @@ static int __net_init ip6table_filter_net_init(struct net *net)
((struct ip6t_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- net->ipv6.ip6table_filter =
- ip6t_register_table(net, &packet_filter, repl);
+ err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
+ &net->ipv6.ip6table_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter);
+ return err;
+}
+
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+ if (net == &init_net || !forward)
+ return ip6table_filter_table_init(net);
+
+ return 0;
}
static void __net_exit ip6table_filter_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+ if (!net->ipv6.ip6table_filter)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
+ net->ipv6.ip6table_filter = NULL;
}
static struct pernet_operations ip6table_filter_net_ops = {
@@ -75,28 +93,21 @@ static int __init ip6table_filter_init(void)
{
int ret;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
+ if (IS_ERR(filter_ops))
+ return PTR_ERR(filter_ops);
+
ret = register_pernet_subsys(&ip6table_filter_net_ops);
if (ret < 0)
- return ret;
-
- /* Register hooks */
- filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
- if (IS_ERR(filter_ops)) {
- ret = PTR_ERR(filter_ops);
- goto cleanup_table;
- }
+ kfree(filter_ops);
return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_filter_net_ops);
- return ret;
}
static void __exit ip6table_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&ip6table_filter_net_ops);
+ kfree(filter_ops);
}
module_init(ip6table_filter_init);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index abe278b07932..cb2b28883252 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
+static int __net_init ip6table_mangle_table_init(struct net *net);
+
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_MANGLE,
+ .table_init = ip6table_mangle_table_init,
};
static unsigned int
@@ -88,26 +91,33 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_net_init(struct net *net)
+static int __net_init ip6table_mangle_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_mangle)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_mangle =
- ip6t_register_table(net, &packet_mangler, repl);
+ ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
+ &net->ipv6.ip6table_mangle);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle);
+ return ret;
}
static void __net_exit ip6table_mangle_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+ if (!net->ipv6.ip6table_mangle)
+ return;
+
+ ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
+ net->ipv6.ip6table_mangle = NULL;
}
static struct pernet_operations ip6table_mangle_net_ops = {
- .init = ip6table_mangle_net_init,
.exit = ip6table_mangle_net_exit,
};
@@ -115,28 +125,28 @@ static int __init ip6table_mangle_init(void)
{
int ret;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
+ if (IS_ERR(mangle_ops))
+ return PTR_ERR(mangle_ops);
+
ret = register_pernet_subsys(&ip6table_mangle_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(mangle_ops);
return ret;
-
- /* Register hooks */
- mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- ret = PTR_ERR(mangle_ops);
- goto cleanup_table;
}
- return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ ret = ip6table_mangle_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ kfree(mangle_ops);
+ }
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
- xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ kfree(mangle_ops);
}
module_init(ip6table_mangle_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index de2a10a565f5..7d2bd940291f 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -20,6 +20,8 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
+static int __net_init ip6table_nat_table_init(struct net *net);
+
static const struct xt_table nf_nat_ipv6_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
+ .table_init = ip6table_nat_table_init,
};
static unsigned int ip6table_nat_do_chain(void *priv,
@@ -97,50 +100,50 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
},
};
-static int __net_init ip6table_nat_net_init(struct net *net)
+static int __net_init ip6table_nat_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_nat)
+ return 0;
repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+ ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
+ nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat);
+ return ret;
}
static void __net_exit ip6table_nat_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+ if (!net->ipv6.ip6table_nat)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+ net->ipv6.ip6table_nat = NULL;
}
static struct pernet_operations ip6table_nat_net_ops = {
- .init = ip6table_nat_net_init,
.exit = ip6table_nat_net_exit,
};
static int __init ip6table_nat_init(void)
{
- int err;
+ int ret = register_pernet_subsys(&ip6table_nat_net_ops);
- err = register_pernet_subsys(&ip6table_nat_net_ops);
- if (err < 0)
- goto err1;
+ if (ret)
+ return ret;
- err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- unregister_pernet_subsys(&ip6table_nat_net_ops);
-err1:
- return err;
+ ret = ip6table_nat_table_init(&init_net);
+ if (ret)
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+ return ret;
}
static void __exit ip6table_nat_exit(void)
{
- nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
unregister_pernet_subsys(&ip6table_nat_net_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9021963565c3..d4bc56443dc1 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -9,12 +9,15 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+static int __net_init ip6table_raw_table_init(struct net *net);
+
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW,
+ .table_init = ip6table_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -27,26 +30,32 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init ip6table_raw_net_init(struct net *net)
+static int __net_init ip6table_raw_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_raw)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_raw);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_raw =
- ip6t_register_table(net, &packet_raw, repl);
+ ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops,
+ &net->ipv6.ip6table_raw);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw);
+ return ret;
}
static void __net_exit ip6table_raw_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+ if (!net->ipv6.ip6table_raw)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
+ net->ipv6.ip6table_raw = NULL;
}
static struct pernet_operations ip6table_raw_net_ops = {
- .init = ip6table_raw_net_init,
.exit = ip6table_raw_net_exit,
};
@@ -54,28 +63,29 @@ static int __init ip6table_raw_init(void)
{
int ret;
+ /* Register hooks */
+ rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
+ if (IS_ERR(rawtable_ops))
+ return PTR_ERR(rawtable_ops);
+
ret = register_pernet_subsys(&ip6table_raw_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(rawtable_ops);
return ret;
-
- /* Register hooks */
- rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
- if (IS_ERR(rawtable_ops)) {
- ret = PTR_ERR(rawtable_ops);
- goto cleanup_table;
}
- return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_raw_net_ops);
+ ret = ip6table_raw_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+ kfree(rawtable_ops);
+ }
return ret;
}
static void __exit ip6table_raw_fini(void)
{
- xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&ip6table_raw_net_ops);
+ kfree(rawtable_ops);
}
module_init(ip6table_raw_init);
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 0d856fedfeb0..cf26ccb04056 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
+static int __net_init ip6table_security_table_init(struct net *net);
+
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_SECURITY,
+ .table_init = ip6table_security_table_init,
};
static unsigned int
@@ -44,26 +47,32 @@ ip6table_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init ip6table_security_net_init(struct net *net)
+static int __net_init ip6table_security_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_security)
+ return 0;
repl = ip6t_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_security =
- ip6t_register_table(net, &security_table, repl);
+ ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
+ &net->ipv6.ip6table_security);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security);
+ return ret;
}
static void __net_exit ip6table_security_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_security);
+ if (!net->ipv6.ip6table_security)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
+ net->ipv6.ip6table_security = NULL;
}
static struct pernet_operations ip6table_security_net_ops = {
- .init = ip6table_security_net_init,
.exit = ip6table_security_net_exit,
};
@@ -71,27 +80,28 @@ static int __init ip6table_security_init(void)
{
int ret;
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
+ if (IS_ERR(sectbl_ops))
+ return PTR_ERR(sectbl_ops);
+
ret = register_pernet_subsys(&ip6table_security_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(sectbl_ops);
return ret;
-
- sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
- if (IS_ERR(sectbl_ops)) {
- ret = PTR_ERR(sectbl_ops);
- goto cleanup_table;
}
- return ret;
-
-cleanup_table:
- unregister_pernet_subsys(&ip6table_security_net_ops);
+ ret = ip6table_security_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+ kfree(sectbl_ops);
+ }
return ret;
}
static void __exit ip6table_security_fini(void)
{
- xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&ip6table_security_net_ops);
+ kfree(sectbl_ops);
}
module_init(ip6table_security_init);
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
-
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ }
regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0711f8fe4d44..fd25e447a5fa 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -922,11 +922,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
ret = udpv6_queue_rcv_skb(sk, skb);
sock_put(sk);
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
+ /* a return value > 0 means to resubmit the input */
if (ret > 0)
- return -ret;
+ return ret;
return 0;
}
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+ tristate "KCM sockets"
+ depends on INET
+ select BPF_SYSCALL
+ ---help---
+ KCM (Kernel Connection Multiplexor) sockets provide a method
+ for multiplexing messages of a message based application
+ protocol over kernel connections (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..71256133e677
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o kcmproc.o
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
new file mode 100644
index 000000000000..738008726cc6
--- /dev/null
+++ b/net/kcm/kcmproc.c
@@ -0,0 +1,426 @@
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/rculist.h>
+#include <linux/seq_file.h>
+#include <linux/socket.h>
+#include <net/inet_sock.h>
+#include <net/kcm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_PROC_FS
+/* Bundles a proc entry name with the file operations and seq_file
+ * operations used to serve it. An instance is handed to
+ * proc_create_data() as the entry's private data and read back with
+ * PDE_DATA() in kcm_seq_open().
+ */
+struct kcm_seq_muxinfo {
+ char *name;
+ const struct file_operations *seq_fops;
+ const struct seq_operations seq_ops;
+};
+
+/* Return the first entry of the mux list belonging to the network
+ * namespace of @seq, using the RCU list accessor.
+ */
+static struct kcm_mux *kcm_get_first(struct seq_file *seq)
+{
+	struct kcm_net *knet = net_generic(seq_file_net(seq), kcm_net_id);
+
+	return list_first_or_null_rcu(&knet->mux_list,
+				      struct kcm_mux, kcm_mux_list);
+}
+
+/* Return the entry following @mux on its namespace's mux list, using the
+ * RCU list accessor.
+ */
+static struct kcm_mux *kcm_get_next(struct kcm_mux *mux)
+{
+	return list_next_or_null_rcu(&mux->knet->mux_list, &mux->kcm_mux_list,
+				     struct kcm_mux, kcm_mux_list);
+}
+
+/* Return the mux at zero-based position @pos on the namespace's mux list,
+ * or NULL when the list has no entry at that position.
+ */
+static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct kcm_net *knet = net_generic(seq_file_net(seq), kcm_net_id);
+	struct kcm_mux *cur;
+
+	list_for_each_entry_rcu(cur, &knet->mux_list, kcm_mux_list) {
+		/* Count down until the requested position is reached */
+		if (pos-- == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* seq_file next callback: step from the start token to the first mux, or
+ * from the current mux to the one after it, and advance *@pos.
+ */
+static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	void *next = (v == SEQ_START_TOKEN) ? (void *)kcm_get_first(seq)
+					    : (void *)kcm_get_next(v);
+
+	++*pos;
+	return next;
+}
+
+/* seq_file start callback: take the RCU read lock (released in
+ * kcm_seq_stop()) and return either the start token, for position 0, or
+ * the mux at list index *@pos - 1.
+ */
+static void *kcm_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	/* Position 0 is the header, so list entries are offset by one */
+	return kcm_get_idx(seq, *pos - 1);
+}
+
+/* seq_file stop callback: release the RCU read lock taken in kcm_seq_start(). */
+static void kcm_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+/* Per-open private state of the "kcm" proc file; its size is passed to
+ * seq_open_net() in kcm_seq_open().
+ */
+struct kcm_proc_mux_state {
+ /* NOTE(review): presumably must stay first so the seq_file net helpers can find it - confirm */
+ struct seq_net_private p;
+ /* Running count of muxes shown; reset when the header is printed */
+ int idx;
+};
+
+/* open() handler for the "kcm" proc file. Fetches the kcm_seq_muxinfo
+ * stored as the proc entry's private data and opens a per-namespace
+ * seq_file with room for a struct kcm_proc_mux_state.
+ *
+ * Returns the result of seq_open_net() unchanged.
+ */
+static int kcm_seq_open(struct inode *inode, struct file *file)
+{
+	struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
+
+	return seq_open_net(inode, file, &muxinfo->seq_ops,
+			    sizeof(struct kcm_proc_mux_state));
+}
+
+/* Emit the banner line (with the namespace's mux count) and the column
+ * headings of the "kcm" proc file.
+ */
+static void kcm_format_mux_header(struct seq_file *seq)
+{
+	struct kcm_net *knet = net_generic(seq_file_net(seq), kcm_net_id);
+
+	seq_printf(seq, "*** KCM statistics (%d MUX) ****\n", knet->count);
+
+	seq_printf(seq,
+		   "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s",
+		   "Object", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes",
+		   "Recv-Q", "Rmem", "Send-Q", "Smem", "Status");
+
+	/* XXX: pdsts header stuff here */
+	seq_puts(seq, "\n");
+}
+
+/* Print one line for a KCM socket: index, rx/tx message and byte counters,
+ * receive queue length, receive memory and write queue length, followed by
+ * status flags. The "Smem" column is always printed as "-".
+ * The @i and @len parameters are not used by this function.
+ */
+static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq,
+ int i, int *len)
+{
+ seq_printf(seq,
+ " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ",
+ kcm->index,
+ kcm->stats.rx_msgs,
+ kcm->stats.rx_bytes,
+ kcm->stats.tx_msgs,
+ kcm->stats.tx_bytes,
+ kcm->sk.sk_receive_queue.qlen,
+ sk_rmem_alloc_get(&kcm->sk),
+ kcm->sk.sk_write_queue.qlen,
+ "-");
+
+ /* Status column: one tag per condition that is currently set */
+ if (kcm->tx_psock)
+ seq_printf(seq, "Psck-%u ", kcm->tx_psock->index);
+
+ if (kcm->tx_wait)
+ seq_puts(seq, "TxWait ");
+
+ if (kcm->tx_wait_more)
+ seq_puts(seq, "WMore ");
+
+ if (kcm->rx_wait)
+ seq_puts(seq, "RxWait ");
+
+ seq_puts(seq, "\n");
+}
+
+/* Print one line for a psock: index, rx/tx message and byte counters, and
+ * the queue lengths and memory counters of its underlying socket
+ * (psock->sk), followed by status flags.
+ * The @i and @len parameters are not used by this function.
+ */
+static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
+ int i, int *len)
+{
+ seq_printf(seq,
+ " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
+ psock->index,
+ psock->stats.rx_msgs,
+ psock->stats.rx_bytes,
+ psock->stats.tx_msgs,
+ psock->stats.tx_bytes,
+ psock->sk->sk_receive_queue.qlen,
+ atomic_read(&psock->sk->sk_rmem_alloc),
+ psock->sk->sk_write_queue.qlen,
+ atomic_read(&psock->sk->sk_wmem_alloc));
+
+ /* Status column: one tag per condition that is currently set */
+ if (psock->done)
+ seq_puts(seq, "Done ");
+
+ if (psock->tx_stopped)
+ seq_puts(seq, "TxStop ");
+
+ if (psock->rx_stopped)
+ seq_puts(seq, "RxStop ");
+
+ /* NOTE(review): kcm index is printed with %d here but %u in kcm_format_sock() - confirm the field's type */
+ if (psock->tx_kcm)
+ seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
+
+ if (psock->ready_rx_msg)
+ seq_puts(seq, "RdyRx ");
+
+ seq_puts(seq, "\n");
+}
+
+/* Print the block for one mux: a summary line with the mux counters and
+ * its KCM socket / psock counts, then one line per KCM socket and one line
+ * per psock. The @idx parameter is not used by this function.
+ */
+static void
+kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq)
+{
+ /* len is passed by address to the helpers below, which never read or write it */
+ int i, len;
+ struct kcm_sock *kcm;
+ struct kcm_psock *psock;
+
+ /* mux information */
+ seq_printf(seq,
+ "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ",
+ "mux", "",
+ mux->stats.rx_msgs,
+ mux->stats.rx_bytes,
+ mux->stats.tx_msgs,
+ mux->stats.tx_bytes,
+ "-", "-", "-", "-");
+
+ seq_printf(seq, "KCMs: %d, Psocks %d\n",
+ mux->kcm_socks_cnt, mux->psocks_cnt);
+
+ /* kcm sock information */
+ i = 0;
+ /* Both socket lists are walked with mux->lock held */
+ spin_lock_bh(&mux->lock);
+ list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) {
+ kcm_format_sock(kcm, seq, i, &len);
+ i++;
+ }
+ i = 0;
+ list_for_each_entry(psock, &mux->psocks, psock_list) {
+ kcm_format_psock(psock, seq, i, &len);
+ i++;
+ }
+ spin_unlock_bh(&mux->lock);
+}
+
+/* seq_file show callback: print the header for the start token, otherwise
+ * print the mux passed in @v and bump the per-open mux counter.
+ * Always returns 0.
+ */
+static int kcm_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_proc_mux_state *state = seq->private;
+
+	if (v != SEQ_START_TOKEN) {
+		kcm_format_mux(v, state->idx, seq);
+		state->idx++;
+		return 0;
+	}
+
+	/* A new pass over the file starts here, so restart the counter */
+	state->idx = 0;
+	kcm_format_mux_header(seq);
+	return 0;
+}
+
+/* File operations for the per-namespace "kcm" proc file.
+ * The file is opened with seq_open_net(), so it must be closed with the
+ * matching seq_release_net(); without a release handler, whatever
+ * seq_open_net() set up at open time is never released.
+ */
+static const struct file_operations kcm_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kcm_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+/* Description of the "kcm" proc file: its name, file operations and the
+ * seq_file callbacks defined above. Registered per namespace from
+ * kcm_proc_init_net().
+ */
+static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
+ .name = "kcm",
+ .seq_fops = &kcm_seq_fops,
+ .seq_ops = {
+ .show = kcm_seq_show,
+ .start = kcm_seq_start,
+ .next = kcm_seq_next,
+ .stop = kcm_seq_stop,
+ }
+};
+
+/* Create the proc entry described by @muxinfo under @net's proc directory,
+ * with @muxinfo itself as the entry's private data.
+ *
+ * Returns 0 on success or -ENOMEM if the entry could not be created.
+ *
+ * NOTE(review): the function is static yet exported below; confirm whether
+ * the EXPORT_SYMBOL is intended.
+ */
+static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
+{
+	if (!proc_create_data(muxinfo->name, S_IRUGO, net->proc_net,
+			      muxinfo->seq_fops, muxinfo))
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(kcm_proc_register);
+
+/* Remove the proc entry named by @muxinfo from @net's proc directory.
+ *
+ * NOTE(review): the function is static yet exported below; confirm whether
+ * the EXPORT_SYMBOL is intended.
+ */
+static void kcm_proc_unregister(struct net *net,
+ struct kcm_seq_muxinfo *muxinfo)
+{
+ remove_proc_entry(muxinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(kcm_proc_unregister);
+
+/* show() handler for the "kcm_stats" proc file.
+ *
+ * Sums the namespace-level aggregate counters, every mux's own and
+ * aggregate counters, and every attached psock's counters into two local
+ * totals, then prints a heading row and a value row for the mux totals and
+ * for the psock totals. Always returns 0.
+ */
+static int kcm_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct kcm_psock_stats psock_stats;
+	struct kcm_mux_stats mux_stats;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+	struct net *net = seq->private;
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	memset(&mux_stats, 0, sizeof(mux_stats));
+	memset(&psock_stats, 0, sizeof(psock_stats));
+
+	/* The mux list is walked with the namespace mutex held */
+	mutex_lock(&knet->mutex);
+
+	aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
+	aggregate_psock_stats(&knet->aggregate_psock_stats,
+			      &psock_stats);
+
+	list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+		/* Per-mux counters and the psock list are read under mux->lock */
+		spin_lock_bh(&mux->lock);
+		aggregate_mux_stats(&mux->stats, &mux_stats);
+		aggregate_psock_stats(&mux->aggregate_psock_stats,
+				      &psock_stats);
+		list_for_each_entry(psock, &mux->psocks, psock_list)
+			aggregate_psock_stats(&psock->stats, &psock_stats);
+		spin_unlock_bh(&mux->lock);
+	}
+
+	mutex_unlock(&knet->mutex);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n",
+		   "MUX",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "TX-Retries",
+		   "Attach",
+		   "Unattach",
+		   "UnattchRsvd",
+		   "RX-RdyDrops");
+
+	/* Values are listed in the same order as the headings above:
+	 * "Unattach" before "UnattchRsvd".
+	 */
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   mux_stats.rx_msgs,
+		   mux_stats.rx_bytes,
+		   mux_stats.tx_msgs,
+		   mux_stats.tx_bytes,
+		   mux_stats.tx_retries,
+		   mux_stats.psock_attach,
+		   mux_stats.psock_unattach,
+		   mux_stats.psock_unattach_rsvd,
+		   mux_stats.rx_ready_drops);
+
+	seq_printf(seq,
+		   "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+		   "Psock",
+		   "RX-Msgs",
+		   "RX-Bytes",
+		   "TX-Msgs",
+		   "TX-Bytes",
+		   "Reserved",
+		   "Unreserved",
+		   "RX-Aborts",
+		   "RX-MemFail",
+		   "RX-NeedMor",
+		   "RX-BadLen",
+		   "RX-TooBig",
+		   "RX-Timeout",
+		   "TX-Aborts");
+
+	seq_printf(seq,
+		   "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
+		   "",
+		   psock_stats.rx_msgs,
+		   psock_stats.rx_bytes,
+		   psock_stats.tx_msgs,
+		   psock_stats.tx_bytes,
+		   psock_stats.reserved,
+		   psock_stats.unreserved,
+		   psock_stats.rx_aborts,
+		   psock_stats.rx_mem_fail,
+		   psock_stats.rx_need_more_hdr,
+		   psock_stats.rx_bad_hdr_len,
+		   psock_stats.rx_msg_too_big,
+		   psock_stats.rx_msg_timeouts,
+		   psock_stats.tx_aborts);
+
+	return 0;
+}
+
+/* open() handler for "kcm_stats": a single-record, per-namespace seq_file
+ * rendered by kcm_stats_seq_show().
+ */
+static int kcm_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, kcm_stats_seq_show);
+}
+
+/* File operations for the "kcm_stats" proc file; opened with
+ * single_open_net() and released with the matching single_release_net().
+ */
+static const struct file_operations kcm_stats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = kcm_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+/* Per-namespace init: create the "kcm_stats" and "kcm" proc entries.
+ *
+ * Returns 0 on success. If "kcm_stats" cannot be created, returns -ENOMEM;
+ * if registering "kcm" fails, "kcm_stats" is removed again and the error
+ * from kcm_proc_register() is returned.
+ */
+static int kcm_proc_init_net(struct net *net)
+{
+	int err;
+
+	if (!proc_create("kcm_stats", S_IRUGO, net->proc_net,
+			 &kcm_stats_seq_fops))
+		return -ENOMEM;
+
+	err = kcm_proc_register(net, &kcm_seq_muxinfo);
+	if (err) {
+		/* Undo the first entry so a failed init leaves nothing behind */
+		remove_proc_entry("kcm_stats", net->proc_net);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Per-namespace exit: remove the "kcm" and "kcm_stats" proc entries, in the
+ * reverse of the order in which kcm_proc_init_net() created them.
+ */
+static void kcm_proc_exit_net(struct net *net)
+{
+ kcm_proc_unregister(net, &kcm_seq_muxinfo);
+ remove_proc_entry("kcm_stats", net->proc_net);
+}
+
+/* Per-network-namespace hooks that create and remove the KCM proc entries. */
+static struct pernet_operations kcm_net_ops = {
+ .init = kcm_proc_init_net,
+ .exit = kcm_proc_exit_net,
+};
+
+/* Register the per-namespace proc hooks; returns the result of
+ * register_pernet_subsys().
+ */
+int __init kcm_proc_init(void)
+{
+ return register_pernet_subsys(&kcm_net_ops);
+}
+
+/* Unregister the per-namespace proc hooks registered by kcm_proc_init(). */
+void __exit kcm_proc_exit(void)
+{
+ unregister_pernet_subsys(&kcm_net_ops);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..40662d73204f
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2409 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+/* NOTE(review): not referenced in the visible part of this file; presumably
+ * the per-net generic id for KCM state -- confirm against the registration
+ * code.
+ */
+unsigned int kcm_net_id;
+
+/* Slab cache that struct kcm_psock objects are freed back to (see the
+ * kmem_cache_free() call in unreserve_psock()).
+ */
+static struct kmem_cache *kcm_psockp __read_mostly;
+/* NOTE(review): presumably the slab cache for struct kcm_mux; no use is
+ * visible in this part of the file.
+ */
+static struct kmem_cache *kcm_muxp __read_mostly;
+/* Workqueue on which psock rx work and kcm tx work items are queued. */
+static struct workqueue_struct *kcm_wq;
+
+/* View a struct sock pointer as the struct kcm_sock it is cast to. */
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+	struct kcm_sock *kcm = (struct kcm_sock *)sk;
+
+	return kcm;
+}
+
+/* Transmit-side per-message state, stored at the start of skb->cb. */
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+	struct kcm_tx_msg *txm = (struct kcm_tx_msg *)skb->cb;
+
+	return txm;
+}
+
+/* Receive-side per-message state, stored in skb->cb at the offset of the
+ * 'data' member of struct qdisc_skb_cb.
+ */
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+	void *cb = (void *)skb->cb;
+
+	cb += offsetof(struct qdisc_skb_cb, data);
+
+	return (struct kcm_rx_msg *)cb;
+}
+
+/* Flag an error on socket @csk and invoke its sk_error_report callback.
+ *
+ * NOTE(review): @err is ignored; sk_err is always set to EPIPE regardless of
+ * the value the caller passes. Confirm whether this is intentional before
+ * relying on @err being reported.
+ */
+static void report_csk_error(struct sock *csk, int err)
+{
+ csk->sk_err = EPIPE;
+ csk->sk_error_report(csk);
+}
+
+/* Stop receive processing on @psock after an unrecoverable receive error.
+ * The rx message timer is always deleted; if receive was not already stopped
+ * the psock is marked rx_stopped, the rx_aborts counter is incremented and an
+ * error is reported on the lower socket.
+ *
+ * NOTE(review): @skb is not used here, so the message passed by the caller is
+ * not freed by this function.
+ *
+ * Callback lock held
+ */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+ struct sk_buff *skb)
+{
+ struct sock *csk = psock->sk;
+
+ /* Unrecoverable error in receive */
+
+ del_timer(&psock->rx_msg_timer);
+
+ /* Only the first abort reports the error */
+ if (psock->rx_stopped)
+ return;
+
+ psock->rx_stopped = 1;
+ KCM_STATS_INCR(psock->stats.rx_aborts);
+
+ /* Report an error on the lower socket */
+ report_csk_error(csk, err);
+}
+
+/* Stop transmit on @psock after an unrecoverable transmit error.
+ *
+ * Under the mux lock: marks the psock tx_stopped and counts the abort (only
+ * once; later calls return early). A psock that is not reserved by a KCM
+ * socket is taken off the available list. A reserved psock optionally
+ * (@wakeup_kcm) gets the reserving KCM's tx_work queued so the failure is
+ * handled there. After dropping the lock an error is reported on the lower
+ * socket.
+ */
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+ bool wakeup_kcm)
+{
+ struct sock *csk = psock->sk;
+ struct kcm_mux *mux = psock->mux;
+
+ /* Unrecoverable error in transmit */
+
+ spin_lock_bh(&mux->lock);
+
+ if (psock->tx_stopped) {
+ spin_unlock_bh(&mux->lock);
+ return;
+ }
+
+ psock->tx_stopped = 1;
+ KCM_STATS_INCR(psock->stats.tx_aborts);
+
+ if (!psock->tx_kcm) {
+ /* Take off psocks_avail list */
+ list_del(&psock->psock_avail_list);
+ } else if (wakeup_kcm) {
+ /* In this case psock is being aborted while outside of
+ * write_msgs and psock is reserved. Schedule tx_work
+ * to handle the failure there. Need to commit tx_stopped
+ * before queuing work.
+ */
+ smp_mb();
+
+ queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+ }
+
+ spin_unlock_bh(&mux->lock);
+
+ /* Report error on lower socket */
+ report_csk_error(csk, err);
+}
+
+/* Fold the psock's receive counters accumulated since the last snapshot into
+ * the mux totals, then refresh the snapshot (saved_rx_msgs / saved_rx_bytes).
+ * RX mux lock held.
+ */
+static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	/* Messages received since the previous update */
+	mux->stats.rx_msgs += psock->stats.rx_msgs - psock->saved_rx_msgs;
+	psock->saved_rx_msgs = psock->stats.rx_msgs;
+
+	/* Bytes received since the previous update */
+	KCM_STATS_ADD(mux->stats.rx_bytes,
+		      psock->stats.rx_bytes - psock->saved_rx_bytes);
+	psock->saved_rx_bytes = psock->stats.rx_bytes;
+}
+
+/* Fold the psock's transmit counters accumulated since the last snapshot into
+ * the mux totals, then refresh the snapshot (saved_tx_msgs / saved_tx_bytes).
+ */
+static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
+				    struct kcm_psock *psock)
+{
+	/* Messages sent since the previous update */
+	mux->stats.tx_msgs += psock->stats.tx_msgs - psock->saved_tx_msgs;
+	psock->saved_tx_msgs = psock->stats.tx_msgs;
+
+	/* Bytes sent since the previous update */
+	KCM_STATS_ADD(mux->stats.tx_bytes,
+		      psock->stats.tx_bytes - psock->saved_tx_bytes);
+	psock->saved_tx_bytes = psock->stats.tx_bytes;
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ *
+ * Does nothing if the KCM is already waiting, has a psock attached for
+ * receive, or is receive disabled. Otherwise drains the mux hold queue and
+ * then the ready messages of psocks on mux->psocks_ready into this KCM,
+ * stopping as soon as queuing fails. If everything was drained the KCM is
+ * added to the mux's list of receive waiters.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct kcm_psock *psock;
+ struct sk_buff *skb;
+
+ if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+ return;
+
+ while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+ if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+ /* Assuming buffer limit has been reached */
+ skb_queue_head(&mux->rx_hold_queue, skb);
+ WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+ return;
+ }
+ }
+
+ while (!list_empty(&mux->psocks_ready)) {
+ psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+ psock_ready_list);
+
+ if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+ /* Assuming buffer limit has been reached */
+ WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+ return;
+ }
+
+ /* Consumed the ready message on the psock. Schedule rx_work to
+ * get more messages.
+ */
+ list_del(&psock->psock_ready_list);
+ psock->ready_rx_msg = NULL;
+
+ /* Commit clearing of ready_rx_msg for queuing work */
+ smp_mb();
+
+ queue_work(kcm_wq, &psock->rx_work);
+ }
+
+ /* Buffer limit is okay now, add to the list of KCMs waiting to receive */
+ list_add_tail(&kcm->wait_rx_list,
+ &kcm->mux->kcm_rx_waiters);
+ kcm->rx_wait = true;
+}
+
+/* skb destructor installed by kcm_queue_rcv_skb(). Returns the skb's truesize
+ * to the owning socket's receive memory accounting and, if the KCM is neither
+ * waiting nor attached to a psock and its receive allocation has dropped below
+ * sk_rcvlowat, calls kcm_rcv_ready() under the mux rx lock so more messages
+ * can be queued.
+ */
+static void kcm_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ struct kcm_mux *mux = kcm->mux;
+ unsigned int len = skb->truesize;
+
+ sk_mem_uncharge(sk, len);
+ atomic_sub(len, &sk->sk_rmem_alloc);
+
+ /* For reading rx_wait and rx_psock without holding lock */
+ smp_mb__after_atomic();
+
+ /* Unlocked pre-check; kcm_rcv_ready() re-checks under the lock */
+ if (!kcm->rx_wait && !kcm->rx_psock &&
+ sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+ spin_lock_bh(&mux->rx_lock);
+ kcm_rcv_ready(kcm);
+ spin_unlock_bh(&mux->rx_lock);
+ }
+}
+
+/* Queue @skb on the receive queue of @sk.
+ *
+ * Returns -ENOMEM if the socket's receive allocation has already reached
+ * sk_rcvbuf, -ENOBUFS if receive memory cannot be scheduled for the skb, and
+ * 0 once the skb has been queued. On success the skb is owned by @sk with
+ * kcm_rfree as destructor and, unless the socket is dead, sk_data_ready is
+ * invoked.
+ */
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+
+	/* Drop any previous owner, then take ownership and charge memory */
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = kcm_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (sock_flag(sk, SOCK_DEAD))
+		return 0;
+
+	sk->sk_data_ready(sk);
+
+	return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called when a kcm socket is receive disabled.
+ *
+ * Each skb taken from @head is offered to the first KCM on the mux's list of
+ * receive waiters; a waiter that cannot accept it is removed from the list
+ * and the next one is tried. If no waiter is left the skb is put on the mux
+ * hold queue.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+	struct sk_buff *skb;
+	struct kcm_sock *kcm;
+
+	while ((skb = __skb_dequeue(head))) {
+		/* Reset destructor to avoid calling kcm_rcv_ready */
+		skb->destructor = sock_rfree;
+		skb_orphan(skb);
+
+		for (;;) {
+			if (list_empty(&mux->kcm_rx_waiters)) {
+				/* Nobody can take it now, hold on the mux */
+				skb_queue_tail(&mux->rx_hold_queue, skb);
+				break;
+			}
+
+			kcm = list_first_entry(&mux->kcm_rx_waiters,
+					       struct kcm_sock, wait_rx_list);
+
+			if (!kcm_queue_rcv_skb(&kcm->sk, skb))
+				break;
+
+			/* Should mean socket buffer full */
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+
+			/* Commit rx_wait to read in kcm_free */
+			smp_wmb();
+		}
+	}
+}
+
+/* Find a KCM socket to receive the message @head assembled on @psock.
+ *
+ * Returns the KCM already attached to the psock if there is one. Otherwise,
+ * under the mux rx lock, folds the psock stats into the mux and either
+ * attaches the first KCM waiting to receive (returned), or, when no KCM is
+ * waiting, parks @head in psock->ready_rx_msg, puts the psock on the mux's
+ * ready list and returns NULL.
+ *
+ * Lower sock lock held
+ */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+ struct sk_buff *head)
+{
+ struct kcm_mux *mux = psock->mux;
+ struct kcm_sock *kcm;
+
+ WARN_ON(psock->ready_rx_msg);
+
+ /* Fast path without the lock */
+ if (psock->rx_kcm)
+ return psock->rx_kcm;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ /* Re-check now that the lock is held */
+ if (psock->rx_kcm) {
+ spin_unlock_bh(&mux->rx_lock);
+ return psock->rx_kcm;
+ }
+
+ kcm_update_rx_mux_stats(mux, psock);
+
+ if (list_empty(&mux->kcm_rx_waiters)) {
+ psock->ready_rx_msg = head;
+ list_add_tail(&psock->psock_ready_list,
+ &mux->psocks_ready);
+ spin_unlock_bh(&mux->rx_lock);
+ return NULL;
+ }
+
+ kcm = list_first_entry(&mux->kcm_rx_waiters,
+ struct kcm_sock, wait_rx_list);
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+
+ psock->rx_kcm = kcm;
+ kcm->rx_psock = psock;
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+/* Work handler that runs kcm_done() for the KCM socket embedding @w as its
+ * done_work.
+ */
+static void kcm_done_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, done_work);
+
+	kcm_done(kcm);
+}
+
+/* Detach the KCM socket currently attached to @psock for receive, if any.
+ *
+ * Under the mux rx lock the psock<->kcm receive link is cleared. If the KCM
+ * is marked done, kcm_done() is deferred to a work item. If the KCM is
+ * receive disabled, its queued messages are requeued to other KCMs. Otherwise
+ * kcm_rcv_ready() is called when @rcv_ready is set or the KCM has no receive
+ * memory allocated.
+ *
+ * Lower sock held
+ */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+ bool rcv_ready)
+{
+ struct kcm_sock *kcm = psock->rx_kcm;
+ struct kcm_mux *mux = psock->mux;
+
+ if (!kcm)
+ return;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ psock->rx_kcm = NULL;
+ kcm->rx_psock = NULL;
+
+ /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+ * kcm_rfree
+ */
+ smp_mb();
+
+ if (unlikely(kcm->done)) {
+ spin_unlock_bh(&mux->rx_lock);
+
+ /* Need to run kcm_done in a task since we need to acquire
+ * callback locks which may already be held here.
+ */
+ INIT_WORK(&kcm->done_work, kcm_done_work);
+ schedule_work(&kcm->done_work);
+ return;
+ }
+
+ if (unlikely(kcm->rx_disabled)) {
+ requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+ } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+ /* Check for degenerative race with rx_wait that all
+ * data was dequeued (accounted for in kcm_rfree).
+ */
+ kcm_rcv_ready(kcm);
+ }
+ spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Arm the psock's receive message timer using the lower socket's receive
+ * timeout.
+ *
+ * sk_rcvtimeo is a relative timeout in jiffies, while mod_timer() expects an
+ * absolute expiry time, so the timeout is added to the current jiffies value.
+ * No timer is armed when the timeout is zero or MAX_SCHEDULE_TIMEOUT (i.e. the
+ * socket has no finite receive timeout).
+ */
+static void kcm_start_rx_timer(struct kcm_psock *psock)
+{
+	long timeo = psock->sk->sk_rcvtimeo;
+
+	if (!timeo || timeo == MAX_SCHEDULE_TIMEOUT)
+		return;
+
+	mod_timer(&psock->rx_msg_timer, jiffies + timeo);
+}
+
+/* Macro to invoke filter function: calls prog->bpf_func with @ctx and the
+ * program's instruction array. Arguments are parenthesized so that any
+ * expression may be passed; note that @prog is evaluated twice.
+ */
+#define KCM_RUN_FILTER(prog, ctx) \
+	(*(prog)->bpf_func)((ctx), (prog)->insnsi)
+
+/* tcp_read_sock() receive callback (see psock_tcp_read_sock()).
+ *
+ * Consumes up to @orig_len bytes of @orig_skb starting at @orig_offset.
+ * Clones of the skb are chained onto the message being assembled in
+ * psock->rx_skb_head. The psock's BPF program is run on the head to learn the
+ * full message length; each completed message is queued to a KCM socket
+ * obtained from reserve_rx_kcm(), or left parked on the psock when none is
+ * available.
+ *
+ * Returns the number of bytes consumed. Failures are reported through
+ * desc->error; desc->count is cleared to stop reading when the rest of a
+ * message is not yet in the socket buffer.
+ *
+ * NOTE(review): once cloned_orig is set, the early "return 0" paths below
+ * (skb_unclone failure, WARN_ON(head->next), alloc_skb failure) skip the
+ * kfree_skb(orig_skb) at the end of the function -- the clone appears to be
+ * leaked there. Likewise the clone made at the top of the loop is not freed
+ * when skb_unclone(skb) fails, and the detached head is not freed on the
+ * EMSGSIZE/EPROTO paths (kcm_abort_rx_psock() does not free it). Confirm.
+ *
+ * Lower socket lock held
+ */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+ unsigned int orig_offset, size_t orig_len)
+{
+ struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+ struct kcm_rx_msg *rxm;
+ struct kcm_sock *kcm;
+ struct sk_buff *head, *skb;
+ size_t eaten = 0, cand_len;
+ ssize_t extra;
+ int err;
+ bool cloned_orig = false;
+
+ /* A completed message is still parked on the psock; consume nothing */
+ if (psock->ready_rx_msg)
+ return 0;
+
+ head = psock->rx_skb_head;
+ if (head) {
+ /* Message already in progress */
+
+ rxm = kcm_rx_msg(head);
+ if (unlikely(rxm->early_eaten)) {
+ /* Already some number of bytes on the receive sock
+ * data saved in rx_skb_head, just indicate they
+ * are consumed.
+ */
+ eaten = orig_len <= rxm->early_eaten ?
+ orig_len : rxm->early_eaten;
+ rxm->early_eaten -= eaten;
+
+ return eaten;
+ }
+
+ if (unlikely(orig_offset)) {
+ /* Getting data with a non-zero offset when a message is
+ * in progress is not expected. If it does happen, we
+ * need to clone and pull since we can't deal with
+ * offsets in the skbs for a message except in the head.
+ */
+ orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!orig_skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ if (!pskb_pull(orig_skb, orig_offset)) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ kfree_skb(orig_skb);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ cloned_orig = true;
+ orig_offset = 0;
+ }
+
+ if (!psock->rx_skb_nextp) {
+ /* We are going to append to the frags_list of head.
+ * Need to unshare the frag_list.
+ */
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = err;
+ return 0;
+ }
+
+ if (unlikely(skb_shinfo(head)->frag_list)) {
+ /* We can't append to an sk_buff that already
+ * has a frag_list. We create a new head, point
+ * the frag_list of that to the old head, and
+ * then are able to use the old head->next for
+ * appending to the message.
+ */
+ if (WARN_ON(head->next)) {
+ desc->error = -EINVAL;
+ return 0;
+ }
+
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ skb->len = head->len;
+ skb->data_len = head->len;
+ skb->truesize = head->truesize;
+ *kcm_rx_msg(skb) = *kcm_rx_msg(head);
+ psock->rx_skb_nextp = &head->next;
+ skb_shinfo(skb)->frag_list = head;
+ psock->rx_skb_head = skb;
+ head = skb;
+ } else {
+ psock->rx_skb_nextp =
+ &skb_shinfo(head)->frag_list;
+ }
+ }
+ }
+
+ while (eaten < orig_len) {
+ /* Always clone since we will consume something */
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ break;
+ }
+
+ /* Bytes of this skb not yet consumed */
+ cand_len = orig_len - eaten;
+
+ head = psock->rx_skb_head;
+ if (!head) {
+ /* Start of a new message: the clone becomes the head */
+ head = skb;
+ psock->rx_skb_head = head;
+ /* Will set rx_skb_nextp on next packet if needed */
+ psock->rx_skb_nextp = NULL;
+ rxm = kcm_rx_msg(head);
+ memset(rxm, 0, sizeof(*rxm));
+ rxm->offset = orig_offset + eaten;
+ } else {
+ /* Unclone since we may be appending to an skb that we
+ * already share a frag_list with.
+ */
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = err;
+ break;
+ }
+
+ rxm = kcm_rx_msg(head);
+ *psock->rx_skb_nextp = skb;
+ psock->rx_skb_nextp = &skb->next;
+ head->data_len += skb->len;
+ head->len += skb->len;
+ head->truesize += skb->truesize;
+ }
+
+ if (!rxm->full_len) {
+ ssize_t len;
+
+ /* Message length not known yet, ask the BPF program */
+ len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+ if (!len) {
+ /* Need more header to determine length */
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ kcm_start_rx_timer(psock);
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
+ WARN_ON(eaten != orig_len);
+ break;
+ } else if (len > psock->sk->sk_rcvbuf) {
+ /* Message length exceeds maximum allowed */
+ KCM_STATS_INCR(psock->stats.rx_msg_too_big);
+ desc->error = -EMSGSIZE;
+ psock->rx_skb_head = NULL;
+ kcm_abort_rx_psock(psock, EMSGSIZE, head);
+ break;
+ } else if (len <= (ssize_t)head->len -
+ skb->len - rxm->offset) {
+ /* Length must be into new skb (and also
+ * greater than zero)
+ */
+ KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
+ desc->error = -EPROTO;
+ psock->rx_skb_head = NULL;
+ kcm_abort_rx_psock(psock, EPROTO, head);
+ break;
+ }
+
+ rxm->full_len = len;
+ }
+
+ /* Bytes available beyond the end of the current message */
+ extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+ if (extra < 0) {
+ /* Message not complete yet. */
+ if (rxm->full_len - rxm->accum_len >
+ tcp_inq(psock->sk)) {
+ /* Don't have the whole messages in the socket
+ * buffer. Set psock->rx_need_bytes to wait for
+ * the rest of the message. Also, set "early
+ * eaten" since we've already buffered the skb
+ * but don't consume yet per tcp_read_sock.
+ */
+
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ kcm_start_rx_timer(psock);
+ }
+
+ psock->rx_need_bytes = rxm->full_len -
+ rxm->accum_len;
+ rxm->accum_len += cand_len;
+ rxm->early_eaten = cand_len;
+ KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
+ desc->count = 0; /* Stop reading socket */
+ break;
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ WARN_ON(eaten != orig_len);
+ break;
+ }
+
+ /* Positive extra indicates more bytes than needed for the
+ * message
+ */
+
+ WARN_ON(extra > cand_len);
+
+ eaten += (cand_len - extra);
+
+ /* Hurray, we have a new message! */
+ del_timer(&psock->rx_msg_timer);
+ psock->rx_skb_head = NULL;
+ KCM_STATS_INCR(psock->stats.rx_msgs);
+
+try_queue:
+ kcm = reserve_rx_kcm(psock, head);
+ if (!kcm) {
+ /* Unable to reserve a KCM, message is held in psock. */
+ break;
+ }
+
+ if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+ /* Should mean socket buffer full */
+ unreserve_rx_kcm(psock, false);
+ goto try_queue;
+ }
+ }
+
+ if (cloned_orig)
+ kfree_skb(orig_skb);
+
+ KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
+
+ return eaten;
+}
+
+/* Read from the lower TCP socket of @psock through tcp_read_sock(), using
+ * kcm_tcp_recv() as the per-skb callback, then release any KCM socket that
+ * was attached for receive. Returns the error left in the read descriptor
+ * (0 if none was set).
+ *
+ * Called with lock held on lower socket
+ */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+	read_descriptor_t rd;
+
+	rd.error = 0;
+	rd.count = 1; /* give more than one skb per call */
+	rd.arg.data = psock;
+
+	/* sk should be locked here, so okay to do tcp_read_sock */
+	tcp_read_sock(psock->sk, &rd, kcm_tcp_recv);
+
+	unreserve_rx_kcm(psock, true);
+
+	return rd.error;
+}
+
+/* Data-ready handler for the lower TCP socket.
+ * NOTE(review): presumably installed as the socket's sk_data_ready callback;
+ * the installation is not visible in this part of the file.
+ *
+ * Under the socket callback read lock: bails out if no psock is attached, if
+ * receive is stopped, if a completed message is still parked on the psock, or
+ * if the bytes the pending message still needs have not all arrived.
+ * Otherwise reads from the socket; on -ENOMEM the read is retried from the
+ * delayed work item.
+ *
+ * Lower sock lock held
+ */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+ struct kcm_psock *psock;
+
+ read_lock_bh(&sk->sk_callback_lock);
+
+ psock = (struct kcm_psock *)sk->sk_user_data;
+ if (unlikely(!psock || psock->rx_stopped))
+ goto out;
+
+ if (psock->ready_rx_msg)
+ goto out;
+
+ if (psock->rx_need_bytes) {
+ /* Wait until the rest of the message is in the socket buffer */
+ if (tcp_inq(sk) >= psock->rx_need_bytes)
+ psock->rx_need_bytes = 0;
+ else
+ goto out;
+ }
+
+ if (psock_tcp_read_sock(psock) == -ENOMEM)
+ queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/* Receive work for a psock, shared by the immediate and delayed work items.
+ *
+ * Takes the lower socket lock and the callback read lock, then reads from the
+ * socket unless the psock is no longer attached to it, receive is stopped, or
+ * a completed message is still parked on the psock. On -ENOMEM the read is
+ * retried through the delayed work item.
+ */
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+
+	/* We need the read lock to synchronize with psock_tcp_data_ready. We
+	 * need the socket lock for calling tcp_read_sock.
+	 */
+	lock_sock(csk);
+	read_lock_bh(&csk->sk_callback_lock);
+
+	if (unlikely(csk->sk_user_data != psock))
+		goto out;
+
+	if (unlikely(psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	/* psock_tcp_read_sock() builds its own read descriptor */
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&csk->sk_callback_lock);
+	release_sock(csk);
+}
+
+/* Work handler for psock->rx_work: run the receive work on the psock that
+ * embeds @w.
+ */
+static void psock_rx_work(struct work_struct *w)
+{
+	struct kcm_psock *psock = container_of(w, struct kcm_psock, rx_work);
+
+	do_psock_rx_work(psock);
+}
+
+/* Work handler for psock->rx_delayed_work: run the receive work on the psock
+ * that embeds the delayed work item containing @w.
+ */
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+	struct kcm_psock *psock = container_of(w, struct kcm_psock,
+					       rx_delayed_work.work);
+
+	do_psock_rx_work(psock);
+}
+
+/* State-change handler for the lower TCP socket: unconditionally reports an
+ * error on @sk.
+ * NOTE(review): presumably installed as the socket's sk_state_change callback;
+ * the installation is not visible in this part of the file.
+ */
+static void psock_tcp_state_change(struct sock *sk)
+{
+ /* TCP only does a POLLIN for a half close. Do a POLLHUP here
+ * since application will normally not poll with POLLIN
+ * on the TCP sockets.
+ */
+
+ report_csk_error(sk, EPIPE);
+}
+
+/* Write-space handler for the lower TCP socket. If a psock is attached to
+ * @sk and it is currently reserved by a KCM socket, that KCM's tx_work is
+ * queued so sending can resume. Runs under the socket callback read lock and
+ * the mux lock.
+ */
+static void psock_tcp_write_space(struct sock *sk)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux;
+	struct kcm_sock *kcm;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (likely(psock)) {
+		mux = psock->mux;
+
+		spin_lock_bh(&mux->lock);
+
+		/* Check if the socket is reserved so someone is waiting for
+		 * sending.
+		 */
+		kcm = psock->tx_kcm;
+		if (kcm)
+			queue_work(kcm_wq, &kcm->tx_work);
+
+		spin_unlock_bh(&mux->lock);
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* Get a psock for @kcm to transmit on.
+ *
+ * Returns the psock already reserved by the KCM if it is still usable (a
+ * reserved psock whose transmit side has been stopped is unreserved first).
+ * Otherwise, under the mux lock, takes the first psock from the mux's
+ * available list and reserves it, or, if none is available, puts the KCM on
+ * the mux's list of transmit waiters and returns NULL.
+ *
+ * kcm sock is locked.
+ */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct kcm_psock *psock;
+
+ psock = kcm->tx_psock;
+
+ smp_rmb(); /* Must read tx_psock before tx_wait */
+
+ if (psock) {
+ WARN_ON(kcm->tx_wait);
+ if (unlikely(psock->tx_stopped))
+ unreserve_psock(kcm);
+ else
+ return kcm->tx_psock;
+ }
+
+ spin_lock_bh(&mux->lock);
+
+ /* Check again under lock to see if psock was reserved for this
+ * psock via psock_unreserve.
+ */
+ psock = kcm->tx_psock;
+ if (unlikely(psock)) {
+ WARN_ON(kcm->tx_wait);
+ spin_unlock_bh(&mux->lock);
+ return kcm->tx_psock;
+ }
+
+ if (!list_empty(&mux->psocks_avail)) {
+ psock = list_first_entry(&mux->psocks_avail,
+ struct kcm_psock,
+ psock_avail_list);
+ list_del(&psock->psock_avail_list);
+ /* No longer waiting once a psock has been obtained */
+ if (kcm->tx_wait) {
+ list_del(&kcm->wait_psock_list);
+ kcm->tx_wait = false;
+ }
+ kcm->tx_psock = psock;
+ psock->tx_kcm = kcm;
+ KCM_STATS_INCR(psock->stats.reserved);
+ } else if (!kcm->tx_wait) {
+ list_add_tail(&kcm->wait_psock_list,
+ &mux->kcm_tx_waiters);
+ kcm->tx_wait = true;
+ }
+
+ spin_unlock_bh(&mux->lock);
+
+ /* NULL here means no psock was available and the KCM is waiting */
+ return psock;
+}
+
+/* A psock has become free for transmit. If no KCM socket is waiting for one
+ * it goes onto the mux's available list; otherwise it is handed to the first
+ * waiter, whose tx_work is queued.
+ *
+ * mux lock held
+ */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	if (list_empty(&mux->kcm_tx_waiters)) {
+		list_add_tail(&psock->psock_avail_list,
+			      &mux->psocks_avail);
+		return;
+	}
+
+	kcm = list_first_entry(&mux->kcm_tx_waiters,
+			       struct kcm_sock,
+			       wait_psock_list);
+	list_del(&kcm->wait_psock_list);
+	kcm->tx_wait = false;
+	psock->tx_kcm = kcm;
+
+	/* Commit before changing tx_psock since that is read in
+	 * reserve_psock before queuing work.
+	 */
+	smp_mb();
+
+	kcm->tx_psock = psock;
+	KCM_STATS_INCR(psock->stats.reserved);
+	queue_work(kcm_wq, &kcm->tx_work);
+}
+
+/* Release the psock that @kcm has reserved for transmit.
+ *
+ * Under the mux lock: folds the psock's tx stats into the mux, clears the
+ * kcm<->psock transmit link and counts the unreserve. A psock whose transmit
+ * side was stopped is not made available again; if it is also marked done it
+ * is removed from the mux and freed here. Otherwise the psock is offered to
+ * waiting KCMs or put on the available list via psock_now_avail().
+ *
+ * kcm sock is locked.
+ */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+ struct kcm_psock *psock;
+ struct kcm_mux *mux = kcm->mux;
+
+ spin_lock_bh(&mux->lock);
+
+ psock = kcm->tx_psock;
+
+ if (WARN_ON(!psock)) {
+ spin_unlock_bh(&mux->lock);
+ return;
+ }
+
+ smp_rmb(); /* Read tx_psock before tx_wait */
+
+ kcm_update_tx_mux_stats(mux, psock);
+
+ WARN_ON(kcm->tx_wait);
+
+ kcm->tx_psock = NULL;
+ psock->tx_kcm = NULL;
+ KCM_STATS_INCR(psock->stats.unreserved);
+
+ if (unlikely(psock->tx_stopped)) {
+ if (psock->done) {
+ /* Deferred free */
+ list_del(&psock->psock_list);
+ mux->psocks_cnt--;
+ sock_put(psock->sk);
+ fput(psock->sk->sk_socket->file);
+ kmem_cache_free(kcm_psockp, psock);
+ }
+
+ /* Don't put back on available list */
+
+ spin_unlock_bh(&mux->lock);
+
+ return;
+ }
+
+ psock_now_avail(psock);
+
+ spin_unlock_bh(&mux->lock);
+}
+
+/* Count a transmit retry in the stats of the mux that @kcm belongs to, under
+ * the mux lock.
+ */
+static void kcm_report_tx_retry(struct kcm_sock *kcm)
+{
+	struct kcm_mux *owner = kcm->mux;
+
+	spin_lock_bh(&owner->lock);
+	KCM_STATS_INCR(owner->stats.tx_retries);
+	spin_unlock_bh(&owner->lock);
+}
+
+/* Write any messages ready on the kcm socket. Called with kcm sock lock
+ * held. Return bytes actually sent or error.
+ *
+ * Messages on sk_write_queue are sent one at a time on the reserved psock,
+ * page fragment by page fragment, walking the head skb and then the skbs on
+ * its frag_list. When the lower socket returns -EAGAIN the position within
+ * the message is saved in the head's kcm_tx_msg so that a later call resumes
+ * there. On a hard send failure the psock is aborted and unreserved and the
+ * message is retried from the beginning on another psock.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+ struct sock *sk = &kcm->sk;
+ struct kcm_psock *psock;
+ struct sk_buff *skb, *head;
+ struct kcm_tx_msg *txm;
+ unsigned short fragidx, frag_offset;
+ unsigned int sent, total_sent = 0;
+ int ret = 0;
+
+ kcm->tx_wait_more = false;
+ psock = kcm->tx_psock;
+ if (unlikely(psock && psock->tx_stopped)) {
+ /* A reserved psock was aborted asynchronously. Unreserve
+ * it and we'll retry the message.
+ */
+ unreserve_psock(kcm);
+ kcm_report_tx_retry(kcm);
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return 0;
+
+ /* Forget partial progress so the message restarts from the top */
+ kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+ } else if (skb_queue_empty(&sk->sk_write_queue)) {
+ return 0;
+ }
+
+ head = skb_peek(&sk->sk_write_queue);
+ txm = kcm_tx_msg(head);
+
+ if (txm->sent) {
+ /* Send of first skbuff in queue already in progress */
+ if (WARN_ON(!psock)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Restore the saved position and jump back into the send loop */
+ sent = txm->sent;
+ frag_offset = txm->frag_offset;
+ fragidx = txm->fragidx;
+ skb = txm->frag_skb;
+
+ goto do_frag;
+ }
+
+try_again:
+ psock = reserve_psock(kcm);
+ if (!psock)
+ goto out;
+
+ do {
+ skb = head;
+ txm = kcm_tx_msg(head);
+ sent = 0;
+
+do_frag_list:
+ if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+ fragidx++) {
+ skb_frag_t *frag;
+
+ frag_offset = 0;
+do_frag:
+ frag = &skb_shinfo(skb)->frags[fragidx];
+ if (WARN_ON(!frag->size)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = kernel_sendpage(psock->sk->sk_socket,
+ frag->page.p,
+ frag->page_offset + frag_offset,
+ frag->size - frag_offset,
+ MSG_DONTWAIT);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ /* Save state to try again when there's
+ * write space on the socket
+ */
+ txm->sent = sent;
+ txm->frag_offset = frag_offset;
+ txm->fragidx = fragidx;
+ txm->frag_skb = skb;
+
+ ret = 0;
+ goto out;
+ }
+
+ /* Hard failure in sending message, abort this
+ * psock since it has lost framing
+ * synchronization and retry sending the
+ * message from the beginning.
+ */
+ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+ true);
+ unreserve_psock(kcm);
+
+ txm->sent = 0;
+ kcm_report_tx_retry(kcm);
+ ret = 0;
+
+ goto try_again;
+ }
+
+ sent += ret;
+ frag_offset += ret;
+ KCM_STATS_ADD(psock->stats.tx_bytes, ret);
+ if (frag_offset < frag->size) {
+ /* Not finished with this frag */
+ goto do_frag;
+ }
+ }
+
+ /* Move on to the next skb of this message, if there is one */
+ if (skb == head) {
+ if (skb_has_frag_list(skb)) {
+ skb = skb_shinfo(skb)->frag_list;
+ goto do_frag_list;
+ }
+ } else if (skb->next) {
+ skb = skb->next;
+ goto do_frag_list;
+ }
+
+ /* Successfully sent the whole packet, account for it. */
+ skb_dequeue(&sk->sk_write_queue);
+ kfree_skb(head);
+ sk->sk_wmem_queued -= sent;
+ total_sent += sent;
+ KCM_STATS_INCR(psock->stats.tx_msgs);
+ } while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+ if (!head) {
+ /* Done with all queued messages. */
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+ unreserve_psock(kcm);
+ }
+
+ /* Check if write space is available */
+ sk->sk_write_space(sk);
+
+ /* Bytes sent take precedence over the error/zero in ret */
+ return total_sent ? : ret;
+}
+
+/* Work handler for kcm->tx_work: with the KCM socket locked, push out queued
+ * messages. A negative result from kcm_write_msgs() is logged and reported as
+ * an error on the KCM socket; otherwise, if SOCK_NOSPACE is set on the
+ * socket, the flag is cleared and writers are woken through sk_write_space.
+ */
+static void kcm_tx_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+	struct sock *sk = &kcm->sk;
+	int err;
+
+	lock_sock(sk);
+
+	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+	 * aborts
+	 */
+	err = kcm_write_msgs(kcm);
+	if (err < 0) {
+		/* Hard failure in write, report error on KCM socket */
+		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
+		report_csk_error(&kcm->sk, -err);
+	} else if (likely(sk->sk_socket) &&
+		   test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Primarily for SOCK_SEQPACKET sockets */
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_space(sk);
+	}
+
+	release_sock(sk);
+}
+
+/* Flush queued messages if the KCM socket is holding them back waiting for
+ * more (tx_wait_more set); otherwise do nothing.
+ */
+static void kcm_push(struct kcm_sock *kcm)
+{
+	if (!kcm->tx_wait_more)
+		return;
+
+	kcm_write_msgs(kcm);
+}
+
+/* sendpage() handler for KCM sockets.
+ *
+ * Adds @size bytes of @page starting at @offset to the message currently
+ * being built on the socket (kcm->seq_skb), or starts a new message. The page
+ * is either coalesced into the last fragment, added as a new fragment, or,
+ * when the current skb already has MAX_SKB_FRAGS fragments, placed in a new
+ * skb chained to the message. Without MSG_MORE / MSG_SENDPAGE_NOTLAST the
+ * message is complete and is queued on sk_write_queue and, unless MSG_BATCH
+ * is set, written out.
+ *
+ * Returns @size on success or a negative error.
+ *
+ * When alloc_skb() fails, the function waits for memory and then retries the
+ * allocation; the loop exits once an skb is obtained or the wait fails.
+ */
+static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	bool eor;
+	int err = 0;
+	int i;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	/* No MSG_EOR from splice, only look at MSG_MORE */
+	eor = !(flags & MSG_MORE);
+
+	lock_sock(sk);
+
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	err = -EPIPE;
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		i = skb_shinfo(skb)->nr_frags;
+
+		if (skb_can_coalesce(skb, i, page, offset)) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+			goto coalesced;
+		}
+
+		if (i >= MAX_SKB_FRAGS) {
+			struct sk_buff *tskb;
+
+			tskb = alloc_skb(0, sk->sk_allocation);
+			while (!tskb) {
+				kcm_push(kcm);
+				err = sk_stream_wait_memory(sk, &timeo);
+				if (err)
+					goto out_error;
+
+				/* Memory may be available now, try again */
+				tskb = alloc_skb(0, sk->sk_allocation);
+			}
+
+			/* Chain the new skb onto the message */
+			if (head == skb)
+				skb_shinfo(head)->frag_list = tskb;
+			else
+				skb->next = tskb;
+
+			skb = tskb;
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			i = 0;
+		}
+	} else {
+		/* Call the sk_stream functions to manage the sndbuf mem. */
+		if (!sk_stream_memory_free(sk)) {
+			kcm_push(kcm);
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			err = sk_stream_wait_memory(sk, &timeo);
+			if (err)
+				goto out_error;
+		}
+
+		head = alloc_skb(0, sk->sk_allocation);
+		while (!head) {
+			kcm_push(kcm);
+			err = sk_stream_wait_memory(sk, &timeo);
+			if (err)
+				goto out_error;
+
+			/* Memory may be available now, try again */
+			head = alloc_skb(0, sk->sk_allocation);
+		}
+
+		skb = head;
+		i = 0;
+	}
+
+	get_page(page);
+	skb_fill_page_desc(skb, i, page, offset, size);
+	skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+coalesced:
+	skb->len += size;
+	skb->data_len += size;
+	skb->truesize += size;
+	sk->sk_wmem_queued += size;
+	sk_mem_charge(sk, size);
+
+	/* The head carries the totals for the whole message */
+	if (head != skb) {
+		head->len += size;
+		head->data_len += size;
+		head->truesize += size;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+		KCM_STATS_INCR(kcm->stats.tx_msgs);
+
+		if (flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	KCM_STATS_ADD(kcm->stats.tx_bytes, size);
+
+	release_sock(sk);
+	return size;
+
+out_error:
+	kcm_push(kcm);
+
+	err = sk_stream_error(sk, flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
+/* sendmsg() handler for KCM sockets.
+ *
+ * Copies the user data in @msg into page fragments of the message being built
+ * on the socket (continuing kcm->seq_skb if a message is already open,
+ * otherwise allocating a new head skb). When an skb has MAX_SKB_FRAGS
+ * fragments a new skb is chained to the message. End of record is the absence
+ * of MSG_MORE for SOCK_DGRAM sockets and the presence of MSG_EOR otherwise; a
+ * complete message is queued on sk_write_queue and, unless MSG_BATCH is set,
+ * written out.
+ *
+ * Returns the number of bytes copied, or a negative error. For
+ * SOCK_SEQPACKET an error after some bytes were copied is reported as partial
+ * success and the message is kept open.
+ */
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ struct sk_buff *skb = NULL, *head = NULL;
+ size_t copy, copied = 0;
+ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ int eor = (sock->type == SOCK_DGRAM) ?
+ !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+ int err = -EPIPE;
+
+ lock_sock(sk);
+
+ /* Per tcp_sendmsg this should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err)
+ goto out_error;
+
+ if (kcm->seq_skb) {
+ /* Previously opened message */
+ head = kcm->seq_skb;
+ skb = kcm_tx_msg(head)->last_skb;
+ goto start;
+ }
+
+ /* Call the sk_stream functions to manage the sndbuf mem. */
+ if (!sk_stream_memory_free(sk)) {
+ kcm_push(kcm);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ /* New message, alloc head skb */
+ head = alloc_skb(0, sk->sk_allocation);
+ while (!head) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+
+ head = alloc_skb(0, sk->sk_allocation);
+ }
+
+ skb = head;
+
+ /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+ * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+ */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+ while (msg_data_left(msg)) {
+ bool merge = true;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS) {
+ /* Current skb is full, chain a new one to the message */
+ struct sk_buff *tskb;
+
+ tskb = alloc_skb(0, sk->sk_allocation);
+ if (!tskb)
+ goto wait_for_memory;
+
+ if (head == skb)
+ skb_shinfo(head)->frag_list = tskb;
+ else
+ skb->next = tskb;
+
+ skb = tskb;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ continue;
+ }
+ merge = false;
+ }
+
+ /* Copy no more than what is left in the message or the page frag */
+ copy = min_t(int, msg_data_left(msg),
+ pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto out_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+
+ pfrag->offset += copy;
+ copied += copy;
+ if (head != skb) {
+ head->len += copy;
+ head->data_len += copy;
+ }
+
+ continue;
+
+wait_for_memory:
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ if (eor) {
+ bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+ /* Message complete, queue it on send buffer */
+ __skb_queue_tail(&sk->sk_write_queue, head);
+ kcm->seq_skb = NULL;
+ KCM_STATS_INCR(kcm->stats.tx_msgs);
+
+ if (msg->msg_flags & MSG_BATCH) {
+ kcm->tx_wait_more = true;
+ } else if (kcm->tx_wait_more || not_busy) {
+ err = kcm_write_msgs(kcm);
+ if (err < 0) {
+ /* We got a hard error in write_msgs but have
+ * already queued this message. Report an error
+ * in the socket, but don't affect return value
+ * from sendmsg
+ */
+ pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+ report_csk_error(&kcm->sk, -err);
+ }
+ }
+ } else {
+ /* Message not complete, save state */
+partial_message:
+ kcm->seq_skb = head;
+ kcm_tx_msg(head)->last_skb = skb;
+ }
+
+ KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
+
+ release_sock(sk);
+ return copied;
+
+out_error:
+ kcm_push(kcm);
+
+ if (copied && sock->type == SOCK_SEQPACKET) {
+ /* Wrote some bytes before encountering an
+ * error, return partial success.
+ */
+ goto partial_message;
+ }
+
+ /* Free a head allocated by this call; an already open message is kept */
+ if (head != kcm->seq_skb)
+ kfree_skb(head);
+
+ err = sk_stream_error(sk, msg->msg_flags, err);
+
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
+
+ release_sock(sk);
+ return err;
+}
+
+/* Wait until a message is available on the receive queue of @sk and return
+ * it without dequeuing.
+ *
+ * Returns NULL when the socket has a pending error (*err set from
+ * sock_error()), when SOCK_DONE is set (*err untouched), when the caller must
+ * not block or the timeout has run out (*err = -EAGAIN), or when a signal is
+ * pending after waiting (*err from sock_intr_errno()).
+ */
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	for (;;) {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+		if (skb)
+			return skb;
+
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		sk_wait_data(sk, &timeo, NULL);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+}
+
+/* recvmsg handler: copy up to len bytes of the message at the head of the
+ * receive queue into msg.
+ *
+ * Returns the number of bytes copied, or the error from kcm_wait_data() /
+ * skb_copy_datagram_msg() when nothing was copied. Unless MSG_PEEK is set,
+ * a fully read message (or a truncated one on SOCK_DGRAM) is removed from the
+ * queue and MSG_EOR is reported; otherwise the read position is advanced.
+ */
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+		       size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct kcm_rx_msg *rxm;
+	struct sk_buff *skb;
+	long timeo;
+	int copied = 0;
+	int err = 0;
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto out;
+
+	/* A message is available; never copy more than it still holds */
+	rxm = kcm_rx_msg(skb);
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+	if (err < 0)
+		goto out;
+
+	copied = len;
+
+	/* Peeking leaves the message and the statistics untouched */
+	if (unlikely(flags & MSG_PEEK))
+		goto out;
+
+	KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
+
+	if (copied < rxm->full_len) {
+		if (sock->type != SOCK_DGRAM) {
+			/* Partial read: remember where the next read starts */
+			rxm->offset += copied;
+			rxm->full_len -= copied;
+			goto out;
+		}
+
+		/* Datagram sockets discard the unread remainder */
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Finished with message */
+	msg->msg_flags |= MSG_EOR;
+	KCM_STATS_INCR(kcm->stats.rx_msgs);
+	skb_unlink(skb, &sk->sk_receive_queue);
+	kfree_skb(skb);
+
+out:
+	release_sock(sk);
+
+	return copied ? copied : err;
+}
+
+/* Splice callback passed to skb_splice_bits(): the sock lock is dropped
+ * around splice_to_pipe() and re-taken before returning its result.
+ */
+static ssize_t kcm_sock_splice(struct sock *sk,
+			       struct pipe_inode_info *pipe,
+			       struct splice_pipe_desc *spd)
+{
+	int spliced;
+
+	release_sock(sk);
+	spliced = splice_to_pipe(pipe, spd);
+	lock_sock(sk);
+
+	return spliced;
+}
+
+/* splice_read handler (wired up only in kcm_seqpacket_ops): splice up to len
+ * bytes of the message at the head of the receive queue into pipe.
+ *
+ * Returns the number of bytes spliced, or a negative errno from
+ * kcm_wait_data() / skb_splice_bits(). The message stays on the receive
+ * queue even when fully consumed; see the comment before the return.
+ */
+static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
+			       struct pipe_inode_info *pipe, size_t len,
+			       unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	long timeo;
+	struct kcm_rx_msg *rxm;
+	int err = 0;
+	/* Must be signed: skb_splice_bits() reports failure as a negative
+	 * value, which an unsigned type would turn into a huge byte count.
+	 */
+	ssize_t copied;
+	struct sk_buff *skb;
+
+	/* Only support splice for SOCK_SEQPACKET */
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto err_out;
+
+	/* Okay, have a message on the receive queue */
+
+	rxm = kcm_rx_msg(skb);
+
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags,
+				 kcm_sock_splice);
+	if (copied < 0) {
+		err = copied;
+		goto err_out;
+	}
+
+	KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
+
+	rxm->offset += copied;
+	rxm->full_len -= copied;
+
+	/* We have no way to return MSG_EOR. If all the bytes have been
+	 * read we still leave the message in the receive socket buffer.
+	 * A subsequent recvmsg needs to be done to return MSG_EOR and
+	 * finish reading the message.
+	 */
+
+	release_sock(sk);
+
+	return copied;
+
+err_out:
+	release_sock(sk);
+
+	return err;
+}
+
+/* Stop delivering received messages to this kcm (no-op if already disabled).
+ * Caller holds the kcm sock lock.
+ */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux;
+
+	if (kcm->rx_disabled)
+		return;
+
+	mux = kcm->mux;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 1;
+
+	/* If a psock is reserved we'll do cleanup in unreserve */
+	if (kcm->rx_psock)
+		goto unlock;
+
+	/* Leave the rx wait list and hand queued messages back to the mux */
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+
+	requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+
+unlock:
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Resume receiving on this kcm (no-op if receive is not disabled).
+ * Caller holds the kcm sock lock.
+ */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux;
+
+	if (!kcm->rx_disabled)
+		return;
+
+	mux = kcm->mux;
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm->rx_disabled = 0;
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* setsockopt handler for level SOL_KCM.
+ *
+ * The only option handled is KCM_RECV_DISABLE, taking an int: non-zero
+ * disables receive on this kcm, zero re-enables it.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or option, -EINVAL
+ * when optlen is smaller than an int and -EFAULT when optval cannot be read.
+ */
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, valbool;
+	int err = 0;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	/* A bad user pointer is a fault, not an invalid argument; this also
+	 * matches the -EFAULT returned by kcm_getsockopt().
+	 */
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		lock_sock(&kcm->sk);
+		if (valbool)
+			kcm_recv_disable(kcm);
+		else
+			kcm_recv_enable(kcm);
+		release_sock(&kcm->sk);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+	return err;
+}
+
+/* getsockopt handler for level SOL_KCM.
+ *
+ * The only option handled is KCM_RECV_DISABLE, which reports
+ * kcm->rx_disabled. At most sizeof(int) bytes are written to optval and the
+ * number of bytes written is stored back through optlen.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or option, -EINVAL
+ * for a negative length and -EFAULT when user memory cannot be accessed.
+ */
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, len;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/* Reject negative lengths before clamping: the unsigned min_t()
+	 * below would otherwise turn them into sizeof(int) and hide them.
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		val = kcm->rx_disabled;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Bind a freshly allocated kcm socket to mux: link it into mux->kcm_socks
+ * with the first unused index, set up its tx work item and mark it ready to
+ * receive.
+ */
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+	struct list_head *insert_after;
+	struct kcm_sock *cur;
+	int slot = 0;
+
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * POLLHUP
+	 */
+	kcm->sk.sk_state = TCP_ESTABLISHED;
+
+	/* Add to mux's kcm sockets list */
+	kcm->mux = mux;
+	spin_lock_bh(&mux->lock);
+
+	/* Walk entries while their indices count up from zero; the new
+	 * socket goes right after the last one that matched.
+	 */
+	insert_after = &mux->kcm_socks;
+	list_for_each_entry(cur, &mux->kcm_socks, kcm_sock_list) {
+		if (cur->index != slot)
+			break;
+		insert_after = &cur->kcm_sock_list;
+		slot++;
+	}
+
+	list_add(&kcm->kcm_sock_list, insert_after);
+	kcm->index = slot;
+
+	mux->kcm_socks_cnt++;
+	spin_unlock_bh(&mux->lock);
+
+	INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Timer callback for psock->rx_msg_timer (armed with the psock as argument):
+ * count the timeout and abort receive on the psock with ETIMEDOUT.
+ */
+static void kcm_rx_msg_timeout(unsigned long arg)
+{
+	struct kcm_psock *timed_out = (struct kcm_psock *)arg;
+
+	/* Message assembly timed out */
+	KCM_STATS_INCR(timed_out->stats.rx_msg_timeouts);
+	kcm_abort_rx_psock(timed_out, ETIMEDOUT, NULL);
+}
+
+/* Attach the lower socket csock to the MUX that the KCM socket sock uses.
+ *
+ * Only PF_INET/PF_INET6 sockets whose sk_protocol is IPPROTO_TCP are
+ * accepted (-EINVAL otherwise). A psock is allocated (-ENOMEM on failure),
+ * the lower sock's data_ready/write_space/state_change callbacks are
+ * replaced, the psock is linked into mux->psocks and RX work is queued.
+ * Every failure return happens before any reference or callback is taken.
+ * Returns 0 on success.
+ */
+static int kcm_attach(struct socket *sock, struct socket *csock,
+ struct bpf_prog *prog)
+{
+ struct kcm_sock *kcm = kcm_sk(sock->sk);
+ struct kcm_mux *mux = kcm->mux;
+ struct sock *csk;
+ struct kcm_psock *psock = NULL, *tpsock;
+ struct list_head *head;
+ int index = 0;
+
+ if (csock->ops->family != PF_INET &&
+ csock->ops->family != PF_INET6)
+ return -EINVAL;
+
+ csk = csock->sk;
+ if (!csk)
+ return -EINVAL;
+
+ /* Only support TCP for now */
+ if (csk->sk_protocol != IPPROTO_TCP)
+ return -EINVAL;
+
+ psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+ if (!psock)
+ return -ENOMEM;
+
+ psock->mux = mux;
+ psock->sk = csk;
+ psock->bpf_prog = prog;
+
+ setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout,
+ (unsigned long)psock);
+
+ INIT_WORK(&psock->rx_work, psock_rx_work);
+ INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+ /* Reference on the lower sock; kcm_unattach() drops it with sock_put()
+ * on its non-reserved path.
+ */
+ sock_hold(csk);
+
+ /* Save the original callbacks (kcm_unattach() restores them) and
+ * install the psock ones, all under the callback lock.
+ */
+ write_lock_bh(&csk->sk_callback_lock);
+ psock->save_data_ready = csk->sk_data_ready;
+ psock->save_write_space = csk->sk_write_space;
+ psock->save_state_change = csk->sk_state_change;
+ csk->sk_user_data = psock;
+ csk->sk_data_ready = psock_tcp_data_ready;
+ csk->sk_write_space = psock_tcp_write_space;
+ csk->sk_state_change = psock_tcp_state_change;
+ write_unlock_bh(&csk->sk_callback_lock);
+
+ /* Finished initialization, now add the psock to the MUX. */
+ spin_lock_bh(&mux->lock);
+ /* Walk entries while their indices count up from zero; the new psock
+ * is inserted after the last one that matched and gets that index.
+ */
+ head = &mux->psocks;
+ list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+ if (tpsock->index != index)
+ break;
+ head = &tpsock->psock_list;
+ index++;
+ }
+
+ list_add(&psock->psock_list, head);
+ psock->index = index;
+
+ KCM_STATS_INCR(mux->stats.psock_attach);
+ mux->psocks_cnt++;
+ psock_now_avail(psock);
+ spin_unlock_bh(&mux->lock);
+
+ /* Schedule RX work in case there are already bytes queued */
+ queue_work(kcm_wq, &psock->rx_work);
+
+ return 0;
+}
+
+/* SIOCKCMATTACH backend: resolve info->fd to a socket and info->bpf_fd to a
+ * BPF_PROG_TYPE_SOCKET_FILTER program, then attach them with kcm_attach().
+ *
+ * Returns 0 on success (the file and program references are then kept),
+ * -ENOENT if the fd is not a socket, -EINVAL for a program of another type,
+ * or the error from bpf_prog_get() / kcm_attach().
+ */
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+	struct bpf_prog *prog;
+	struct socket *csock;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	prog = bpf_prog_get(info->bpf_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out_put_file;
+	}
+
+	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		err = -EINVAL;
+		goto out_put_prog;
+	}
+
+	err = kcm_attach(sock, csock, prog);
+	if (err)
+		goto out_put_prog;
+
+	/* Success: the reference on the file is kept as well */
+	return 0;
+
+out_put_prog:
+	bpf_prog_put(prog);
+out_put_file:
+	fput(csock->file);
+	return err;
+}
+
+/* Detach psock from its lower socket and from the MUX.
+ *
+ * Restores the lower sock's saved callbacks, drops a ready but undelivered
+ * message, stops the RX timer and work items, releases the BPF program and
+ * folds the psock statistics into the mux. If the psock is not reserved for
+ * transmit (tx_kcm unset) it is unlinked and freed here; otherwise it is
+ * aborted, marked done and the reserving kcm's tx_work is queued.
+ */
+static void kcm_unattach(struct kcm_psock *psock)
+{
+ struct sock *csk = psock->sk;
+ struct kcm_mux *mux = psock->mux;
+
+ /* Stop getting callbacks from TCP socket. After this there should
+ * be no way to reserve a kcm for this psock.
+ */
+ write_lock_bh(&csk->sk_callback_lock);
+ csk->sk_user_data = NULL;
+ csk->sk_data_ready = psock->save_data_ready;
+ csk->sk_write_space = psock->save_write_space;
+ csk->sk_state_change = psock->save_state_change;
+ psock->rx_stopped = 1;
+
+ /* Unexpected state: warn and bail out with the callbacks already
+ * restored but the psock still linked and allocated.
+ */
+ if (WARN_ON(psock->rx_kcm)) {
+ write_unlock_bh(&csk->sk_callback_lock);
+ return;
+ }
+
+ spin_lock_bh(&mux->rx_lock);
+
+ /* Stop receiver activities. After this point psock should not be
+ * able to get onto ready list either through callbacks or work.
+ */
+ if (psock->ready_rx_msg) {
+ list_del(&psock->psock_ready_list);
+ kfree_skb(psock->ready_rx_msg);
+ psock->ready_rx_msg = NULL;
+ KCM_STATS_INCR(mux->stats.rx_ready_drops);
+ }
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ write_unlock_bh(&csk->sk_callback_lock);
+
+ /* The synchronous cancels run with no spinlock held */
+ del_timer_sync(&psock->rx_msg_timer);
+ cancel_work_sync(&psock->rx_work);
+ cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+ bpf_prog_put(psock->bpf_prog);
+
+ kfree_skb(psock->rx_skb_head);
+ psock->rx_skb_head = NULL;
+
+ spin_lock_bh(&mux->lock);
+
+ aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
+
+ KCM_STATS_INCR(mux->stats.psock_unattach);
+
+ if (psock->tx_kcm) {
+ /* psock was reserved. Just mark it finished and we will clean
+ * up in the kcm paths, we need kcm lock which can not be
+ * acquired here.
+ */
+ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
+ spin_unlock_bh(&mux->lock);
+
+ /* We are unattaching a socket that is reserved. Abort the
+ * socket since we may be out of sync in sending on it. We need
+ * to do this without the mux lock.
+ */
+ kcm_abort_tx_psock(psock, EPIPE, false);
+
+ spin_lock_bh(&mux->lock);
+ if (!psock->tx_kcm) {
+ /* psock now unreserved in window mux was unlocked */
+ goto no_reserved;
+ }
+ psock->done = 1;
+
+ /* Commit done before queuing work to process it */
+ smp_mb();
+
+ /* Queue tx work to make sure psock->done is handled */
+ queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+ spin_unlock_bh(&mux->lock);
+ } else {
+no_reserved:
+ /* Reached with mux->lock held, from either branch */
+ if (!psock->tx_stopped)
+ list_del(&psock->psock_avail_list);
+ list_del(&psock->psock_list);
+ mux->psocks_cnt--;
+ spin_unlock_bh(&mux->lock);
+
+ /* Drop the sock reference taken in kcm_attach() and a file
+ * reference, then free the psock itself.
+ * NOTE(review): csk->sk_socket is dereferenced after
+ * sock_put(csk) - confirm another reference keeps csk alive.
+ */
+ sock_put(csk);
+ fput(csk->sk_socket->file);
+ kmem_cache_free(kcm_psockp, psock);
+ }
+}
+
+/* SIOCKCMUNATTACH backend: find the psock on this kcm's MUX whose lower sock
+ * belongs to info->fd and unattach it.
+ *
+ * Returns 0 on success, -ENOENT if the fd is not a socket or no psock
+ * matches, -EINVAL if the socket has no sock, and -EALREADY if the psock is
+ * already being unattached (or is unexpectedly marked done).
+ */
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock, *match = NULL;
+	struct socket *csock;
+	struct sock *csk;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	csk = csock->sk;
+	if (!csk) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		if (psock->sk == csk) {
+			match = psock;
+			break;
+		}
+	}
+
+	if (!match) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (match->unattaching || WARN_ON(match->done)) {
+		err = -EALREADY;
+		goto out_unlock;
+	}
+
+	/* Claim the psock under the lock, then unattach without it */
+	match->unattaching = 1;
+	spin_unlock_bh(&mux->lock);
+
+	kcm_unattach(match);
+
+	err = 0;
+	goto out;
+
+out_unlock:
+	spin_unlock_bh(&mux->lock);
+out:
+	fput(csock->file);
+	return err;
+}
+
+/* Protocol descriptor passed to sk_alloc() for KCM socks; obj_size makes
+ * each allocated sock a full struct kcm_sock.
+ */
+static struct proto kcm_proto = {
+ .name = "KCM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket.
+ *
+ * Creates a new socket of the same type and ops as osock, backed by a new
+ * KCM sock bound to osock's MUX, and installs it on a new file descriptor.
+ * On success returns 0, stores the socket in *newsockp and the descriptor in
+ * info->fd. On failure returns a negative errno (-ENFILE, -ENOMEM or the
+ * error from get_unused_fd_flags() / sock_alloc_file()) with nothing
+ * installed.
+ */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+		     struct socket **newsockp)
+{
+	struct socket *newsock;
+	struct sock *newsk;
+	struct file *newfile;
+	int err, newfd;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out;
+
+	newsock->type = osock->type;
+	newsock->ops = osock->ops;
+
+	__module_get(newsock->ops->owner);
+
+	newfd = get_unused_fd_flags(0);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		goto out_fd_fail;
+	}
+
+	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		goto out_sock_alloc_fail;
+	}
+
+	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+			 &kcm_proto, true);
+	if (!newsk) {
+		err = -ENOMEM;
+		goto out_sk_alloc_fail;
+	}
+
+	sock_init_data(newsock, newsk);
+	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+	fd_install(newfd, newfile);
+	*newsockp = newsock;
+	info->fd = newfd;
+
+	return 0;
+
+out_sk_alloc_fail:
+	/* The file owns the socket from here on: dropping the file releases
+	 * the socket too, so do not fall through to sock_release() or the
+	 * socket would be released twice.
+	 */
+	fput(newfile);
+	put_unused_fd(newfd);
+	return err;
+out_sock_alloc_fail:
+	put_unused_fd(newfd);
+out_fd_fail:
+	sock_release(newsock);
+out:
+	return err;
+}
+
+/* ioctl handler for KCM sockets.
+ *
+ * Handles SIOCKCMATTACH, SIOCKCMUNATTACH and SIOCKCMCLONE; arg is a user
+ * pointer to the matching kcm_attach / kcm_unattach / kcm_clone structure.
+ * Returns 0 on success, -EFAULT if the structure cannot be copied from or to
+ * user space, -ENOIOCTLCMD for any other command, or the error returned by
+ * the command's backend.
+ */
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCKCMATTACH: {
+		struct kcm_attach info;
+
+		/* Bail out at once: running the backend on an uninitialised
+		 * info would also overwrite the -EFAULT result.
+		 */
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_attach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMUNATTACH: {
+		struct kcm_unattach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_unattach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMCLONE: {
+		struct kcm_clone info;
+		struct socket *newsock = NULL;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			return -EFAULT;
+
+		err = kcm_clone(sock, &info, &newsock);
+
+		if (!err) {
+			if (copy_to_user((void __user *)arg, &info,
+					 sizeof(info))) {
+				err = -EFAULT;
+				/* NOTE(review): kcm_clone() has already
+				 * installed a file for newsock on info.fd;
+				 * releasing the socket while that fd stays
+				 * open looks unsafe - confirm.
+				 */
+				sock_release(newsock);
+			}
+		}
+
+		break;
+	}
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+
+/* RCU callback queued by release_mux(): return the mux that embeds rcu to
+ * its slab cache.
+ */
+static void free_mux(struct rcu_head *rcu)
+{
+	struct kcm_mux *dead = container_of(rcu, struct kcm_mux, rcu);
+
+	kmem_cache_free(kcm_muxp, dead);
+}
+
+/* Tear down a mux: unattach every psock still on it, purge the rx hold
+ * queue, fold its statistics into the per-net totals, unlink it from the
+ * per-net mux list and free it through call_rcu(). If psocks are still
+ * counted after unattaching, warn and leave the mux in place.
+ */
+static void release_mux(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+	struct kcm_psock *cur, *next;
+
+	/* Release psocks */
+	list_for_each_entry_safe(cur, next, &mux->psocks, psock_list) {
+		if (WARN_ON(cur->unattaching))
+			continue;
+
+		kcm_unattach(cur);
+	}
+
+	if (WARN_ON(mux->psocks_cnt))
+		return;
+
+	__skb_queue_purge(&mux->rx_hold_queue);
+
+	mutex_lock(&knet->mutex);
+
+	aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
+	aggregate_psock_stats(&mux->aggregate_psock_stats,
+			      &knet->aggregate_psock_stats);
+
+	list_del_rcu(&mux->kcm_mux_list);
+	knet->count--;
+
+	mutex_unlock(&knet->mutex);
+
+	call_rcu(&mux->rcu, free_mux);
+}
+
+/* Final teardown of a kcm socket; kcm_release() calls this last.
+ *
+ * If a psock is still reserved for receive, only mark the kcm as done and
+ * rx_disabled and return. Otherwise take the kcm off the rx wait list,
+ * requeue its pending receive messages, unlink it from the MUX, release the
+ * MUX if this was its last kcm socket and drop a reference on the sock.
+ */
+static void kcm_done(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct sock *sk = &kcm->sk;
+ int socks_cnt;
+
+ spin_lock_bh(&mux->rx_lock);
+ if (kcm->rx_psock) {
+ /* Cleanup in unreserve_rx_kcm */
+ WARN_ON(kcm->done);
+ kcm->rx_disabled = 1;
+ kcm->done = 1;
+ spin_unlock_bh(&mux->rx_lock);
+ return;
+ }
+
+ if (kcm->rx_wait) {
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+ }
+ /* Move any pending receive messages to other kcm sockets */
+ requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ /* Receive memory still charged after the requeue is unexpected: warn
+ * and return with the kcm still linked to the mux.
+ */
+ if (WARN_ON(sk_rmem_alloc_get(sk)))
+ return;
+
+ /* Detach from MUX */
+ spin_lock_bh(&mux->lock);
+
+ list_del(&kcm->kcm_sock_list);
+ mux->kcm_socks_cnt--;
+ /* Sample the count under the lock; it is tested after unlocking */
+ socks_cnt = mux->kcm_socks_cnt;
+
+ spin_unlock_bh(&mux->lock);
+
+ if (!socks_cnt) {
+ /* We are done with the mux now. */
+ release_mux(mux);
+ }
+
+ WARN_ON(kcm->rx_wait);
+
+ sock_put(&kcm->sk);
+}
+
+/* Release handler (.release in both kcm proto_ops tables) that closes a KCM
+ * socket: frees a partially built send message, purges the write queue,
+ * takes the kcm off the tx wait list, cancels its tx work, aborts and
+ * unreserves a reserved tx psock, then hands over to kcm_done().
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ * Always returns 0.
+ */
+static int kcm_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm;
+ struct kcm_mux *mux;
+ struct kcm_psock *psock;
+
+ /* Nothing to do for a socket that never got a sock */
+ if (!sk)
+ return 0;
+
+ kcm = kcm_sk(sk);
+ mux = kcm->mux;
+
+ sock_orphan(sk);
+ kfree_skb(kcm->seq_skb);
+
+ lock_sock(sk);
+ /* Purge queue under lock to avoid race condition with tx_work trying
+ * to act when queue is nonempty. If tx_work runs after this point
+ * it will just return.
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+ release_sock(sk);
+
+ spin_lock_bh(&mux->lock);
+ if (kcm->tx_wait) {
+ /* Take off tx_wait list, after this point there should be no way
+ * that a psock will be assigned to this kcm.
+ */
+ list_del(&kcm->wait_psock_list);
+ kcm->tx_wait = false;
+ }
+ spin_unlock_bh(&mux->lock);
+
+ /* Cancel work. After this point there should be no outside references
+ * to the kcm socket.
+ */
+ cancel_work_sync(&kcm->tx_work);
+
+ lock_sock(sk);
+ psock = kcm->tx_psock;
+ if (psock) {
+ /* A psock was reserved, so we need to kill it since it
+ * may already have some bytes queued from a message. We
+ * need to do this after removing kcm from tx_wait list.
+ */
+ kcm_abort_tx_psock(psock, EPIPE, false);
+ unreserve_psock(kcm);
+ }
+ release_sock(sk);
+
+ WARN_ON(kcm->tx_wait);
+ WARN_ON(kcm->tx_psock);
+
+ sock->sk = NULL;
+
+ kcm_done(kcm);
+
+ return 0;
+}
+
+/* Socket operations installed by kcm_create() for SOCK_DGRAM KCM sockets.
+ * Operations KCM does not implement use the generic sock_no_*() stubs.
+ */
+static const struct proto_ops kcm_dgram_ops = {
+ .family = PF_KCM,
+ .owner = THIS_MODULE,
+ .release = kcm_release,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = kcm_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = kcm_setsockopt,
+ .getsockopt = kcm_getsockopt,
+ .sendmsg = kcm_sendmsg,
+ .recvmsg = kcm_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = kcm_sendpage,
+};
+
+/* Socket operations installed by kcm_create() for SOCK_SEQPACKET KCM
+ * sockets. Same as kcm_dgram_ops plus .splice_read.
+ */
+static const struct proto_ops kcm_seqpacket_ops = {
+ .family = PF_KCM,
+ .owner = THIS_MODULE,
+ .release = kcm_release,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = kcm_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = kcm_setsockopt,
+ .getsockopt = kcm_getsockopt,
+ .sendmsg = kcm_sendmsg,
+ .recvmsg = kcm_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = kcm_sendpage,
+ .splice_read = kcm_splice_read,
+};
+
+/* Create handler of the PF_KCM family: set up a new KCM socket together with
+ * a fresh MUX of its own.
+ *
+ * Returns 0 on success, -ESOCKTNOSUPPORT for a type other than SOCK_DGRAM or
+ * SOCK_SEQPACKET, -EPROTONOSUPPORT for a protocol other than
+ * KCMPROTO_CONNECTED, or -ENOMEM if the sock or the mux cannot be allocated.
+ */
+static int kcm_create(struct net *net, struct socket *sock,
+		      int protocol, int kern)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct kcm_mux *mux;
+	struct sock *sk;
+
+	if (sock->type == SOCK_DGRAM)
+		sock->ops = &kcm_dgram_ops;
+	else if (sock->type == SOCK_SEQPACKET)
+		sock->ops = &kcm_seqpacket_ops;
+	else
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol != KCMPROTO_CONNECTED)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	/* The mux is shared between KCM sockets; this one starts it */
+	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+	if (!mux) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&mux->lock);
+	spin_lock_init(&mux->rx_lock);
+
+	INIT_LIST_HEAD(&mux->kcm_socks);
+	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+	INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+	INIT_LIST_HEAD(&mux->psocks);
+	INIT_LIST_HEAD(&mux->psocks_ready);
+	INIT_LIST_HEAD(&mux->psocks_avail);
+
+	mux->knet = knet;
+
+	/* Publish the new mux on the per-net list */
+	mutex_lock(&knet->mutex);
+	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+	knet->count++;
+	mutex_unlock(&knet->mutex);
+
+	skb_queue_head_init(&mux->rx_hold_queue);
+
+	/* Finally initialise the KCM socket and bind it to the mux */
+	sock_init_data(sock, sk);
+	init_kcm_sock(kcm_sk(sk), mux);
+
+	return 0;
+}
+
+/* PF_KCM address family, registered with sock_register() in kcm_init();
+ * new sockets of this family are created by kcm_create().
+ */
+static struct net_proto_family kcm_family_ops = {
+ .family = PF_KCM,
+ .create = kcm_create,
+ .owner = THIS_MODULE,
+};
+
+/* Per-net init hook (.init of kcm_net_ops): set up the namespace's mux list
+ * and the mutex that kcm_create()/release_mux() take when changing it.
+ * Always returns 0.
+ */
+static __net_init int kcm_init_net(struct net *net)
+{
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ INIT_LIST_HEAD_RCU(&knet->mux_list);
+ mutex_init(&knet->mutex);
+
+ return 0;
+}
+
+/* Per-net exit hook (.exit of kcm_net_ops): frees nothing itself, it only
+ * warns if any mux is still on the namespace's list.
+ */
+static __net_exit void kcm_exit_net(struct net *net)
+{
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ /* All KCM sockets should be closed at this point, which should mean
+ * that all multiplexors and psocks have been destroyed.
+ */
+ WARN_ON(!list_empty(&knet->mux_list));
+}
+
+/* Per-network-namespace hooks, registered with register_pernet_device() in
+ * kcm_init(). .id and .size describe the struct kcm_net that net_generic()
+ * returns for kcm_net_id.
+ */
+static struct pernet_operations kcm_net_ops = {
+ .init = kcm_init_net,
+ .exit = kcm_exit_net,
+ .id = &kcm_net_id,
+ .size = sizeof(struct kcm_net),
+};
+
+/* Module init: create the mux and psock slab caches and the "kkcmd"
+ * workqueue, then register the protocol, the PF_KCM family, the per-net
+ * operations and the proc entries, in that order.
+ *
+ * Returns 0 on success. On failure everything set up so far is undone
+ * through the label chain below and the failing step's error is returned
+ * (-ENOMEM for the caches and the workqueue).
+ */
+static int __init kcm_init(void)
+{
+ int err = -ENOMEM;
+
+ kcm_muxp = kmem_cache_create("kcm_mux_cache",
+ sizeof(struct kcm_mux), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ if (!kcm_muxp)
+ goto fail;
+
+ kcm_psockp = kmem_cache_create("kcm_psock_cache",
+ sizeof(struct kcm_psock), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ if (!kcm_psockp)
+ goto fail;
+
+ kcm_wq = create_singlethread_workqueue("kkcmd");
+ if (!kcm_wq)
+ goto fail;
+
+ err = proto_register(&kcm_proto, 1);
+ if (err)
+ goto fail;
+
+ err = sock_register(&kcm_family_ops);
+ if (err)
+ goto sock_register_fail;
+
+ err = register_pernet_device(&kcm_net_ops);
+ if (err)
+ goto net_ops_fail;
+
+ err = kcm_proc_init();
+ if (err)
+ goto proc_init_fail;
+
+ return 0;
+
+ /* Unwind in reverse order of setup; each label falls through */
+proc_init_fail:
+ unregister_pernet_device(&kcm_net_ops);
+
+net_ops_fail:
+ sock_unregister(PF_KCM);
+
+sock_register_fail:
+ proto_unregister(&kcm_proto);
+
+fail:
+ /* Shared by all early failures, so either cache or the workqueue may
+ * not exist yet when this is reached.
+ */
+ kmem_cache_destroy(kcm_muxp);
+ kmem_cache_destroy(kcm_psockp);
+
+ if (kcm_wq)
+ destroy_workqueue(kcm_wq);
+
+ return err;
+}
+
+/* Module exit: undo kcm_init() - remove the proc entries, unregister the
+ * per-net operations, the PF_KCM family and the protocol, then destroy the
+ * workqueue and both slab caches.
+ */
+static void __exit kcm_exit(void)
+{
+ kcm_proc_exit();
+ unregister_pernet_device(&kcm_net_ops);
+ sock_unregister(PF_KCM);
+ proto_unregister(&kcm_proto);
+ destroy_workqueue(kcm_wq);
+
+ kmem_cache_destroy(kcm_muxp);
+ kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 1b8a5caa221e..3a8f881b22f1 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -327,7 +327,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* prepare A-MPDU MLME for Rx aggregation */
- tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+ tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
if (!tid_agg_rx)
goto end;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1630975c89f1..804575ff7af5 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -92,7 +92,7 @@ struct ieee80211_fragment_entry {
u16 extra_len;
u16 last_frag;
u8 rx_queue;
- bool ccmp; /* Whether fragments were encrypted with CCMP */
+ bool check_sequential_pn; /* needed for CCMP/GCMP */
u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
};
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 3ece7d1034c8..b54f398cda5d 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
* computing cur_tp
*/
tmp_mrs = &mi->r[idx].stats;
- tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
return tmp_cur_tp;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 3928dbd24e25..370d677b547b 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
(max_tp_group != MINSTREL_CCK_GROUP))
return;
+ max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
+ max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+ max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+
if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
mrs->prob_ewma);
if (cur_tp_avg > tmp_tp_avg)
mi->max_prob_rate = index;
- max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
- max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
- max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group,
max_gpr_idx,
max_gpr_prob);
@@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
} else {
if (mrs->prob_ewma > tmp_prob)
mi->max_prob_rate = index;
- if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma)
+ if (mrs->prob_ewma > max_gpr_prob)
mg->max_group_prob_rate = index;
}
}
@@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
if (likely(sta->ampdu_mlme.tid_tx[tid]))
return;
- ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+ ieee80211_start_tx_ba_session(pubsta, tid, 0);
}
static void
@@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
* - if station is in dynamic SMPS (and streams > 1)
* - for fallback rates, to increase chances of getting through
*/
- if (offset > 0 &&
+ if (offset > 0 ||
(mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC &&
group->streams > 1)) {
ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
@@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
prob = mi->groups[i].rates[j].prob_ewma;
/* convert tp_avg from pkt per second in kbps */
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+ tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
return tp_avg;
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 5690e4c67486..dc27becb9b71 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1777,7 +1777,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
entry->seq = seq;
entry->rx_queue = rx_queue;
entry->last_frag = frag;
- entry->ccmp = 0;
+ entry->check_sequential_pn = false;
entry->extra_len = 0;
return entry;
@@ -1873,15 +1873,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->seqno_idx, &(rx->skb));
if (rx->key &&
(rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
- rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
ieee80211_has_protected(fc)) {
int queue = rx->security_idx;
- /* Store CCMP PN so that we can verify that the next
- * fragment has a sequential PN value. */
- entry->ccmp = 1;
+
+ /* Store CCMP/GCMP PN so that we can verify that the
+ * next fragment has a sequential PN value.
+ */
+ entry->check_sequential_pn = true;
memcpy(entry->last_pn,
rx->key->u.ccmp.rx_pn[queue],
IEEE80211_CCMP_PN_LEN);
+ BUILD_BUG_ON(offsetof(struct ieee80211_key,
+ u.ccmp.rx_pn) !=
+ offsetof(struct ieee80211_key,
+ u.gcmp.rx_pn));
+ BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+ sizeof(rx->key->u.gcmp.rx_pn[queue]));
+ BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+ IEEE80211_GCMP_PN_LEN);
}
return RX_QUEUED;
}
@@ -1896,15 +1908,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
return RX_DROP_MONITOR;
}
- /* Verify that MPDUs within one MSDU have sequential PN values.
- * (IEEE 802.11i, 8.3.3.4.5) */
- if (entry->ccmp) {
+ /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+ * MPDU PN values are not incrementing in steps of 1."
+ * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+ * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+ */
+ if (entry->check_sequential_pn) {
int i;
u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
int queue;
+
if (!rx->key ||
(rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
- rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
return RX_DROP_UNUSABLE;
memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -3473,6 +3491,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
return false;
/* ignore action frames to TDLS-peers */
if (ieee80211_is_action(hdr->frame_control) &&
+ !is_broadcast_ether_addr(bssid) &&
!ether_addr_equal(bssid, hdr->addr1))
return false;
}
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0328f7250693..299edc6add5a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+ proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops);
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
unregister_ip_vs_app(ipvs, NULL /* all */);
- remove_proc_entry("ip_vs_app", net->proc_net);
+ remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e7c1b052c2a3..404b2a4f4b5b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = svc->ipvs;
- pr_info("%s: enter\n", __func__);
-
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services--;
@@ -3947,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
int i, idx;
/* Initialize rs_table */
@@ -3974,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
spin_lock_init(&ipvs->tot_stats.lock);
- proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
- proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
- proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
&ip_vs_stats_percpu_fops);
if (ip_vs_control_net_init_sysctl(ipvs))
@@ -3991,13 +3988,11 @@ err:
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
- remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
- remove_proc_entry("ip_vs_stats", net->proc_net);
- remove_proc_entry("ip_vs", net->proc_net);
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
free_percpu(ipvs->tot_stats.cpustats);
}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
#include <net/netfilter/nft_masq.h>
const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
- [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 },
+ [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(nft_masq_policy);
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
if (err)
return err;
- if (tb[NFTA_MASQ_FLAGS] == NULL)
- return 0;
-
- priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
+ if (tb[NFTA_MASQ_FLAGS]) {
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+ if (priv->flags & ~NF_NAT_RANGE_MASK)
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+ priv->sreg_proto_min =
+ nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+ err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+ priv->sreg_proto_max =
+ nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+ err = nft_validate_register_load(priv->sreg_proto_max,
+ plen);
+ if (err < 0)
+ return err;
+ } else {
+ priv->sreg_proto_max = priv->sreg_proto_min;
+ }
+ }
return 0;
}
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_masq *priv = nft_expr_priv(expr);
- if (priv->flags == 0)
- return 0;
-
- if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+ if (priv->flags != 0 &&
+ nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
goto nla_put_failure;
+ if (priv->sreg_proto_min) {
+ if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+ priv->sreg_proto_min) ||
+ nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+ priv->sreg_proto_max))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe885bf271c5..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -28,6 +28,8 @@
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
void nft_meta_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
#endif
+ case NFT_META_PRANDOM: {
+ struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+ *dest = prandom_u32_state(state);
+ break;
+ }
default:
WARN_ON(1);
goto err;
@@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_OIFNAME:
len = IFNAMSIZ;
break;
+ case NFT_META_PRANDOM:
+ prandom_init_once(&nft_prandom_state);
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c8a0b7da5ff4..d0cd2b9bf844 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -694,12 +694,45 @@ EXPORT_SYMBOL(xt_free_table_info);
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
- struct xt_table *t;
+ struct xt_table *t, *found = NULL;
mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &net->xt.tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
+
+ if (net == &init_net)
+ goto out;
+
+ /* Table doesn't exist in this netns, re-try init */
+ list_for_each_entry(t, &init_net.xt.tables[af], list) {
+ if (strcmp(t->name, name))
+ continue;
+ if (!try_module_get(t->me))
+ return NULL;
+
+ mutex_unlock(&xt[af].mutex);
+ if (t->table_init(net) != 0) {
+ module_put(t->me);
+ return NULL;
+ }
+
+ found = t;
+
+ mutex_lock(&xt[af].mutex);
+ break;
+ }
+
+ if (!found)
+ goto out;
+
+ /* and once again: */
+ list_for_each_entry(t, &net->xt.tables[af], list)
+ if (strcmp(t->name, name) == 0)
+ return t;
+
+ module_put(found->me);
+ out:
mutex_unlock(&xt[af].mutex);
return NULL;
}
@@ -1170,20 +1203,20 @@ static const struct file_operations xt_target_ops = {
#endif /* CONFIG_PROC_FS */
/**
- * xt_hook_link - set up hooks for a new table
+ * xt_hook_ops_alloc - set up hooks for a new table
* @table: table with metadata needed to set up hooks
* @fn: Hook function
*
- * This function will take care of creating and registering the necessary
- * Netfilter hooks for XT tables.
+ * This function will create the nf_hook_ops that the x_table needs
+ * to hand to xt_hook_link_net().
*/
-struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+struct nf_hook_ops *
+xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
{
unsigned int hook_mask = table->valid_hooks;
uint8_t i, num_hooks = hweight32(hook_mask);
uint8_t hooknum;
struct nf_hook_ops *ops;
- int ret;
ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
if (ops == NULL)
@@ -1200,27 +1233,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
++i;
}
- ret = nf_register_hooks(ops, num_hooks);
- if (ret < 0) {
- kfree(ops);
- return ERR_PTR(ret);
- }
-
return ops;
}
-EXPORT_SYMBOL_GPL(xt_hook_link);
-
-/**
- * xt_hook_unlink - remove hooks for a table
- * @ops: nf_hook_ops array as returned by nf_hook_link
- * @hook_mask: the very same mask that was passed to nf_hook_link
- */
-void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
-{
- nf_unregister_hooks(ops, hweight32(table->valid_hooks));
- kfree(ops);
-}
-EXPORT_SYMBOL_GPL(xt_hook_unlink);
+EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
int xt_proto_init(struct net *net, u_int8_t af)
{
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 4e3c3affd285..2455b69b5810 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -262,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
if (f->opt[optnum].kind == (*optp)) {
__u32 len = f->opt[optnum].length;
const __u8 *optend = optp + len;
- int loop_cont = 0;
fmatch = FMATCH_OK;
@@ -275,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
mss = ntohs((__force __be16)mss);
break;
case OSFOPT_TS:
- loop_cont = 1;
break;
}
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f0cb92f3ddaf..ada67422234b 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl {
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
#define netlbl_domhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
-static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
-static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
+static struct netlbl_domhsh_tbl *netlbl_domhsh;
+static struct netlbl_dom_map *netlbl_domhsh_def;
/*
* Domain Hash Table Helper Functions
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index b0380927f05f..9eaa9a1e8629 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg {
static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
#define netlbl_unlhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
-static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
-static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def;
/* Accept unlabeled packets flag */
-static u8 netlabel_unlabel_acceptflg = 0;
+static u8 netlabel_unlabel_acceptflg;
/* NetLabel Generic NETLINK unlabeled family */
static struct genl_family netlbl_unlabel_gnl_family = {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d41b1074cb2d..1ecfa710ca98 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1915,6 +1915,10 @@ retry:
goto retry;
}
+ if (!dev_validate_header(dev, skb->data, len)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
!packet_extra_vlan_len_allowed(dev, skb)) {
err = -EMSGSIZE;
@@ -2393,18 +2397,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static bool ll_header_truncated(const struct net_device *dev, int len)
-{
- /* net device doesn't like empty head */
- if (unlikely(len < dev->hard_header_len)) {
- net_warn_ratelimited("%s: packet size is too short (%d < %d)\n",
- current->comm, len, dev->hard_header_len);
- return true;
- }
-
- return false;
-}
-
static void tpacket_set_protocol(const struct net_device *dev,
struct sk_buff *skb)
{
@@ -2522,16 +2514,20 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
if (unlikely(err < 0))
return -EINVAL;
} else if (copylen) {
+ int hdrlen = min_t(int, copylen, tp_len);
+
skb_push(skb, dev->hard_header_len);
skb_put(skb, copylen - dev->hard_header_len);
- err = skb_store_bits(skb, 0, data, copylen);
+ err = skb_store_bits(skb, 0, data, hdrlen);
if (unlikely(err))
return err;
+ if (!dev_validate_header(dev, skb->data, hdrlen))
+ return -EINVAL;
if (!skb->protocol)
tpacket_set_protocol(dev, skb);
- data += copylen;
- to_write -= copylen;
+ data += hdrlen;
+ to_write -= hdrlen;
}
offset = offset_in_page(data);
@@ -2703,13 +2699,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
copylen = __virtio16_to_cpu(vio_le(),
vnet_hdr->hdr_len);
}
- if (dev->hard_header_len) {
- if (ll_header_truncated(dev, tp_len)) {
- tp_len = -EINVAL;
- goto tpacket_error;
- }
- copylen = max_t(int, copylen, dev->hard_header_len);
- }
+ copylen = max_t(int, copylen, dev->hard_header_len);
skb = sock_alloc_send_skb(&po->sk,
hlen + tlen + sizeof(struct sockaddr_ll) +
(copylen - dev->hard_header_len),
@@ -2905,9 +2895,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
if (unlikely(offset < 0))
goto out_free;
- } else {
- if (ll_header_truncated(dev, len))
- goto out_free;
}
/* Returns -EFAULT on error */
@@ -2915,6 +2902,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (err)
goto out_free;
+ if (sock->type == SOCK_RAW &&
+ !dev_validate_header(dev, skb->data, len)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 6e7ec257790d..c589a9ba506a 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(ife_get_meta_u16);
int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
{
- mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
if (!mi->metaval)
return -ENOMEM;
@@ -118,7 +118,7 @@ EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
{
- mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL);
+ mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
if (!mi->metaval)
return -ENOMEM;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 89c41a1f3589..350e134cffb3 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -66,6 +66,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
+ .family = NFPROTO_IPV4,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
@@ -219,6 +220,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
par.hooknum = ipt->tcfi_hook;
par.target = ipt->tcfi_t->u.kernel.target;
par.targinfo = ipt->tcfi_t->data;
+ par.family = NFPROTO_IPV4;
ret = par.target->target(skb, &par);
switch (ret) {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 95b021243233..2181ffc76638 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
+static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_flower_offload offload = {0};
+ struct tc_to_netdev tc;
+
+ if (!tc_should_offload(dev, 0))
+ return;
+
+ offload.command = TC_CLSFLOWER_DESTROY;
+ offload.cookie = cookie;
+
+ tc.type = TC_SETUP_CLSFLOWER;
+ tc.cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+ struct flow_dissector *dissector,
+ struct fl_flow_key *mask,
+ struct fl_flow_key *key,
+ struct tcf_exts *actions,
+ unsigned long cookie, u32 flags)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_flower_offload offload = {0};
+ struct tc_to_netdev tc;
+
+ if (!tc_should_offload(dev, flags))
+ return;
+
+ offload.command = TC_CLSFLOWER_REPLACE;
+ offload.cookie = cookie;
+ offload.dissector = dissector;
+ offload.mask = mask;
+ offload.key = key;
+ offload.exts = actions;
+
+ tc.type = TC_SETUP_CLSFLOWER;
+ tc.cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
static bool fl_destroy(struct tcf_proto *tp, bool force)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
return false;
list_for_each_entry_safe(f, next, &head->filters, list) {
+ fl_hw_destroy_filter(tp, (unsigned long)f);
list_del_rcu(&f->list);
call_rcu(&f->rcu, fl_destroy_filter);
}
@@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
struct cls_fl_filter *fnew;
struct nlattr *tb[TCA_FLOWER_MAX + 1];
struct fl_flow_mask mask = {};
+ u32 flags = 0;
int err;
if (!tca[TCA_OPTIONS])
@@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
fnew->handle = handle;
+ if (tb[TCA_FLOWER_FLAGS])
+ flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
if (err)
goto errout;
@@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
head->ht_params);
if (err)
goto errout;
- if (fold)
+
+ fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ &fnew->key,
+ &fnew->exts,
+ (unsigned long)fnew,
+ flags);
+
+ if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
+ fl_hw_destroy_filter(tp, (unsigned long)fold);
+ }
*arg = (unsigned long) fnew;
@@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
rhashtable_remove_fast(&head->ht, &f->ht_node,
head->ht_params);
list_del_rcu(&f->list);
+ fl_hw_destroy_filter(tp, (unsigned long)f);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fl_destroy_filter);
return 0;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d0dff0cd8186..34b4ddaca27c 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -276,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
- skb = p->q->ops->dequeue(p->q);
+ skb = qdisc_dequeue_peeked(p->q);
if (skb == NULL)
return NULL;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index ec529121f38a..ce46f1c7f133 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
}
return 0;
}
+ if (addr1->v6.sin6_port != addr2->v6.sin6_port)
+ return 0;
if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
return 0;
/* If this is a linklocal address, compare the scope_id. */
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index cfc3c7101a38..5cfac8d5d3b3 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -480,7 +480,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
{
struct sctp_association *assoc;
- struct sctp_transport *tsp;
+ struct sctp_transport *transport, *tsp;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
@@ -488,10 +488,10 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
return 0;
}
- tsp = (struct sctp_transport *)v;
- if (!sctp_transport_hold(tsp))
+ transport = (struct sctp_transport *)v;
+ if (!sctp_transport_hold(transport))
return 0;
- assoc = tsp->asoc;
+ assoc = transport->asoc;
list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
transports) {
@@ -544,7 +544,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
- sctp_transport_put(tsp);
+ sctp_transport_put(transport);
return 0;
}
diff --git a/net/socket.c b/net/socket.c
index c044d1e8508c..886649c88d8f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = {
* NULL is returned.
*/
-static struct socket *sock_alloc(void)
+struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
@@ -554,6 +554,7 @@ static struct socket *sock_alloc(void)
this_cpu_add(sockets_in_use, 1);
return sock;
}
+EXPORT_SYMBOL(sock_alloc);
/**
* sock_release - close a socket
@@ -1874,7 +1875,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
- struct used_address *used_address)
+ struct used_address *used_address,
+ unsigned int allowed_msghdr_flags)
{
struct compat_msghdr __user *msg_compat =
(struct compat_msghdr __user *)msg;
@@ -1900,6 +1902,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
+ flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
@@ -1978,7 +1981,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
if (!sock)
goto out;
- err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+ err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
fput_light(sock->file, fput_needed);
out:
@@ -2005,6 +2008,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
struct used_address used_address;
+ unsigned int oflags = flags;
if (vlen > UIO_MAXIOV)
vlen = UIO_MAXIOV;
@@ -2019,11 +2023,15 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
entry = mmsg;
compat_entry = (struct compat_mmsghdr __user *)mmsg;
err = 0;
+ flags |= MSG_BATCH;
while (datagrams < vlen) {
+ if (datagrams == vlen - 1)
+ flags = oflags;
+
if (MSG_CMSG_COMPAT & flags) {
err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
- &msg_sys, flags, &used_address);
+ &msg_sys, flags, &used_address, MSG_EOR);
if (err < 0)
break;
err = __put_user(err, &compat_entry->msg_len);
@@ -2031,7 +2039,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
} else {
err = ___sys_sendmsg(sock,
(struct user_msghdr __user *)entry,
- &msg_sys, flags, &used_address);
+ &msg_sys, flags, &used_address, MSG_EOR);
if (err < 0)
break;
err = put_user(err, &entry->msg_len);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 799e65b944b9..cabf586f47d7 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -740,7 +740,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
default:
printk(KERN_CRIT "%s: bad return from "
"gss_fill_context: %zd\n", __func__, err);
- BUG();
+ gss_msg->msg.errno = -EIO;
}
goto err_release_msg;
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b32fd602669..273bc3a35425 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1225,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
if (bp[0] == '\\' && bp[1] == 'x') {
/* HEX STRING */
bp += 2;
- while (len < bufsize) {
+ while (len < bufsize - 1) {
int h, l;
h = hex_to_bin(bp[0]);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index cc1251d07297..2dcd7640eeb5 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -341,6 +341,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
rqst->rq_reply_bytes_recvd = 0;
rqst->rq_bytes_sent = 0;
rqst->rq_xid = headerp->rm_xid;
+
+ rqst->rq_private_buf.len = size;
set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
buf = &rqst->rq_rcv_buf;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 47f7da58a7f0..8b5833c1ff2e 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -1093,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
.cb = cb,
.idx = idx,
};
+ int err;
- switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb);
+ err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
+ switchdev_port_fdb_dump_cb);
+ cb->args[1] = err;
return dump.idx;
}
EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index e401108360a2..ae469b37d852 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -412,11 +412,6 @@ enomem:
return -ENOMEM;
}
-void tipc_bcast_reinit(struct net *net)
-{
- tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net));
-}
-
void tipc_bcast_stop(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 1944c6c00bb9..d5e79b3767fd 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -46,7 +46,6 @@ struct tipc_node_map;
extern const char tipc_bclink_name[];
int tipc_bcast_init(struct net *net);
-void tipc_bcast_reinit(struct net *net);
void tipc_bcast_stop(struct net *net);
void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
struct sk_buff_head *xmitq);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 802ffad3200d..27a5406213c6 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -40,6 +40,7 @@
#include "link.h"
#include "discover.h"
#include "bcast.h"
+#include "netlink.h"
#define MAX_ADDR_STR 60
@@ -54,23 +55,6 @@ static struct tipc_media * const media_info_array[] = {
NULL
};
-static const struct nla_policy
-tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = {
- [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_BEARER_NAME] = {
- .type = NLA_STRING,
- .len = TIPC_MAX_BEARER_NAME
- },
- [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED },
- [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 }
-};
-
-static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
- [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING },
- [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
-};
-
static void bearer_disable(struct net *net, struct tipc_bearer *b);
/**
diff --git a/net/tipc/link.c b/net/tipc/link.c
index e31d92f80572..7d2bb3e70baa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1,7 +1,7 @@
/*
* net/tipc/link.c: TIPC link code
*
- * Copyright (c) 1996-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 1996-2007, 2012-2016, Ericsson AB
* Copyright (c) 2004-2007, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -127,6 +127,7 @@ struct tipc_link {
/* Management and link supervision data */
u32 peer_session;
+ u32 session;
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
@@ -136,11 +137,7 @@ struct tipc_link {
u16 peer_caps;
bool active;
u32 silent_intv_cnt;
- struct {
- unchar hdr[INT_H_SIZE];
- unchar body[TIPC_MAX_IF_NAME];
- } proto_msg;
- struct tipc_msg *pmsg;
+ char if_name[TIPC_MAX_IF_NAME];
u32 priority;
char net_plane;
@@ -195,14 +192,6 @@ struct tipc_link {
static const char *link_co_err = "Link tunneling error, ";
static const char *link_rst_msg = "Resetting link ";
-/* Properties valid for media, bearar and link */
-static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
- [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
- [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
- [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
-};
-
/* Send states for broadcast NACKs
*/
enum {
@@ -215,10 +204,11 @@ enum {
* Interval between NACKs when packets arrive out of order
*/
#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-/*
- * Out-of-range value for link session numbers
+
+/* Wildcard value for link session numbers. When it is known that
+ * peer endpoint is down, any session number must be accepted.
*/
-#define WILDCARD_SESSION 0x10000
+#define ANY_SESSION 0x10000
/* Link FSM states:
*/
@@ -398,16 +388,6 @@ char *tipc_link_name(struct tipc_link *l)
return l->name;
}
-static u32 link_own_addr(struct tipc_link *l)
-{
- return msg_prevnode(l->pmsg);
-}
-
-void tipc_link_reinit(struct tipc_link *l, u32 addr)
-{
- msg_set_prevnode(l->pmsg, addr);
-}
-
/**
* tipc_link_create - create a new link
* @n: pointer to associated node
@@ -441,29 +421,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
struct tipc_link **link)
{
struct tipc_link *l;
- struct tipc_msg *hdr;
l = kzalloc(sizeof(*l), GFP_ATOMIC);
if (!l)
return false;
*link = l;
- l->pmsg = (struct tipc_msg *)&l->proto_msg;
- hdr = l->pmsg;
- tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer);
- msg_set_size(hdr, sizeof(l->proto_msg));
- msg_set_session(hdr, session);
- msg_set_bearer_id(hdr, l->bearer_id);
+ l->session = session;
/* Note: peer i/f name is completed by reset/activate message */
sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
- strcpy((char *)msg_data(hdr), if_name);
-
+ strcpy(l->if_name, if_name);
l->addr = peer;
l->peer_caps = peer_caps;
l->net = net;
- l->peer_session = WILDCARD_SESSION;
+ l->peer_session = ANY_SESSION;
l->bearer_id = bearer_id;
l->tolerance = tolerance;
l->net_plane = net_plane;
@@ -790,7 +763,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
struct tipc_msg *msg = buf_msg(skb_peek(list));
int imp = msg_importance(msg);
u32 oport = msg_origport(msg);
- u32 addr = link_own_addr(link);
+ u32 addr = tipc_own_addr(link->net);
struct sk_buff *skb;
/* This really cannot happen... */
@@ -839,16 +812,9 @@ void link_prepare_wakeup(struct tipc_link *l)
void tipc_link_reset(struct tipc_link *l)
{
- /* Link is down, accept any session */
- l->peer_session = WILDCARD_SESSION;
-
- /* If peer is up, it only accepts an incremented session number */
- msg_set_session(l->pmsg, msg_session(l->pmsg) + 1);
-
- /* Prepare for renewed mtu size negotiation */
+ l->peer_session = ANY_SESSION;
+ l->session++;
l->mtu = l->advertised_mtu;
-
- /* Clean up all queues and counters: */
__skb_queue_purge(&l->transmq);
__skb_queue_purge(&l->deferdq);
skb_queue_splice_init(&l->wakeupq, l->inputq);
@@ -1156,7 +1122,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
/* Broadcast ACK must be sent via a unicast link => defer to caller */
if (link_is_bc_rcvlink(l)) {
- if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf)
+ if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
return 0;
l->rcv_unacked = 0;
return TIPC_LINK_SND_BC_ACK;
@@ -1268,15 +1234,30 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
u16 rcvgap, int tolerance, int priority,
struct sk_buff_head *xmitq)
{
- struct sk_buff *skb = NULL;
- struct tipc_msg *hdr = l->pmsg;
+ struct sk_buff *skb;
+ struct tipc_msg *hdr;
+ struct sk_buff_head *dfq = &l->deferdq;
bool node_up = link_is_up(l->bc_rcvlink);
/* Don't send protocol message during reset or link failover */
if (tipc_link_is_blocked(l))
return;
- msg_set_type(hdr, mtyp);
+ if (!tipc_link_is_up(l) && (mtyp == STATE_MSG))
+ return;
+
+ if (!skb_queue_empty(dfq))
+ rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
+
+ skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
+ TIPC_MAX_IF_NAME, l->addr,
+ tipc_own_addr(l->net), 0, 0, 0);
+ if (!skb)
+ return;
+
+ hdr = buf_msg(skb);
+ msg_set_session(hdr, l->session);
+ msg_set_bearer_id(hdr, l->bearer_id);
msg_set_net_plane(hdr, l->net_plane);
msg_set_next_sent(hdr, l->snd_nxt);
msg_set_ack(hdr, l->rcv_nxt - 1);
@@ -1286,36 +1267,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_linkprio(hdr, priority);
msg_set_redundant_link(hdr, node_up);
msg_set_seq_gap(hdr, 0);
-
- /* Compatibility: created msg must not be in sequence with pkt flow */
msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
if (mtyp == STATE_MSG) {
- if (!tipc_link_is_up(l))
- return;
-
- /* Override rcvgap if there are packets in deferred queue */
- if (!skb_queue_empty(&l->deferdq))
- rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt;
- if (rcvgap) {
- msg_set_seq_gap(hdr, rcvgap);
- l->stats.sent_nacks++;
- }
+ msg_set_seq_gap(hdr, rcvgap);
+ msg_set_size(hdr, INT_H_SIZE);
msg_set_probe(hdr, probe);
- if (probe)
- l->stats.sent_probes++;
l->stats.sent_states++;
l->rcv_unacked = 0;
} else {
/* RESET_MSG or ACTIVATE_MSG */
msg_set_max_pkt(hdr, l->advertised_mtu);
- msg_set_ack(hdr, l->rcv_nxt - 1);
- msg_set_next_sent(hdr, 1);
+ strcpy(msg_data(hdr), l->if_name);
}
- skb = tipc_buf_acquire(msg_size(hdr));
- if (!skb)
- return;
- skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+ if (probe)
+ l->stats.sent_probes++;
+ if (rcvgap)
+ l->stats.sent_nacks++;
skb->priority = TC_PRIO_CONTROL;
__skb_queue_tail(xmitq, skb);
}
@@ -1340,7 +1308,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
/* At least one packet required for safe algorithm => add dummy */
skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
- BASIC_H_SIZE, 0, l->addr, link_own_addr(l),
+ BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net),
0, 0, TIPC_ERR_NO_PORT);
if (!skb) {
pr_warn("%sunable to create tunnel packet\n", link_co_err);
@@ -1351,7 +1319,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
__skb_queue_purge(&tmpxq);
/* Initialize reusable tunnel packet header */
- tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL,
+ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL,
mtyp, INT_H_SIZE, l->addr);
pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq);
msg_set_msgcnt(&tnlhdr, pktcnt);
@@ -1410,7 +1378,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (tipc_link_is_blocked(l) || !xmitq)
goto exit;
- if (link_own_addr(l) > msg_prevnode(hdr))
+ if (tipc_own_addr(l->net) > msg_prevnode(hdr))
l->net_plane = msg_net_plane(hdr);
switch (mtyp) {
@@ -1418,7 +1386,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Ignore duplicate RESET with old session number */
if ((less_eq(msg_session(hdr), l->peer_session)) &&
- (l->peer_session != WILDCARD_SESSION))
+ (l->peer_session != ANY_SESSION))
break;
/* fall thru' */
@@ -1515,7 +1483,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast,
u16 gap_to = peers_snd_nxt - 1;
skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
- 0, l->addr, link_own_addr(l), 0, 0, 0);
+ 0, l->addr, tipc_own_addr(l->net), 0, 0, 0);
if (!skb)
return false;
hdr = buf_msg(skb);
@@ -1670,7 +1638,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
if (mtyp != STATE_MSG)
return 0;
- if (dnode == link_own_addr(l)) {
+ if (dnode == tipc_own_addr(l->net)) {
tipc_link_bc_ack_rcv(l, acked, xmitq);
rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
l->stats.recv_nacks++;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index b4ee9d6e181d..6a94175ee20a 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -86,7 +86,6 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
struct sk_buff_head *namedq,
struct tipc_link *bc_sndlink,
struct tipc_link **link);
-void tipc_link_reinit(struct tipc_link *l, u32 addr);
void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
int mtyp, struct sk_buff_head *xmitq);
void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 777b979b8463..e190460fe0d3 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -47,12 +47,6 @@
#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
-static const struct nla_policy
-tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
- [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
-};
-
/**
* struct name_info - name sequence publication info
* @node_list: circular list of publications made by own node
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 77bf9113c7a7..28bf4feeb81c 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -41,11 +41,7 @@
#include "socket.h"
#include "node.h"
#include "bcast.h"
-
-static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
- [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
-};
+#include "netlink.h"
/*
* The TIPC locking policy is designed to ensure a very fine locking
@@ -116,7 +112,6 @@ int tipc_net_start(struct net *net, u32 addr)
tn->own_addr = addr;
tipc_named_reinit(net);
tipc_sk_reinit(net);
- tipc_bcast_reinit(net);
tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
TIPC_ZONE_SCOPE, 0, tn->own_addr);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 8975b0135b76..56935df2167a 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
[TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }
};
+const struct nla_policy
+tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
+ [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
+ [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED },
+ [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG }
+};
+
+const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
+ [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
+ [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING,
+ .len = TIPC_MAX_LINK_NAME },
+ [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_RX] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_TX] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
+ [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
+};
+
+/* Properties valid for media, bearer and link */
+const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
+ [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = {
+ [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING,
+ .len = TIPC_MAX_BEARER_NAME },
+ [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
+ [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING },
+ [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
+ [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
+ [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+ [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+};
+
/* Users of the legacy API (tipc-config) can't handle that we add operations,
* so we have a separate genl handling for the new API.
*/
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
index 08a1db67b927..ed1dbcb4afbd 100644
--- a/net/tipc/netlink.h
+++ b/net/tipc/netlink.h
@@ -35,6 +35,7 @@
#ifndef _TIPC_NETLINK_H
#define _TIPC_NETLINK_H
+#include <net/netlink.h>
extern struct genl_family tipc_genl_family;
int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf);
@@ -45,6 +46,16 @@ struct tipc_nl_msg {
u32 seq;
};
+extern const struct nla_policy tipc_nl_name_table_policy[];
+extern const struct nla_policy tipc_nl_sock_policy[];
+extern const struct nla_policy tipc_nl_net_policy[];
+extern const struct nla_policy tipc_nl_link_policy[];
+extern const struct nla_policy tipc_nl_node_policy[];
+extern const struct nla_policy tipc_nl_prop_policy[];
+extern const struct nla_policy tipc_nl_bearer_policy[];
+extern const struct nla_policy tipc_nl_media_policy[];
+extern const struct nla_policy tipc_nl_udp_policy[];
+
int tipc_netlink_start(void);
int tipc_netlink_compat_start(void);
void tipc_netlink_stop(void);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index cdb79503d890..ace178fd3850 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -41,6 +41,7 @@
#include "socket.h"
#include "bcast.h"
#include "discover.h"
+#include "netlink.h"
#define INVALID_NODE_SIG 0x10000
@@ -164,28 +165,6 @@ struct tipc_sock_conn {
struct list_head list;
};
-static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
- [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_LINK_NAME] = {
- .type = NLA_STRING,
- .len = TIPC_MAX_LINK_NAME
- },
- [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 },
- [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED },
- [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED },
- [TIPC_NLA_LINK_RX] = { .type = NLA_U32 },
- [TIPC_NLA_LINK_TX] = { .type = NLA_U32 }
-};
-
-static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
- [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
- [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
-};
-
static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
int bearer_id = n->active_links[sel & 1];
@@ -843,7 +822,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
memcpy(&le->maddr, maddr, sizeof(*maddr));
exit:
tipc_node_write_unlock(n);
- if (reset && !tipc_link_is_reset(l))
+ if (reset && l && !tipc_link_is_reset(l))
tipc_node_link_down(n, b->identity, false);
tipc_node_put(n);
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 69c29050f14a..3eeb50a27b89 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -42,6 +42,7 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "netlink.h"
#define SS_LISTENING -1 /* socket is listening */
#define SS_READY -2 /* socket is connectionless */
@@ -126,14 +127,6 @@ static const struct proto_ops stream_ops;
static const struct proto_ops msg_ops;
static struct proto tipc_proto;
-static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
- [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
- [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 },
- [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED },
- [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG }
-};
-
static const struct rhashtable_params tsk_rht_params;
/*
@@ -673,7 +666,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
struct tipc_sock *tsk = tipc_sk(sk);
struct net *net = sock_net(sk);
struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
struct iov_iter save = msg->msg_iter;
uint mtu;
int rc;
@@ -687,14 +680,16 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
msg_set_nameupper(mhdr, seq->upper);
msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
+ skb_queue_head_init(&pktchain);
+
new_mtu:
mtu = tipc_bcast_get_mtu(net);
- rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
if (unlikely(rc < 0))
return rc;
do {
- rc = tipc_bcast_xmit(net, pktchain);
+ rc = tipc_bcast_xmit(net, &pktchain);
if (likely(!rc))
return dsz;
@@ -704,7 +699,7 @@ new_mtu:
if (!rc)
continue;
}
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
if (rc == -EMSGSIZE) {
msg->msg_iter = save;
goto new_mtu;
@@ -863,7 +858,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
struct net *net = sock_net(sk);
struct tipc_msg *mhdr = &tsk->phdr;
u32 dnode, dport;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
struct sk_buff *skb;
struct tipc_name_seq *seq;
struct iov_iter save;
@@ -924,17 +919,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
}
+ skb_queue_head_init(&pktchain);
save = m->msg_iter;
new_mtu:
mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
- rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
if (rc < 0)
return rc;
do {
- skb = skb_peek(pktchain);
+ skb = skb_peek(&pktchain);
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+ rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
if (likely(!rc)) {
if (sock->state != SS_READY)
sock->state = SS_CONNECTING;
@@ -946,7 +942,7 @@ new_mtu:
if (!rc)
continue;
}
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
if (rc == -EMSGSIZE) {
m->msg_iter = save;
goto new_mtu;
@@ -1016,7 +1012,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
u32 portid = tsk->portid;
int rc = -EINVAL;
@@ -1044,17 +1040,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
dnode = tsk_peer_node(tsk);
+ skb_queue_head_init(&pktchain);
next:
save = m->msg_iter;
mtu = tsk->max_pkt;
send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
- rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
if (unlikely(rc < 0))
return rc;
+
do {
if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_node_xmit(net, pktchain, dnode, portid);
+ rc = tipc_node_xmit(net, &pktchain, dnode, portid);
if (likely(!rc)) {
tsk->sent_unacked++;
sent += send;
@@ -1063,7 +1061,7 @@ next:
goto next;
}
if (rc == -EMSGSIZE) {
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
tsk->max_pkt = tipc_node_get_mtu(net, dnode,
portid);
m->msg_iter = save;
@@ -1077,7 +1075,7 @@ next:
rc = tipc_wait_for_sndpkt(sock, &timeo);
} while (!rc);
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
return sent ? sent : rc;
}
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 22963cafd5ed..e6cb386fbf34 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -326,7 +326,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
return tipc_subscrp_cancel(s, subscriber);
}
- tipc_subscrp_subscribe(net, s, subscriber, swap);
+ if (s)
+ tipc_subscrp_subscribe(net, s, subscriber, swap);
}
/* Handle one request to establish a new subscriber */
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index d63a911e7fe2..c94f9a15e2cd 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -48,20 +48,13 @@
#include <linux/tipc_netlink.h>
#include "core.h"
#include "bearer.h"
+#include "netlink.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
#define UDP_MIN_HEADROOM 28
-static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
- [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
- [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
- .len = sizeof(struct sockaddr_storage)},
- [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY,
- .len = sizeof(struct sockaddr_storage)},
-};
-
/**
* struct udp_media_addr - IP/UDP addressing information
*
@@ -181,6 +174,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
err = PTR_ERR(rt);
goto tx_error;
}
+
+ skb->dev = rt->dst.dev;
ttl = ip4_dst_hoplimit(&rt->dst);
udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
dst->ipv4.s_addr, 0, ttl, 0, src->udp_port,
@@ -201,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
ttl = ip6_dst_hoplimit(ndst);
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
ndst->dev, &src->ipv6,
- &dst->ipv6, 0, ttl, src->udp_port,
+ &dst->ipv6, 0, ttl, 0, src->udp_port,
dst->udp_port, false);
#endif
}
@@ -274,7 +269,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
struct udp_media_addr *remote)
{
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
- struct sockaddr_storage *sa_local, *sa_remote;
+ struct sockaddr_storage sa_local, sa_remote;
if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
goto err;
@@ -283,41 +278,48 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
tipc_nl_udp_policy))
goto err;
if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
- sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]);
- sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]);
+ nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL],
+ sizeof(sa_local));
+ nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE],
+ sizeof(sa_remote));
} else {
err:
pr_err("Invalid UDP bearer configuration");
return -EINVAL;
}
- if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) {
+ if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) {
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *)sa_local;
+ ip4 = (struct sockaddr_in *)&sa_local;
local->proto = htons(ETH_P_IP);
local->udp_port = ip4->sin_port;
local->ipv4.s_addr = ip4->sin_addr.s_addr;
- ip4 = (struct sockaddr_in *)sa_remote;
+ ip4 = (struct sockaddr_in *)&sa_remote;
remote->proto = htons(ETH_P_IP);
remote->udp_port = ip4->sin_port;
remote->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
#if IS_ENABLED(CONFIG_IPV6)
- } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) {
+ } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) {
+ int atype;
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *)sa_local;
+ ip6 = (struct sockaddr_in6 *)&sa_local;
+ atype = ipv6_addr_type(&ip6->sin6_addr);
+ if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id)
+ return -EINVAL;
+
local->proto = htons(ETH_P_IPV6);
local->udp_port = ip6->sin6_port;
- local->ipv6 = ip6->sin6_addr;
+ memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
ub->ifindex = ip6->sin6_scope_id;
- ip6 = (struct sockaddr_in6 *)sa_remote;
+ ip6 = (struct sockaddr_in6 *)&sa_remote;
remote->proto = htons(ETH_P_IPV6);
remote->udp_port = ip6->sin6_port;
- remote->ipv6 = ip6->sin6_addr;
+ memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
return 0;
#endif
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 3a9c41bc849a..9f1c4aa851ef 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1157,6 +1157,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
return NOTIFY_DONE;
}
+ wireless_nlevent_flush();
+
return NOTIFY_OK;
}
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 90890f183c0e..98c924260b3d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7554,7 +7554,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
no_ht) {
- kfree(connkeys);
+ kzfree(connkeys);
return -EINVAL;
}
}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 79bd3a171caa..544558171787 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
+ /* stop critical protocol if supported */
+ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) {
+ rdev->crit_proto_nlportid = 0;
+ rdev_crit_proto_stop(rdev, wdev);
+ }
+
/*
* Delete all the keys ... pairwise keys can't really
* exist any more anyway, but default keys might.
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index c8717c1d082e..b50ee5d622e1 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
/* IW event code */
+void wireless_nlevent_flush(void)
+{
+ struct sk_buff *skb;
+ struct net *net;
+
+ ASSERT_RTNL();
+
+ for_each_net(net) {
+ while ((skb = skb_dequeue(&net->wext_nlevents)))
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+ GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ /*
+ * When a netdev changes state in any way, flush all pending messages
+ * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+ * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close()
+ * or similar - all of which could otherwise happen due to delays from
+ * schedule_work().
+ */
+ wireless_nlevent_flush();
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+ .notifier_call = wext_netdev_notifier_call,
+};
+
static int __net_init wext_pernet_init(struct net *net)
{
skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
static int __init wireless_nlevent_init(void)
{
- return register_pernet_subsys(&wext_pernet_ops);
+ int err = register_pernet_subsys(&wext_pernet_ops);
+
+ if (err)
+ return err;
+
+ return register_netdevice_notifier(&wext_netdev_notifier);
}
subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
/* Process events generated by the wireless layer or the driver. */
static void wireless_nlevent_process(struct work_struct *work)
{
- struct sk_buff *skb;
- struct net *net;
-
rtnl_lock();
-
- for_each_net(net) {
- while ((skb = skb_dequeue(&net->wext_nlevents)))
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
- GFP_KERNEL);
- }
-
+ wireless_nlevent_flush();
rtnl_unlock();
}