From ba062ebb2cd561d404e0fba8ee4b3f5ebce7cbfc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Jun 2018 09:13:39 -0700
Subject: netfilter: nf_queue: augment nfqa_cfg_policy

Three attributes are currently not verified, thus can trigger KMSAN
warnings such as :

BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline]
BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline]
BUG: KMSAN: uninit-value in nfqnl_recv_config+0x939/0x17d0 net/netfilter/nfnetlink_queue.c:1268
CPU: 1 PID: 4521 Comm: syz-executor120 Not tainted 4.17.0+ #5
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:113
 kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1117
 __msan_warning_32+0x70/0xc0 mm/kmsan/kmsan_instr.c:620
 __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline]
 __fswab32 include/uapi/linux/swab.h:59 [inline]
 nfqnl_recv_config+0x939/0x17d0 net/netfilter/nfnetlink_queue.c:1268
 nfnetlink_rcv_msg+0xb2e/0xc80 net/netfilter/nfnetlink.c:212
 netlink_rcv_skb+0x37e/0x600 net/netlink/af_netlink.c:2448
 nfnetlink_rcv+0x2fe/0x680 net/netfilter/nfnetlink.c:513
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1680/0x1750 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x43fd59
RSP: 002b:00007ffde0e30d28 EFLAGS: 00000213 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fd59
RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8
R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401680
R13: 0000000000401710 R14: 0000000000000000 R15: 0000000000000000

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
 kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:189
 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:315
 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan.c:322
 slab_post_alloc_hook mm/slab.h:446 [inline]
 slab_alloc_node mm/slub.c:2753 [inline]
 __kmalloc_node_track_caller+0xb35/0x11b0 mm/slub.c:4395
 __kmalloc_reserve net/core/skbuff.c:138 [inline]
 __alloc_skb+0x2cb/0x9e0 net/core/skbuff.c:206
 alloc_skb include/linux/skbuff.h:988 [inline]
 netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline]
 netlink_sendmsg+0x76e/0x1350 net/netlink/af_netlink.c:1876
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: fdb694a01f1f ("netfilter: Add fail-open support")
Fixes: 829e17a1a602 ("[NETFILTER]: nfnetlink_queue: allow changing queue length through netlink")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_queue.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 4ccd2988f9db..ea4ba551abb2 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1243,6 +1243,9 @@ static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
 static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
 	[NFQA_CFG_CMD]		= { .len = sizeof(struct nfqnl_msg_config_cmd) },
 	[NFQA_CFG_PARAMS]	= { .len = sizeof(struct nfqnl_msg_config_params) },
+	[NFQA_CFG_QUEUE_MAXLEN]	= { .type = NLA_U32 },
+	[NFQA_CFG_MASK]		= { .type = NLA_U32 },
+	[NFQA_CFG_FLAGS]	= { .type = NLA_U32 },
 };
 
 static const struct nf_queue_handler nfqh = {
-- 
cgit v1.2.3


From 9ce7bc036ae4cfe3393232c86e9e1fea2153c237 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 13 Jun 2018 10:11:56 -0700
Subject: netfilter: ipv6: nf_defrag: reduce struct net memory waste

It is a waste of memory to use a full "struct netns_sysctl_ipv6"
while only one pointer is really used, considering netns_sysctl_ipv6
keeps growing.

Also, since "struct netns_frags" has cache line alignment,
it is better to move the frags_hdr pointer outside, otherwise
we spend a full cache line for this pointer.

This saves 192 bytes of memory per netns.

Fixes: c038a767cd69 ("ipv6: add a new namespace for nf_conntrack_reasm")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h             | 1 +
 include/net/netns/ipv6.h                | 1 -
 net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++---
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 47e35cce3b64..a71264d75d7f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -128,6 +128,7 @@ struct net {
 #endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	struct netns_nf_frag	nf_frag;
+	struct ctl_table_header *nf_frag_frags_hdr;
 #endif
 	struct sock		*nfnl;
 	struct sock		*nfnl_stash;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index c978a31b0f84..762ac9931b62 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -109,7 +109,6 @@ struct netns_ipv6 {
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 struct netns_nf_frag {
-	struct netns_sysctl_ipv6 sysctl;
 	struct netns_frags	frags;
 };
 #endif
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 5e0332014c17..a452d99c9f52 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -107,7 +107,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
 	if (hdr == NULL)
 		goto err_reg;
 
-	net->nf_frag.sysctl.frags_hdr = hdr;
+	net->nf_frag_frags_hdr = hdr;
 	return 0;
 
 err_reg:
@@ -121,8 +121,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 {
 	struct ctl_table *table;
 
-	table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg;
-	unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr);
+	table = net->nf_frag_frags_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->nf_frag_frags_hdr);
 	if (!net_eq(net, &init_net))
 		kfree(table);
 }
-- 
cgit v1.2.3


From ad9852af97587b8abe8102f9ddcb05c9769656f6 Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Wed, 13 Jun 2018 12:26:13 +0800
Subject: netfilter: nf_ct_helper: Fix possible panic after
 nf_conntrack_helper_unregister

The helper module would be unloaded after nf_conntrack_helper_unregister,
so it may cause a possible panic caused by race.

nf_ct_iterate_destroy(unhelp, me) reset the helper of conntrack as NULL,
but maybe someone has gotten the helper pointer during this period. Then
it would panic, when it accesses the helper and the module was unloaded.

Take an example as following:
CPU0                                                   CPU1
ctnetlink_dump_helpinfo
helper = rcu_dereference(help->helper);
                                                       unhelp
                                                       set helper as NULL
                                                       unload helper module
helper->to_nlattr(skb, ct);

As above, the cpu0 tries to access the helper and its module is unloaded,
then the panic happens.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_helper.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 551a1eddf0fa..a75b11c39312 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -465,6 +465,11 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 
 	nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
 	nf_ct_iterate_destroy(unhelp, me);
+
+	/* Maybe someone has gotten the helper already when unhelp above.
+	 * So need to wait it.
+	 */
+	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
 
-- 
cgit v1.2.3


From b5685d2687d6612adf5eac519eb7008f74dfd1ec Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Jun 2018 17:26:34 +0200
Subject: batman-adv: Fix bat_ogm_iv best gw refcnt after netlink dump

A reference for the best gateway is taken when the list of gateways in the
mesh is sent via netlink. This is necessary to check whether the currently
dumped entry is the currently selected gateway or not. This information is
then transferred as flag BATADV_ATTR_FLAG_BEST.

After the comparison of the current entry is done,
batadv_iv_gw_dump_entry() has to decrease the reference counter again.
Otherwise the reference will be held and thus prevents a proper shutdown of
the batman-adv interfaces (and some of the interfaces enslaved in it).

Fixes: efb766af06e3 ("batman-adv: add B.A.T.M.A.N. IV bat_gw_dump implementations")
Reported-by: Andreas Ziegler <dev@andreas-ziegler.de>
Tested-by: Andreas Ziegler <dev@andreas-ziegler.de>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_iv_ogm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index be09a9883825..73bf6a93a3cf 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2732,7 +2732,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 {
 	struct batadv_neigh_ifinfo *router_ifinfo = NULL;
 	struct batadv_neigh_node *router;
-	struct batadv_gw_node *curr_gw;
+	struct batadv_gw_node *curr_gw = NULL;
 	int ret = 0;
 	void *hdr;
 
@@ -2780,6 +2780,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 	ret = 0;
 
 out:
+	if (curr_gw)
+		batadv_gw_node_put(curr_gw);
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_put(router_ifinfo);
 	if (router)
-- 
cgit v1.2.3


From 9713cb0cf19f1cec6c007e3b37be0697042b6720 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 2 Jun 2018 17:26:35 +0200
Subject: batman-adv: Fix bat_v best gw refcnt after netlink dump

A reference for the best gateway is taken when the list of gateways in the
mesh is sent via netlink. This is necessary to check whether the currently
dumped entry is the currently selected gateway or not. This information is
then transferred as flag BATADV_ATTR_FLAG_BEST.

After the comparison of the current entry is done,
batadv_v_gw_dump_entry() has to decrease the reference counter again.
Otherwise the reference will be held and thus prevents a proper shutdown of
the batman-adv interfaces (and some of the interfaces enslaved in it).

Fixes: b71bb6f924fe ("batman-adv: add B.A.T.M.A.N. V bat_gw_dump implementations")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_v.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index ec93337ee259..6baec4e68898 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -927,7 +927,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 {
 	struct batadv_neigh_ifinfo *router_ifinfo = NULL;
 	struct batadv_neigh_node *router;
-	struct batadv_gw_node *curr_gw;
+	struct batadv_gw_node *curr_gw = NULL;
 	int ret = 0;
 	void *hdr;
 
@@ -995,6 +995,8 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
 	ret = 0;
 
 out:
+	if (curr_gw)
+		batadv_gw_node_put(curr_gw);
 	if (router_ifinfo)
 		batadv_neigh_ifinfo_put(router_ifinfo);
 	if (router)
-- 
cgit v1.2.3


From 36dc621ceca1be3ec885aeade5fdafbbcc452a6d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 1 Jun 2018 19:24:23 +0200
Subject: batman-adv: Fix debugfs path for renamed hardif

batman-adv is creating special debugfs directories in the init
net_namespace for each valid hard-interface (net_device). But it is
possible to rename a net_device to a completely different name then the
original one.

It can therefore happen that a user registers a new net_device which gets
the name "wlan0" assigned by default. batman-adv is also adding a new
directory under $debugfs/batman-adv/ with the name "wlan0".

The user then decides to rename this device to "wl_pri" and registers a
different device. The kernel may now decide to use the name "wlan0" again
for this new device. batman-adv will detect it as a valid net_device and
tries to create a directory with the name "wlan0" under
$debugfs/batman-adv/. But there already exists one with this name under
this path and thus this fails. batman-adv will detect a problem and
rollback the registering of this device.

batman-adv must therefore take care of renaming the debugfs directories
for hard-interfaces whenever it detects such a net_device rename.

Fixes: 5bc7c1eb44f2 ("batman-adv: add debugfs structure for information per interface")
Reported-by: John Soros <sorosj@gmail.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/debugfs.c        | 20 ++++++++++++++++++++
 net/batman-adv/debugfs.h        |  6 ++++++
 net/batman-adv/hard-interface.c |  3 +++
 3 files changed, 29 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 4229b01ac7b5..7e5de7b9f6d5 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -19,6 +19,7 @@
 #include "debugfs.h"
 #include "main.h"
 
+#include <linux/dcache.h>
 #include <linux/debugfs.h>
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -343,6 +344,25 @@ out:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif
+ * @hard_iface: hard interface which was renamed
+ */
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
+{
+	const char *name = hard_iface->net_dev->name;
+	struct dentry *dir;
+	struct dentry *d;
+
+	dir = hard_iface->debug_dir;
+	if (!dir)
+		return;
+
+	d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
+	if (!d)
+		pr_err("Can't rename debugfs dir to %s\n", name);
+}
+
 /**
  * batadv_debugfs_del_hardif() - delete the base directory for a hard interface
  *  in debugfs.
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 37b069698b04..8538a7a75e93 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -32,6 +32,7 @@ void batadv_debugfs_destroy(void);
 int batadv_debugfs_add_meshif(struct net_device *dev);
 void batadv_debugfs_del_meshif(struct net_device *dev);
 int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
 void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
 
 #else
@@ -59,6 +60,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
 	return 0;
 }
 
+static inline
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
+{
+}
+
 static inline
 void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
 {
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index c405d15befd6..dc2763b11107 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1051,6 +1051,9 @@ static int batadv_hard_if_event(struct notifier_block *this,
 		if (batadv_is_wifi_hardif(hard_iface))
 			hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
 		break;
+	case NETDEV_CHANGENAME:
+		batadv_debugfs_rename_hardif(hard_iface);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 6da7be7d24b2921f8215473ba7552796dff05fe1 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 1 Jun 2018 19:24:24 +0200
Subject: batman-adv: Fix debugfs path for renamed softif

batman-adv is creating special debugfs directories in the init
net_namespace for each created soft-interface (batadv net_device). But it
is possible to rename a net_device to a completely different name then the
original one.

It can therefore happen that a user registers a new batadv net_device with
the name "bat0". batman-adv is then also adding a new directory under
$debugfs/batman-adv/ with the name "wlan0".

The user then decides to rename this device to "bat1" and registers a
different batadv device with the name "bat0". batman-adv will then try to
create a directory with the name "bat0" under $debugfs/batman-adv/ again.
But there already exists one with this name under this path and thus this
fails. batman-adv will detect a problem and rollback the registering of
this device.

batman-adv must therefore take care of renaming the debugfs directories for
soft-interfaces whenever it detects such a net_device rename.

Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/debugfs.c        | 20 ++++++++++++++++++++
 net/batman-adv/debugfs.h        |  5 +++++
 net/batman-adv/hard-interface.c | 34 ++++++++++++++++++++++++++++------
 3 files changed, 53 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 7e5de7b9f6d5..87479c60670e 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -433,6 +433,26 @@ out:
 	return -ENOMEM;
 }
 
+/**
+ * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif
+ * @dev: net_device which was renamed
+ */
+void batadv_debugfs_rename_meshif(struct net_device *dev)
+{
+	struct batadv_priv *bat_priv = netdev_priv(dev);
+	const char *name = dev->name;
+	struct dentry *dir;
+	struct dentry *d;
+
+	dir = bat_priv->debug_dir;
+	if (!dir)
+		return;
+
+	d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
+	if (!d)
+		pr_err("Can't rename debugfs dir to %s\n", name);
+}
+
 /**
  * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
  * @dev: netdev struct of the soft interface
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 8538a7a75e93..08a592ffbee5 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -30,6 +30,7 @@ struct net_device;
 void batadv_debugfs_init(void);
 void batadv_debugfs_destroy(void);
 int batadv_debugfs_add_meshif(struct net_device *dev);
+void batadv_debugfs_rename_meshif(struct net_device *dev);
 void batadv_debugfs_del_meshif(struct net_device *dev);
 int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
 void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
@@ -50,6 +51,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev)
 	return 0;
 }
 
+static inline void batadv_debugfs_rename_meshif(struct net_device *dev)
+{
+}
+
 static inline void batadv_debugfs_del_meshif(struct net_device *dev)
 {
 }
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index dc2763b11107..2f0d42f2f913 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -989,6 +989,32 @@ void batadv_hardif_remove_interfaces(void)
 	rtnl_unlock();
 }
 
+/**
+ * batadv_hard_if_event_softif() - Handle events for soft interfaces
+ * @event: NETDEV_* event to handle
+ * @net_dev: net_device which generated an event
+ *
+ * Return: NOTIFY_* result
+ */
+static int batadv_hard_if_event_softif(unsigned long event,
+				       struct net_device *net_dev)
+{
+	struct batadv_priv *bat_priv;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		batadv_sysfs_add_meshif(net_dev);
+		bat_priv = netdev_priv(net_dev);
+		batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
+		break;
+	case NETDEV_CHANGENAME:
+		batadv_debugfs_rename_meshif(net_dev);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int batadv_hard_if_event(struct notifier_block *this,
 				unsigned long event, void *ptr)
 {
@@ -997,12 +1023,8 @@ static int batadv_hard_if_event(struct notifier_block *this,
 	struct batadv_hard_iface *primary_if = NULL;
 	struct batadv_priv *bat_priv;
 
-	if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) {
-		batadv_sysfs_add_meshif(net_dev);
-		bat_priv = netdev_priv(net_dev);
-		batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
-		return NOTIFY_DONE;
-	}
+	if (batadv_softif_is_valid(net_dev))
+		return batadv_hard_if_event_softif(event, net_dev);
 
 	hard_iface = batadv_hardif_get_by_netdev(net_dev);
 	if (!hard_iface && (event == NETDEV_REGISTER ||
-- 
cgit v1.2.3


From 4a519b83da16927fb98fd32b0f598e639d1f1859 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Thu, 7 Jun 2018 00:46:23 +0200
Subject: batman-adv: Avoid storing non-TT-sync flags on singular entries too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 54e22f265e87 ("batman-adv: fix TT sync flag inconsistencies")
TT sync flags and TT non-sync'd flags are supposed to be stored
separately.

The previous patch missed to apply this separation on a TT entry with
only a single TT orig entry.

This is a minor fix because with only a single TT orig entry the DDoS
issue the former patch solves does not apply.

Fixes: 54e22f265e87 ("batman-adv: fix TT sync flag inconsistencies")
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/translation-table.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 3986551397ca..61ce300091f3 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1705,7 +1705,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 		ether_addr_copy(common->addr, tt_addr);
 		common->vid = vid;
 
-		common->flags = flags;
+		common->flags = flags & (~BATADV_TT_SYNC_MASK);
+
 		tt_global_entry->roam_at = 0;
 		/* node must store current time in case of roaming. This is
 		 * needed to purge this entry out on timeout (if nobody claims
-- 
cgit v1.2.3


From a44ebeff6bbd6ef50db41b4195fca87b21aefd20 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Thu, 7 Jun 2018 00:46:24 +0200
Subject: batman-adv: Fix multicast TT issues with bogus ROAM flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a (broken) node wrongly sends multicast TT entries with a ROAM
flag then this causes any receiving node to drop all entries for the
same multicast MAC address announced by other nodes, leading to
packet loss.

Fix this DoS vector by only storing TT sync flags. For multicast TT
non-sync'ing flag bits like ROAM are unused so far anyway.

Fixes: 1d8ab8d3c176 ("batman-adv: Modified forwarding behaviour for multicast packets")
Reported-by: Leonardo Mörlein <me@irrelefant.net>
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/translation-table.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 61ce300091f3..12a2b7d21376 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1705,7 +1705,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 		ether_addr_copy(common->addr, tt_addr);
 		common->vid = vid;
 
-		common->flags = flags & (~BATADV_TT_SYNC_MASK);
+		if (!is_multicast_ether_addr(common->addr))
+			common->flags = flags & (~BATADV_TT_SYNC_MASK);
 
 		tt_global_entry->roam_at = 0;
 		/* node must store current time in case of roaming. This is
@@ -1769,7 +1770,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
 		 * TT_CLIENT_TEMP, therefore they have to be copied in the
 		 * client entry
 		 */
-		common->flags |= flags & (~BATADV_TT_SYNC_MASK);
+		if (!is_multicast_ether_addr(common->addr))
+			common->flags |= flags & (~BATADV_TT_SYNC_MASK);
 
 		/* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
 		 * one originator left in the list and we previously received a
-- 
cgit v1.2.3


From dffd22aed2aa1e804bccf19b30a421e89ee2ae61 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 20 Jun 2018 18:33:45 +0200
Subject: netfilter: nf_log: fix uninit read in nf_log_proc_dostring

When proc_dostring() is called with a non-zero offset in strict mode, it
doesn't just write to the ->data buffer, it also reads. Make sure it
doesn't read uninitialized data.

Fixes: c6ac37d8d884 ("netfilter: nf_log: fix error on write NONE to [...]")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_log.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 426457047578..2c47f9ec3511 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -424,6 +424,10 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
 	if (write) {
 		struct ctl_table tmp = *table;
 
+		/* proc_dostring() can append to existing strings, so we need to
+		 * initialize it as an empty string.
+		 */
+		buf[0] = '\0';
 		tmp.data = buf;
 		r = proc_dostring(&tmp, write, buffer, lenp, ppos);
 		if (r)
-- 
cgit v1.2.3


From ce00bf07cc95a57cd20b208e02b3c2604e532ae8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 25 Jun 2018 17:22:00 +0200
Subject: netfilter: nf_log: don't hold nf_log_mutex during user access

The old code would indefinitely block other users of nf_log_mutex if
a userspace access in proc_dostring() blocked e.g. due to a userfaultfd
region. Fix it by moving proc_dostring() out of the locked region.

This is a followup to commit 266d07cb1c9a ("netfilter: nf_log: fix
sleeping function called from invalid context"), which changed this code
from using rcu_read_lock() to taking nf_log_mutex.

Fixes: 266d07cb1c9a ("netfilter: nf_log: fix sleeping function calle[...]")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_log.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 2c47f9ec3511..a61d6df6e5f6 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -446,14 +446,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
 		rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
 		mutex_unlock(&nf_log_mutex);
 	} else {
+		struct ctl_table tmp = *table;
+
+		tmp.data = buf;
 		mutex_lock(&nf_log_mutex);
 		logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
 		if (!logger)
-			table->data = "NONE";
+			strlcpy(buf, "NONE", sizeof(buf));
 		else
-			table->data = logger->name;
-		r = proc_dostring(table, write, buffer, lenp, ppos);
+			strlcpy(buf, logger->name, sizeof(buf));
 		mutex_unlock(&nf_log_mutex);
+		r = proc_dostring(&tmp, write, buffer, lenp, ppos);
 	}
 
 	return r;
-- 
cgit v1.2.3


From b36e4523d4d56e2595e28f16f6ccf1cd6a9fc452 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Jun 2018 23:32:26 +0200
Subject: netfilter: nf_conncount: fix garbage collection confirm race

Yi-Hung Wei and Justin Pettit found a race in the garbage collection scheme
used by nf_conncount.

When doing list walk, we lookup the tuple in the conntrack table.
If the lookup fails we remove this tuple from our list because
the conntrack entry is gone.

This is the common cause, but turns out its not the only one.
The list entry could have been created just before by another cpu, i.e. the
conntrack entry might not yet have been inserted into the global hash.

The avoid this, we introduce a timestamp and the owning cpu.
If the entry appears to be stale, evict only if:
 1. The current cpu is the one that added the entry, or,
 2. The timestamp is older than two jiffies

The second constraint allows GC to be taken over by other
cpu too (e.g. because a cpu was offlined or napi got moved to another
cpu).

We can't pretend the 'doubtful' entry wasn't in our list.
Instead, when we don't find an entry indicate via IS_ERR
that entry was removed ('did not exist' or withheld
('might-be-unconfirmed').

This most likely also fixes a xt_connlimit imbalance earlier reported by
Dmitry Andrianov.

Cc: Dmitry Andrianov <dmitry.andrianov@alertme.com>
Reported-by: Justin Pettit <jpettit@vmware.com>
Reported-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 52 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index d8383609fe28..510039862aa9 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -47,6 +47,8 @@ struct nf_conncount_tuple {
 	struct hlist_node		node;
 	struct nf_conntrack_tuple	tuple;
 	struct nf_conntrack_zone	zone;
+	int				cpu;
+	u32				jiffies32;
 };
 
 struct nf_conncount_rb {
@@ -91,11 +93,42 @@ bool nf_conncount_add(struct hlist_head *head,
 		return false;
 	conn->tuple = *tuple;
 	conn->zone = *zone;
+	conn->cpu = raw_smp_processor_id();
+	conn->jiffies32 = (u32)jiffies;
 	hlist_add_head(&conn->node, head);
 	return true;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_add);
 
+static const struct nf_conntrack_tuple_hash *
+find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
+{
+	const struct nf_conntrack_tuple_hash *found;
+	unsigned long a, b;
+	int cpu = raw_smp_processor_id();
+	__s32 age;
+
+	found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
+	if (found)
+		return found;
+	b = conn->jiffies32;
+	a = (u32)jiffies;
+
+	/* conn might have been added just before by another cpu and
+	 * might still be unconfirmed.  In this case, nf_conntrack_find()
+	 * returns no result.  Thus only evict if this cpu added the
+	 * stale entry or if the entry is older than two jiffies.
+	 */
+	age = a - b;
+	if (conn->cpu == cpu || age >= 2) {
+		hlist_del(&conn->node);
+		kmem_cache_free(conncount_conn_cachep, conn);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return ERR_PTR(-EAGAIN);
+}
+
 unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 				 const struct nf_conntrack_tuple *tuple,
 				 const struct nf_conntrack_zone *zone,
@@ -103,18 +136,27 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn;
-	struct hlist_node *n;
 	struct nf_conn *found_ct;
+	struct hlist_node *n;
 	unsigned int length = 0;
 
 	*addit = tuple ? true : false;
 
 	/* check the saved connections */
 	hlist_for_each_entry_safe(conn, n, head, node) {
-		found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
-		if (found == NULL) {
-			hlist_del(&conn->node);
-			kmem_cache_free(conncount_conn_cachep, conn);
+		found = find_or_evict(net, conn);
+		if (IS_ERR(found)) {
+			/* Not found, but might be about to be confirmed */
+			if (PTR_ERR(found) == -EAGAIN) {
+				length++;
+				if (!tuple)
+					continue;
+
+				if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+				    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
+				    nf_ct_zone_id(zone, zone->dir))
+					*addit = false;
+			}
 			continue;
 		}
 
-- 
cgit v1.2.3


From c809195f5523dd4d09403bbb1c9732d548aa0d1e Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Mon, 25 Jun 2018 06:41:25 -0700
Subject: rds: clean up loopback rds_connections on netns deletion

The RDS core module creates rds_connections based on callbacks
from rds_loop_transport when sending/receiving packets to local
addresses.

These connections will need to be cleaned up when they are
created from a netns that is not init_net, and that netns is deleted.

Add the changes aligned with the changes from
commit ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize
netns/module teardown and rds connection/workq management") for
rds_loop_transport

Reported-and-tested-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c | 11 ++++++++++-
 net/rds/loop.c       | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/rds/loop.h       |  2 ++
 3 files changed, 68 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index abef75da89a7..cfb05953b0e5 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -659,11 +659,19 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 
 int rds_conn_init(void)
 {
+	int ret;
+
+	ret = rds_loop_net_init(); /* register pernet callback */
+	if (ret)
+		return ret;
+
 	rds_conn_slab = kmem_cache_create("rds_connection",
 					  sizeof(struct rds_connection),
 					  0, 0, NULL);
-	if (!rds_conn_slab)
+	if (!rds_conn_slab) {
+		rds_loop_net_exit();
 		return -ENOMEM;
+	}
 
 	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
 	rds_info_register_func(RDS_INFO_SEND_MESSAGES,
@@ -676,6 +684,7 @@ int rds_conn_init(void)
 
 void rds_conn_exit(void)
 {
+	rds_loop_net_exit(); /* unregister pernet callback */
 	rds_loop_exit();
 
 	WARN_ON(!hlist_empty(rds_conn_hash));
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dac6218a460e..feea1f96ee2a 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -33,6 +33,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/in.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
@@ -40,6 +42,17 @@
 
 static DEFINE_SPINLOCK(loop_conns_lock);
 static LIST_HEAD(loop_conns);
+static atomic_t rds_loop_unloading = ATOMIC_INIT(0);
+
+static void rds_loop_set_unloading(void)
+{
+	atomic_set(&rds_loop_unloading, 1);
+}
+
+static bool rds_loop_is_unloading(struct rds_connection *conn)
+{
+	return atomic_read(&rds_loop_unloading) != 0;
+}
 
 /*
  * This 'loopback' transport is a special case for flows that originate
@@ -165,6 +178,8 @@ void rds_loop_exit(void)
 	struct rds_loop_connection *lc, *_lc;
 	LIST_HEAD(tmp_list);
 
+	rds_loop_set_unloading();
+	synchronize_rcu();
 	/* avoid calling conn_destroy with irqs off */
 	spin_lock_irq(&loop_conns_lock);
 	list_splice(&loop_conns, &tmp_list);
@@ -177,6 +192,46 @@ void rds_loop_exit(void)
 	}
 }
 
+static void rds_loop_kill_conns(struct net *net)
+{
+	struct rds_loop_connection *lc, *_lc;
+	LIST_HEAD(tmp_list);
+
+	spin_lock_irq(&loop_conns_lock);
+	list_for_each_entry_safe(lc, _lc, &loop_conns, loop_node)  {
+		struct net *c_net = read_pnet(&lc->conn->c_net);
+
+		if (net != c_net)
+			continue;
+		list_move_tail(&lc->loop_node, &tmp_list);
+	}
+	spin_unlock_irq(&loop_conns_lock);
+
+	list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
+		WARN_ON(lc->conn->c_passive);
+		rds_conn_destroy(lc->conn);
+	}
+}
+
+static void __net_exit rds_loop_exit_net(struct net *net)
+{
+	rds_loop_kill_conns(net);
+}
+
+static struct pernet_operations rds_loop_net_ops = {
+	.exit = rds_loop_exit_net,
+};
+
+int rds_loop_net_init(void)
+{
+	return register_pernet_device(&rds_loop_net_ops);
+}
+
+void rds_loop_net_exit(void)
+{
+	unregister_pernet_device(&rds_loop_net_ops);
+}
+
 /*
  * This is missing .xmit_* because loop doesn't go through generic
  * rds_send_xmit() and doesn't call rds_recv_incoming().  .listen_stop and
@@ -194,4 +249,5 @@ struct rds_transport rds_loop_transport = {
 	.inc_free		= rds_loop_inc_free,
 	.t_name			= "loopback",
 	.t_type			= RDS_TRANS_LOOP,
+	.t_unloading		= rds_loop_is_unloading,
 };
diff --git a/net/rds/loop.h b/net/rds/loop.h
index 469fa4b2da4f..bbc8cdd030df 100644
--- a/net/rds/loop.h
+++ b/net/rds/loop.h
@@ -5,6 +5,8 @@
 /* loop.c */
 extern struct rds_transport rds_loop_transport;
 
+int rds_loop_net_init(void);
+void rds_loop_net_exit(void);
 void rds_loop_exit(void);
 
 #endif
-- 
cgit v1.2.3


From 7c8f4e6dc30996bff806285730a0bb4e714d3d52 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 26 Jun 2018 01:39:32 +0200
Subject: fib_rules: match rules based on suppress_* properties too

Two rules with different values of suppress_prefix or suppress_ifgroup
are not the same. This fixes an -EEXIST when running:

   $ ip -4 rule add table main suppress_prefixlength 0

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 126ffc5bc630..bc8425d81022 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -416,6 +416,14 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
 		if (rule->mark && r->mark != rule->mark)
 			continue;
 
+		if (rule->suppress_ifgroup != -1 &&
+		    r->suppress_ifgroup != rule->suppress_ifgroup)
+			continue;
+
+		if (rule->suppress_prefixlen != -1 &&
+		    r->suppress_prefixlen != rule->suppress_prefixlen)
+			continue;
+
 		if (rule->mark_mask && r->mark_mask != rule->mark_mask)
 			continue;
 
-- 
cgit v1.2.3


From 88e85a7daf8e21f2d6cb054374d480c540725cde Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 26 Jun 2018 12:55:35 +0900
Subject: bpfilter: check compiler capability in Kconfig

With the brand-new syntax extension of Kconfig, we can directly
check the compiler capability in the configuration phase.

If the cc-can-link.sh fails, the BPFILTER_UMH is automatically
hidden by the dependency.

I also deleted 'default n', which is no-op.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Makefile               | 5 -----
 net/Makefile           | 4 ----
 net/bpfilter/Kconfig   | 2 +-
 scripts/cc-can-link.sh | 2 +-
 4 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/Makefile b/Makefile
index c9132594860b..e1bd4c3627bc 100644
--- a/Makefile
+++ b/Makefile
@@ -507,11 +507,6 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLA
   KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/cc-can-link.sh $(CC)), y)
-  CC_CAN_LINK := y
-  export CC_CAN_LINK
-endif
-
 # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included.
 # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile.
 # CC_VERSION_TEXT is referenced from Kconfig (so it needs export),
diff --git a/net/Makefile b/net/Makefile
index 13ec0d5415c7..bdaf53925acd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,11 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
-ifneq ($(CC_CAN_LINK),y)
-$(warning CC cannot link executables. Skipping bpfilter.)
-else
 obj-$(CONFIG_BPFILTER)		+= bpfilter/
-endif
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index a948b072c28f..76deb6615883 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -1,6 +1,5 @@
 menuconfig BPFILTER
 	bool "BPF based packet filtering framework (BPFILTER)"
-	default n
 	depends on NET && BPF && INET
 	help
 	  This builds experimental bpfilter framework that is aiming to
@@ -9,6 +8,7 @@ menuconfig BPFILTER
 if BPFILTER
 config BPFILTER_UMH
 	tristate "bpfilter kernel module with user mode helper"
+	depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC))
 	default m
 	help
 	  This builds bpfilter kernel module with embedded user mode helper
diff --git a/scripts/cc-can-link.sh b/scripts/cc-can-link.sh
index 208eb2825dab..6efcead31989 100755
--- a/scripts/cc-can-link.sh
+++ b/scripts/cc-can-link.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
-cat << "END" | $@ -x c - -o /dev/null >/dev/null 2>&1 && echo "y"
+cat << "END" | $@ -x c - -o /dev/null >/dev/null 2>&1
 #include <stdio.h>
 int main(void)
 {
-- 
cgit v1.2.3


From 977c7114ebda2e746a114840d3a875e0cdb826fb Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes <doronrk@fb.com>
Date: Tue, 26 Jun 2018 18:33:33 -0700
Subject: strparser: Remove early eaten to fix full tcp receive buffer stall

On receving an incomplete message, the existing code stores the
remaining length of the cloned skb in the early_eaten field instead of
incrementing the value returned by __strp_recv. This defers invocation
of sock_rfree for the current skb until the next invocation of
__strp_recv, which returns early_eaten if early_eaten is non-zero.

This behavior causes a stall when the current message occupies the very
tail end of a massive skb, and strp_peek/need_bytes indicates that the
remainder of the current message has yet to arrive on the socket. The
TCP receive buffer is totally full, causing the TCP window to go to
zero, so the remainder of the message will never arrive.

Incrementing the value returned by __strp_recv by the amount otherwise
stored in early_eaten prevents stalls of this nature.

Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/strparser/strparser.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 373836615c57..625acb27efcc 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -35,7 +35,6 @@ struct _strp_msg {
 	 */
 	struct strp_msg strp;
 	int accum_len;
-	int early_eaten;
 };
 
 static inline struct _strp_msg *_strp_msg(struct sk_buff *skb)
@@ -115,20 +114,6 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 	head = strp->skb_head;
 	if (head) {
 		/* Message already in progress */
-
-		stm = _strp_msg(head);
-		if (unlikely(stm->early_eaten)) {
-			/* Already some number of bytes on the receive sock
-			 * data saved in skb_head, just indicate they
-			 * are consumed.
-			 */
-			eaten = orig_len <= stm->early_eaten ?
-				orig_len : stm->early_eaten;
-			stm->early_eaten -= eaten;
-
-			return eaten;
-		}
-
 		if (unlikely(orig_offset)) {
 			/* Getting data with a non-zero offset when a message is
 			 * in progress is not expected. If it does happen, we
@@ -297,9 +282,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				}
 
 				stm->accum_len += cand_len;
+				eaten += cand_len;
 				strp->need_bytes = stm->strp.full_len -
 						       stm->accum_len;
-				stm->early_eaten = cand_len;
 				STRP_STATS_ADD(strp->stats.bytes, cand_len);
 				desc->count = 0; /* Stop reading socket */
 				break;
-- 
cgit v1.2.3


From 8e75887d321d102200abf3a9fa621e2c10ff4cc5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 26 Jun 2018 20:13:48 -0700
Subject: bpfilter: include bpfilter_umh in assembly instead of using objcopy

What we want here is to embed a user-space program into the kernel.
Instead of the complex ELF magic, let's simply wrap it in the assembly
with the '.incbin' directive.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/Makefile            | 17 ++---------------
 net/bpfilter/bpfilter_kern.c     | 11 +++++------
 net/bpfilter/bpfilter_umh_blob.S |  7 +++++++
 3 files changed, 14 insertions(+), 21 deletions(-)
 create mode 100644 net/bpfilter/bpfilter_umh_blob.S

(limited to 'net')

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index 051dc18b8ccb..39c6980b5d99 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -15,20 +15,7 @@ ifeq ($(CONFIG_BPFILTER_UMH), y)
 HOSTLDFLAGS += -static
 endif
 
-# a bit of elf magic to convert bpfilter_umh binary into a binary blob
-# inside bpfilter_umh.o elf file referenced by
-# _binary_net_bpfilter_bpfilter_umh_start symbol
-# which bpfilter_kern.c passes further into umh blob loader at run-time
-quiet_cmd_copy_umh = GEN $@
-      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
-      $(OBJCOPY) -I binary \
-          `LC_ALL=C $(OBJDUMP) -f net/bpfilter/bpfilter_umh \
-          |awk -F' |,' '/file format/{print "-O",$$NF} \
-          /^architecture:/{print "-B",$$2}'` \
-      --rename-section .data=.init.rodata $< $@
-
-$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
-	$(call cmd,copy_umh)
+$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
 
 obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
-bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 09522573f611..f0fc182d3db7 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -10,11 +10,8 @@
 #include <linux/file.h>
 #include "msgfmt.h"
 
-#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
-#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
-
-extern char UMH_start;
-extern char UMH_end;
+extern char bpfilter_umh_start;
+extern char bpfilter_umh_end;
 
 static struct umh_info info;
 /* since ip_getsockopt() can run in parallel, serialize access to umh */
@@ -93,7 +90,9 @@ static int __init load_umh(void)
 	int err;
 
 	/* fork usermode process */
-	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+	err = fork_usermode_blob(&bpfilter_umh_start,
+				 &bpfilter_umh_end - &bpfilter_umh_start,
+				 &info);
 	if (err)
 		return err;
 	pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
new file mode 100644
index 000000000000..40311d10d2f2
--- /dev/null
+++ b/net/bpfilter/bpfilter_umh_blob.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+	.section .init.rodata, "a"
+	.global bpfilter_umh_start
+bpfilter_umh_start:
+	.incbin "net/bpfilter/bpfilter_umh"
+	.global bpfilter_umh_end
+bpfilter_umh_end:
-- 
cgit v1.2.3


From 15ecbe94a45ef88491ca459b26efdd02f91edb6d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Jun 2018 08:47:21 -0700
Subject: tcp: add one more quick ack after after ECN events

Larry Brakmo proposal ( https://patchwork.ozlabs.org/patch/935233/
tcp: force cwnd at least 2 in tcp_cwnd_reduction) made us rethink
about our recent patch removing ~16 quick acks after ECN events.

tcp_enter_quickack_mode(sk, 1) makes sure one immediate ack is sent,
but in the case the sender cwnd was lowered to 1, we do not want
to have a delayed ack for the next packet we will receive.

Fixes: 522040ea5fdd ("tcp: do not aggressively quick ack after ECN events")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Cc: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 355d3dffd021..045d930d01a9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -265,7 +265,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode(sk, 1);
+			tcp_enter_quickack_mode(sk, 2);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn(sk))
@@ -273,7 +273,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode(sk, 1);
+			tcp_enter_quickack_mode(sk, 2);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
-- 
cgit v1.2.3


From 24ac3a08e65845a098140ff270229dec4a897404 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 27 Jun 2018 17:59:50 +0200
Subject: net/smc: rebuild nonblocking connect

The recent poll change may lead to stalls for non-blocking connecting
SMC sockets, since sock_poll_wait is no longer performed on the
internal CLC socket, but on the outer SMC socket.  kernel_connect() on
the internal CLC socket returns with -EINPROGRESS, but the wake up
logic does not work in all cases. If the internal CLC socket is still
in state TCP_SYN_SENT when polled, sock_poll_wait() from sock_poll()
does not sleep. It is supposed to sleep till the state of the internal
CLC socket switches to TCP_ESTABLISHED.

This problem triggered a redesign of the SMC nonblocking connect logic.
This patch introduces a connect worker covering all connect steps
followed by a wake up of socket waiters. It allows to get rid of all
delays and locks in smc_poll().

Fixes: c0129a061442 ("smc: convert to ->poll_mask")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 91 +++++++++++++++++++++++++++++++++++++++-----------------
 net/smc/smc.h    |  8 +++++
 2 files changed, 71 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..4d1d4ddb2f49 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -45,6 +45,7 @@ static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 */
 
 static void smc_tcp_listen_work(struct work_struct *);
+static void smc_connect_work(struct work_struct *);
 
 static void smc_set_keepalive(struct sock *sk, int val)
 {
@@ -122,6 +123,12 @@ static int smc_release(struct socket *sock)
 		goto out;
 
 	smc = smc_sk(sk);
+
+	/* cleanup for a dangling non-blocking connect */
+	flush_work(&smc->connect_work);
+	kfree(smc->connect_info);
+	smc->connect_info = NULL;
+
 	if (sk->sk_state == SMC_LISTEN)
 		/* smc_close_non_accepted() is called and acquires
 		 * sock lock for child sockets again
@@ -186,6 +193,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 	sk->sk_protocol = protocol;
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_WORK(&smc->connect_work, smc_connect_work);
 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
@@ -576,6 +584,35 @@ static int __smc_connect(struct smc_sock *smc)
 	return 0;
 }
 
+static void smc_connect_work(struct work_struct *work)
+{
+	struct smc_sock *smc = container_of(work, struct smc_sock,
+					    connect_work);
+	int rc;
+
+	lock_sock(&smc->sk);
+	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
+			    smc->connect_info->alen, smc->connect_info->flags);
+	if (smc->clcsock->sk->sk_err) {
+		smc->sk.sk_err = smc->clcsock->sk->sk_err;
+		goto out;
+	}
+	if (rc < 0) {
+		smc->sk.sk_err = -rc;
+		goto out;
+	}
+
+	rc = __smc_connect(smc);
+	if (rc < 0)
+		smc->sk.sk_err = -rc;
+
+out:
+	smc->sk.sk_state_change(&smc->sk);
+	kfree(smc->connect_info);
+	smc->connect_info = NULL;
+	release_sock(&smc->sk);
+}
+
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		       int alen, int flags)
 {
@@ -605,15 +642,32 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 
 	smc_copy_sock_settings_to_clc(smc);
 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
-	rc = kernel_connect(smc->clcsock, addr, alen, flags);
-	if (rc)
-		goto out;
+	if (flags & O_NONBLOCK) {
+		if (smc->connect_info) {
+			rc = -EALREADY;
+			goto out;
+		}
+		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
+		if (!smc->connect_info) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		smc->connect_info->alen = alen;
+		smc->connect_info->flags = flags ^ O_NONBLOCK;
+		memcpy(&smc->connect_info->addr, addr, alen);
+		schedule_work(&smc->connect_work);
+		rc = -EINPROGRESS;
+	} else {
+		rc = kernel_connect(smc->clcsock, addr, alen, flags);
+		if (rc)
+			goto out;
 
-	rc = __smc_connect(smc);
-	if (rc < 0)
-		goto out;
-	else
-		rc = 0; /* success cases including fallback */
+		rc = __smc_connect(smc);
+		if (rc < 0)
+			goto out;
+		else
+			rc = 0; /* success cases including fallback */
+	}
 
 out:
 	release_sock(sk);
@@ -1278,34 +1332,17 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 	struct sock *sk = sock->sk;
 	__poll_t mask = 0;
 	struct smc_sock *smc;
-	int rc;
 
 	if (!sk)
 		return EPOLLNVAL;
 
 	smc = smc_sk(sock->sk);
-	sock_hold(sk);
-	lock_sock(sk);
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
-		release_sock(sk);
 		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
-		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
-		if (sk->sk_err) {
+		if (sk->sk_err)
 			mask |= EPOLLERR;
-		} else {
-			/* if non-blocking connect finished ... */
-			if (sk->sk_state == SMC_INIT &&
-			    mask & EPOLLOUT &&
-			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
-				rc = __smc_connect(smc);
-				if (rc < 0)
-					mask |= EPOLLERR;
-				/* success cases including fallback */
-				mask |= EPOLLOUT | EPOLLWRNORM;
-			}
-		}
 	} else {
 		if (sk->sk_err)
 			mask |= EPOLLERR;
@@ -1334,8 +1371,6 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			mask |= EPOLLPRI;
 
 	}
-	release_sock(sk);
-	sock_put(sk);
 
 	return mask;
 }
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 51ae1f10d81a..d7ca26570482 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -187,11 +187,19 @@ struct smc_connection {
 	struct work_struct	close_work;	/* peer sent some closing */
 };
 
+struct smc_connect_info {
+	int			flags;
+	int			alen;
+	struct sockaddr		addr;
+};
+
 struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
 	struct smc_connection	conn;		/* smc connection */
 	struct smc_sock		*listen_smc;	/* listen parent */
+	struct smc_connect_info *connect_info;	/* connect address & flags */
+	struct work_struct	connect_work;	/* handle non-blocking connect*/
 	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
-- 
cgit v1.2.3


From 4c79579b44b1876444f4d04de31c1a37098a0350 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 26 Jun 2018 16:21:18 -0700
Subject: bpf: Change bpf_fib_lookup to return lookup status

For ACLs implemented using either FIB rules or FIB entries, the BPF
program needs the FIB lookup status to be able to drop the packet.
Since the bpf_fib_lookup API has not reached a released kernel yet,
change the return code to contain an encoding of the FIB lookup
result and return the nexthop device index in the params struct.

In addition, inform the BPF program of any post FIB lookup reason as
to why the packet needs to go up the stack.

The fib result for unicast routes must have an egress device, so remove
the check that it is non-NULL.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h   | 28 ++++++++++++---
 net/core/filter.c          | 86 +++++++++++++++++++++++++++++-----------------
 samples/bpf/xdp_fwd_kern.c |  8 ++---
 3 files changed, 81 insertions(+), 41 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..b7db3261c62d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1857,7 +1857,8 @@ union bpf_attr {
  *		is resolved), the nexthop address is returned in ipv4_dst
  *		or ipv6_dst based on family, smac is set to mac address of
  *		egress device, dmac is set to nexthop mac address, rt_metric
- *		is set to metric from route (IPv4/IPv6 only).
+ *		is set to metric from route (IPv4/IPv6 only), and ifindex
+ *		is set to the device index of the nexthop from the FIB lookup.
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -1873,9 +1874,10 @@ union bpf_attr {
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
  *     Return
- *             Egress device index on success, 0 if packet needs to continue
- *             up the stack for further processing or a negative error in case
- *             of failure.
+ *		* < 0 if any input argument is invalid
+ *		*   0 on success (packet is forwarded, nexthop neighbor exists)
+ *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ *		*     packet is not forwarded or needs assist from full stack
  *
  * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
@@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
+enum {
+	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
+	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed; can be dropped */
+	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable; can be dropped */
+	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed; can be dropped */
+	BPF_FIB_LKUP_RET_NOT_FWDED,    /* packet is not forwarded */
+	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
+	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
+	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+};
+
 struct bpf_fib_lookup {
 	/* input:  network family for lookup (AF_INET, AF_INET6)
 	 * output: network family of egress nexthop
@@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
 
 	/* total length of packet from network header - used for MTU check */
 	__u16	tot_len;
-	__u32	ifindex;  /* L3 device index for lookup */
+
+	/* input: L3 device index for lookup
+	 * output: device index from FIB lookup
+	 */
+	__u32	ifindex;
 
 	union {
 		/* inputs to lookup */
diff --git a/net/core/filter.c b/net/core/filter.c
index e7f12e9f598c..0ca6907d7efe 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
 	params->h_vlan_TCI = 0;
 	params->h_vlan_proto = 0;
+	params->ifindex = dev->ifindex;
 
-	return dev->ifindex;
+	return 0;
 }
 #endif
 
@@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	/* verify forwarding is enabled on this interface */
 	in_dev = __in_dev_get_rcu(dev);
 	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl4.flowi4_iif = 1;
@@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = fib_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 	} else {
@@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
 	}
 
-	if (err || res.type != RTN_UNICAST)
-		return 0;
+	if (err) {
+		/* map fib lookup errors to RTN_ type */
+		if (err == -EINVAL)
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		if (err == -EHOSTUNREACH)
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		if (err == -EACCES)
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+	}
+
+	if (res.type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (res.fi->fib_nhs > 1)
 		fib_select_path(net, &res, &fl4, NULL);
@@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	nh = &res.fi->fib_nh[res.nh_sel];
 
 	/* do not handle lwt encaps right now */
 	if (nh->nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	dev = nh->nh_dev;
-	if (unlikely(!dev))
-		return 0;
-
 	if (nh->nh_gw)
 		params->ipv4_dst = nh->nh_gw;
 
@@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 * rcu_read_lock_bh is not needed here
 	 */
 	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
@@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	/* link local addresses are never forwarded */
 	if (rt6_need_strict(dst) || rt6_need_strict(src))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	dev = dev_get_by_index_rcu(net, params->ifindex);
 	if (unlikely(!dev))
@@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 	idev = __in6_dev_get_safely(dev);
 	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
-		return 0;
+		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
 		fl6.flowi6_iif = 1;
@@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		tb = ipv6_stub->fib6_get_table(net, tbid);
 		if (unlikely(!tb))
-			return 0;
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
 	} else {
@@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	}
 
 	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
-		return 0;
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
+		switch (f6i->fib6_type) {
+		case RTN_BLACKHOLE:
+			return BPF_FIB_LKUP_RET_BLACKHOLE;
+		case RTN_UNREACHABLE:
+			return BPF_FIB_LKUP_RET_UNREACHABLE;
+		case RTN_PROHIBIT:
+			return BPF_FIB_LKUP_RET_PROHIBIT;
+		default:
+			return BPF_FIB_LKUP_RET_NOT_FWDED;
+		}
+	}
 
-	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
-	    f6i->fib6_type != RTN_UNICAST))
-		return 0;
+	if (f6i->fib6_type != RTN_UNICAST)
+		return BPF_FIB_LKUP_RET_NOT_FWDED;
 
 	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
 		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
@@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (check_mtu) {
 		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
 		if (params->tot_len > mtu)
-			return 0;
+			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
 	if (f6i->fib6_nh.nh_lwtstate)
-		return 0;
+		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	if (f6i->fib6_flags & RTF_GATEWAY)
 		*dst = f6i->fib6_nh.nh_gw;
@@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 */
 	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
 				      ndisc_hashfn, dst, dev);
-	if (neigh)
-		return bpf_fib_set_fwd_params(params, neigh, dev);
+	if (!neigh)
+		return BPF_FIB_LKUP_RET_NO_NEIGH;
 
-	return 0;
+	return bpf_fib_set_fwd_params(params, neigh, dev);
 }
 #endif
 
@@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
-	int index = -EAFNOSUPPORT;
+	int rc = -EAFNOSUPPORT;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
@@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		index = bpf_ipv4_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv4_fib_lookup(net, params, flags, false);
 		break;
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		index = bpf_ipv6_fib_lookup(net, params, flags, false);
+		rc = bpf_ipv6_fib_lookup(net, params, flags, false);
 		break;
 #endif
 	}
 
-	if (index > 0) {
+	if (!rc) {
 		struct net_device *dev;
 
-		dev = dev_get_by_index_rcu(net, index);
+		dev = dev_get_by_index_rcu(net, params->ifindex);
 		if (!is_skb_forwardable(dev, skb))
-			index = 0;
+			rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
-	return index;
+	return rc;
 }
 
 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 6673cdb9f55c..a7e94e7ff87d 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	struct ethhdr *eth = data;
 	struct ipv6hdr *ip6h;
 	struct iphdr *iph;
-	int out_index;
 	u16 h_proto;
 	u64 nh_off;
+	int rc;
 
 	nh_off = sizeof(*eth);
 	if (data + nh_off > data_end)
@@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 	fib_params.ifindex = ctx->ingress_ifindex;
 
-	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
 
 	/* verify egress index has xdp support
 	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
@@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	 * NOTE: without verification that egress index supports XDP
 	 *       forwarding packets are dropped.
 	 */
-	if (out_index > 0) {
+	if (rc == 0) {
 		if (h_proto == htons(ETH_P_IP))
 			ip_decrease_ttl(iph);
 		else if (h_proto == htons(ETH_P_IPV6))
@@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
 		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
-		return bpf_redirect_map(&tx_port, out_index, 0);
+		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
 	}
 
 	return XDP_PASS;
-- 
cgit v1.2.3


From e7441c9274a6a5453e06f4c2b8b5f72eca0a3f17 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Tue, 19 Jun 2018 10:39:50 -0500
Subject: mac80211: disable BHs/preemption in ieee80211_tx_control_port()

On pre-emption enabled kernels the following print was being seen due to
missing local_bh_disable/local_bh_enable calls.  mac80211 assumes that
pre-emption is disabled in the data path.

    BUG: using smp_processor_id() in preemptible [00000000] code: iwd/517
    caller is __ieee80211_subif_start_xmit+0x144/0x210 [mac80211]
    [...]
    Call Trace:
    dump_stack+0x5c/0x80
    check_preemption_disabled.cold.0+0x46/0x51
    __ieee80211_subif_start_xmit+0x144/0x210 [mac80211]

Fixes: 911806491425 ("mac80211: Add support for tx_control_port")
Signed-off-by: Denis Kenzior <denkenz@gmail.com>
[commit message rewrite, fixes tag]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 44b5dfe8727d..fa1f1e63a264 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4845,7 +4845,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
 	skb_reset_network_header(skb);
 	skb_reset_mac_header(skb);
 
+	local_bh_disable();
 	__ieee80211_subif_start_xmit(skb, skb->dev, flags);
+	local_bh_enable();
 
 	return 0;
 }
-- 
cgit v1.2.3


From 188f60ab8e787fcbb5ac9d64ede23a0070231f09 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Sun, 24 Jun 2018 21:10:49 -0400
Subject: nl80211: relax ht operation checks for mesh

Commit 9757235f451c, "nl80211: correct checks for
NL80211_MESHCONF_HT_OPMODE value") relaxed the range for the HT
operation field in meshconf, while also adding checks requiring
the non-greenfield and non-ht-sta bits to be set in certain
circumstances.  The latter bit is actually reserved for mesh BSSes
according to Table 9-168 in 802.11-2016, so in fact it should not
be set.

wpa_supplicant sets these bits because the mesh and AP code share
the same implementation, but authsae does not.  As a result, some
meshconf updates from authsae which set only the NONHT_MIXED
protection bits were being rejected.

In order to avoid breaking userspace by changing the rules again,
simply accept the values with or without the bits set, and mask
off the reserved bit to match the spec.

While in here, update the 802.11-2012 reference to 802.11-2016.

Fixes: 9757235f451c ("nl80211: correct checks for NL80211_MESHCONF_HT_OPMODE value")
Cc: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Bob Copeland <bobcopeland@fb.com>
Reviewed-by: Masashi Honma <masashi.honma@gmail.com>
Reviewed-by: Masashi Honma <masashi.honma@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c7bbe5f0aae8..351eeaf16abe 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6231,7 +6231,7 @@ do {									    \
 				  nl80211_check_s32);
 	/*
 	 * Check HT operation mode based on
-	 * IEEE 802.11 2012 8.4.2.59 HT Operation element.
+	 * IEEE 802.11-2016 9.4.2.57 HT Operation element.
 	 */
 	if (tb[NL80211_MESHCONF_HT_OPMODE]) {
 		ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]);
@@ -6241,22 +6241,9 @@ do {									    \
 				  IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
 			return -EINVAL;
 
-		if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) &&
-		    (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
-			return -EINVAL;
+		/* NON_HT_STA bit is reserved, but some programs set it */
+		ht_opmode &= ~IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
 
-		switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) {
-		case IEEE80211_HT_OP_MODE_PROTECTION_NONE:
-		case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ:
-			if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)
-				return -EINVAL;
-			break;
-		case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER:
-		case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED:
-			if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
-				return -EINVAL;
-			break;
-		}
 		cfg->ht_opmode = ht_opmode;
 		mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
 	}
-- 
cgit v1.2.3


From 95bca62fb723a121954fc7ae5473bb2c1f0d5986 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 29 Jun 2018 09:33:39 +0200
Subject: nl80211: check nla_parse_nested() return values

At the very least we should check the return value if
nla_parse_nested() is called with a non-NULL policy.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 351eeaf16abe..4eece06be1e7 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10949,9 +10949,12 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 				    rem) {
 			u8 *mask_pat;
 
-			nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
-					 nl80211_packet_pattern_policy,
-					 info->extack);
+			err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+					       nl80211_packet_pattern_policy,
+					       info->extack);
+			if (err)
+				goto error;
+
 			err = -EINVAL;
 			if (!pat_tb[NL80211_PKTPAT_MASK] ||
 			    !pat_tb[NL80211_PKTPAT_PATTERN])
@@ -11200,8 +11203,11 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
 			    rem) {
 		u8 *mask_pat;
 
-		nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
-				 nl80211_packet_pattern_policy, NULL);
+		err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+				       nl80211_packet_pattern_policy, NULL);
+		if (err)
+			return err;
+
 		if (!pat_tb[NL80211_PKTPAT_MASK] ||
 		    !pat_tb[NL80211_PKTPAT_PATTERN])
 			return -EINVAL;
-- 
cgit v1.2.3


From e699e2c6a654ff8d7303f5297ab5dd83da7b23e0 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 27 Jun 2018 15:16:42 -0700
Subject: net, mm: account sock objects to kmemcg

Currently the kernel accounts the memory for network traffic through
mem_cgroup_[un]charge_skmem() interface. However the memory accounted
only includes the truesize of sk_buff which does not include the size of
sock objects. In our production environment, with opt-out kmem
accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are
among the top most charged kmem caches and consume a significant amount
of memory which can not be left as system overhead. So, this patch
converts the kmem caches of all sock objects to SLAB_ACCOUNT.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index bcc41829a16d..9e8f65585b81 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3243,7 +3243,8 @@ static int req_prot_init(const struct proto *prot)
 
 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
 					   rsk_prot->obj_size, 0,
-					   prot->slab_flags, NULL);
+					   SLAB_ACCOUNT | prot->slab_flags,
+					   NULL);
 
 	if (!rsk_prot->slab) {
 		pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -3258,7 +3259,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 					prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+					prot->slab_flags,
 					prot->useroffset, prot->usersize,
 					NULL);
 
@@ -3281,6 +3283,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0,
+						  SLAB_ACCOUNT |
 						  prot->slab_flags,
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
-- 
cgit v1.2.3


From c860e997e9170a6d68f9d1e6e2cf61f572191aaf Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 27 Jun 2018 16:04:48 -0700
Subject: tcp: fix Fast Open key endianness

Fast Open key could be stored in different endian based on the CPU.
Previously hosts in different endianness in a server farm using
the same key config (sysctl value) would produce different cookies.
This patch fixes it by always storing it as little endian to keep
same API for LE hosts.

Reported-by: Daniele Iamartino <danielei@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d06247ba08b2..af0a857d8352 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -265,8 +265,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	    ipv4.sysctl_tcp_fastopen);
 	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
 	struct tcp_fastopen_context *ctxt;
-	int ret;
 	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+	__le32 key[4];
+	int ret, i;
 
 	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
 	if (!tbl.data)
@@ -275,11 +276,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 	rcu_read_lock();
 	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
 	if (ctxt)
-		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+		memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
 	else
-		memset(user_key, 0, sizeof(user_key));
+		memset(key, 0, sizeof(key));
 	rcu_read_unlock();
 
+	for (i = 0; i < ARRAY_SIZE(key); i++)
+		user_key[i] = le32_to_cpu(key[i]);
+
 	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
 		user_key[0], user_key[1], user_key[2], user_key[3]);
 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
@@ -290,13 +294,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
-		tcp_fastopen_reset_cipher(net, NULL, user_key,
+
+		for (i = 0; i < ARRAY_SIZE(user_key); i++)
+			key[i] = cpu_to_le32(user_key[i]);
+
+		tcp_fastopen_reset_cipher(net, NULL, key,
 					  TCP_FASTOPEN_KEY_LENGTH);
 	}
 
 bad_key:
 	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
-	       user_key[0], user_key[1], user_key[2], user_key[3],
+		user_key[0], user_key[1], user_key[2], user_key[3],
 	       (char *)tbl.data, ret);
 	kfree(tbl.data);
 	return ret;
-- 
cgit v1.2.3


From d14b56f508ad70eca3e659545aab3c45200f258c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 28 Jun 2018 17:53:06 +0200
Subject: net: cleanup gfp mask in alloc_skb_with_frags

alloc_skb_with_frags uses __GFP_NORETRY for non-sleeping allocations
which is just a noop and a little bit confusing.

__GFP_NORETRY was added by ed98df3361f0 ("net: use __GFP_NORETRY for
high order allocations") to prevent from the OOM killer. Yet this was
not enough because fb05e7a89f50 ("net: don't wait for order-3 page
allocation") didn't want an excessive reclaim for non-costly orders
so it made it completely NOWAIT while it preserved __GFP_NORETRY in
place which is now redundant.

Drop the pointless __GFP_NORETRY because this function is used as
copy&paste source for other places.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..eba8dae22c25 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5276,8 +5276,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 			if (npages >= 1 << order) {
 				page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
 						   __GFP_COMP |
-						   __GFP_NOWARN |
-						   __GFP_NORETRY,
+						   __GFP_NOWARN,
 						   order);
 				if (page)
 					goto fill_page;
-- 
cgit v1.2.3


From e7c7faa936680a2f0e78fc6d9683a5728421a150 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 28 Jun 2018 13:36:55 -0700
Subject: net/ipv6: Fix updates to prefix route

Sowmini reported that a recent commit broke prefix routes for linklocal
addresses. The newly added modify_prefix_route is attempting to add a
new prefix route when the ifp priority does not match the route metric
however the check needs to account for the default priority. In addition,
the route add fails because the route already exists, and then the delete
removes the one that exists. Flip the order to do the delete first.

Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes")
Reported-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Tested-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c134286d6a41..91580c62bb86 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4528,6 +4528,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 			       unsigned long expires, u32 flags)
 {
 	struct fib6_info *f6i;
+	u32 prio;
 
 	f6i = addrconf_get_prefix_route(&ifp->addr,
 					ifp->prefix_len,
@@ -4536,13 +4537,15 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 	if (!f6i)
 		return -ENOENT;
 
-	if (f6i->fib6_metric != ifp->rt_priority) {
+	prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
+	if (f6i->fib6_metric != prio) {
+		/* delete old one */
+		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+
 		/* add new one */
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
 				      ifp->rt_priority, ifp->idev->dev,
 				      expires, flags, GFP_KERNEL);
-		/* delete old one */
-		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
 	} else {
 		if (!expires)
 			fib6_clean_expires(f6i);
-- 
cgit v1.2.3


From 3f76df198288ceec92fc9eddecad1e73c52769b0 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 29 Jun 2018 13:42:48 -0700
Subject: net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN

As noticed by Eric, we need to switch to the helper
dev_change_tx_queue_len() for SIOCSIFTXQLEN call path too,
otheriwse still miss dev_qdisc_change_tx_queue_len().

Fixes: 6a643ddb5624 ("net: introduce helper dev_change_tx_queue_len()")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_ioctl.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index a04e1e88bf3a..50537ff961a7 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -285,16 +285,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		if (ifr->ifr_qlen < 0)
 			return -EINVAL;
 		if (dev->tx_queue_len ^ ifr->ifr_qlen) {
-			unsigned int orig_len = dev->tx_queue_len;
-
-			dev->tx_queue_len = ifr->ifr_qlen;
-			err = call_netdevice_notifiers(
-					NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-			err = notifier_to_errno(err);
-			if (err) {
-				dev->tx_queue_len = orig_len;
+			err = dev_change_tx_queue_len(dev, ifr->ifr_qlen);
+			if (err)
 				return err;
-			}
 		}
 		return 0;
 
-- 
cgit v1.2.3


From 35e8c7ba0863acadd4b9badd09740af7a8331125 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 29 Jun 2018 14:32:15 -0700
Subject: net: fib_rules: bring back rule_exists to match rule during add

After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule
delrule msgs into fib_nl2rule"), rule_exists got replaced by rule_find
for existing rule lookup in both the add and del paths. While this
is good for the delete path, it solves a few problems but opens up
a few invalid key matches in the add path.

$ip -4 rule add table main tos 10 fwmark 1
$ip -4 rule add table main tos 10
RTNETLINK answers: File exists

The problem here is rule_find does not check if the key masks in
the new and old rule are the same and hence ends up matching a more
secific rule. Rule key masks cannot be easily compared today without
an elaborate if-else block. Its best to introduce key masks for easier
and accurate rule comparison in the future. Until then, due to fear of
regressions this patch re-introduces older loose rule_exists during add.
Also fixes both rule_exists and rule_find to cover missing attributes.

Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bc8425d81022..f64aa13811ea 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -444,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
 		if (rule->ip_proto && r->ip_proto != rule->ip_proto)
 			continue;
 
+		if (rule->proto && r->proto != rule->proto)
+			continue;
+
 		if (fib_rule_port_range_set(&rule->sport_range) &&
 		    !fib_rule_port_range_compare(&r->sport_range,
 						 &rule->sport_range))
@@ -653,6 +656,73 @@ errout:
 	return err;
 }
 
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+		       struct nlattr **tb, struct fib_rule *rule)
+{
+	struct fib_rule *r;
+
+	list_for_each_entry(r, &ops->rules_list, list) {
+		if (r->action != rule->action)
+			continue;
+
+		if (r->table != rule->table)
+			continue;
+
+		if (r->pref != rule->pref)
+			continue;
+
+		if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+			continue;
+
+		if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+			continue;
+
+		if (r->mark != rule->mark)
+			continue;
+
+		if (r->suppress_ifgroup != rule->suppress_ifgroup)
+			continue;
+
+		if (r->suppress_prefixlen != rule->suppress_prefixlen)
+			continue;
+
+		if (r->mark_mask != rule->mark_mask)
+			continue;
+
+		if (r->tun_id != rule->tun_id)
+			continue;
+
+		if (r->fr_net != rule->fr_net)
+			continue;
+
+		if (r->l3mdev != rule->l3mdev)
+			continue;
+
+		if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+		    !uid_eq(r->uid_range.end, rule->uid_range.end))
+			continue;
+
+		if (r->ip_proto != rule->ip_proto)
+			continue;
+
+		if (r->proto != rule->proto)
+			continue;
+
+		if (!fib_rule_port_range_compare(&r->sport_range,
+						 &rule->sport_range))
+			continue;
+
+		if (!fib_rule_port_range_compare(&r->dport_range,
+						 &rule->dport_range))
+			continue;
+
+		if (!ops->compare(r, frh, tb))
+			continue;
+		return 1;
+	}
+	return 0;
+}
+
 int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		   struct netlink_ext_ack *extack)
 {
@@ -687,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout;
 
 	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
-	    rule_find(ops, frh, tb, rule, user_priority)) {
+	    rule_exists(ops, frh, tb, rule)) {
 		err = -EEXIST;
 		goto errout_free;
 	}
-- 
cgit v1.2.3


From 1236f22fbae15df3736ab4a984c64c0c6ee6254c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Fri, 29 Jun 2018 13:07:53 +0300
Subject: tcp: prevent bogus FRTO undos with non-SACK flows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If SACK is not enabled and the first cumulative ACK after the RTO
retransmission covers more than the retransmitted skb, a spurious
FRTO undo will trigger (assuming FRTO is enabled for that RTO).
The reason is that any non-retransmitted segment acknowledged will
set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is
no indication that it would have been delivered for real (the
scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK
case so the check for that bit won't help like it does with SACK).
Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo
in tcp_process_loss.

We need to use more strict condition for non-SACK case and check
that none of the cumulatively ACKed segments were retransmitted
to prove that progress is due to original transmissions. Only then
keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in
non-SACK case.

(FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS
to better indicate its purpose but to keep this change minimal, it
will be done in another patch).

Besides burstiness and congestion control violations, this problem
can result in RTO loop: When the loss recovery is prematurely
undoed, only new data will be transmitted (if available) and
the next retransmission can occur only after a new RTO which in case
of multiple losses (that are not for consecutive packets) requires
one RTO per loss to recover.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Tested-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 045d930d01a9..8e5522c6833a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
+
+			/* If any of the cumulatively ACKed segments was
+			 * retransmitted, non-SACK case cannot confirm that
+			 * progress was due to original transmission due to
+			 * lack of TCPCB_SACKED_ACKED bits even if some of
+			 * the packets may have been never retransmitted.
+			 */
+			if (flag & FLAG_RETRANS_DATA_ACKED)
+				flag &= ~FLAG_ORIG_SACK_ACKED;
 		} else {
 			int delta;
 
-- 
cgit v1.2.3


From 603d4cf8fe095b1ee78f423d514427be507fb513 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Sat, 30 Jun 2018 17:38:55 +0200
Subject: net: fix use-after-free in GRO with ESP

Since the addition of GRO for ESP, gro_receive can consume the skb and
return -EINPROGRESS. In that case, the lower layer GRO handler cannot
touch the skb anymore.

Commit 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") converted
some of the gro_receive handlers that can lead to ESP's gro_receive so
that they wouldn't access the skb when -EINPROGRESS is returned, but
missed other spots, mainly in tunneling protocols.

This patch finishes the conversion to using skb_gro_flush_final(), and
adds a new helper, skb_gro_flush_final_remcsum(), used in VXLAN and
GUE.

Fixes: 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c      |  2 +-
 drivers/net/vxlan.c       |  4 +---
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/8021q/vlan.c          |  2 +-
 net/ipv4/fou.c            |  4 +---
 net/ipv4/gre_offload.c    |  2 +-
 net/ipv4/udp_offload.c    |  2 +-
 7 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 750eaa53bf0c..ada33c2d9ac2 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -476,7 +476,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aee0e60471f1..f6bb1d54d4bd 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -623,9 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk,
 	flush = 0;
 
 out:
-	skb_gro_remcsum_cleanup(skb, &grc);
-	skb->remcsum_offload = 0;
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
 
 	return pp;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..3d0cc0b5cec2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2789,11 +2789,31 @@ static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp,
 	if (PTR_ERR(pp) != -EINPROGRESS)
 		NAPI_GRO_CB(skb)->flush |= flush;
 }
+static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
+					       struct sk_buff **pp,
+					       int flush,
+					       struct gro_remcsum *grc)
+{
+	if (PTR_ERR(pp) != -EINPROGRESS) {
+		NAPI_GRO_CB(skb)->flush |= flush;
+		skb_gro_remcsum_cleanup(skb, grc);
+		skb->remcsum_offload = 0;
+	}
+}
 #else
 static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
 {
 	NAPI_GRO_CB(skb)->flush |= flush;
 }
+static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
+					       struct sk_buff **pp,
+					       int flush,
+					       struct gro_remcsum *grc)
+{
+	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_remcsum_cleanup(skb, grc);
+	skb->remcsum_offload = 0;
+}
 #endif
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..8ccee3d01822 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -693,7 +693,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..c9ec1603666b 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -448,9 +448,7 @@ next_proto:
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
-	skb_gro_remcsum_cleanup(skb, &grc);
-	skb->remcsum_offload = 0;
+	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
 
 	return pp;
 }
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1859c473b21a..6a7d980105f6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 
 	return pp;
 }
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92dc9e5a7ff3..69c54540d5b4 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -394,7 +394,7 @@ unflush:
 out_unlock:
 	rcu_read_unlock();
 out:
-	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_flush_final(skb, pp, flush);
 	return pp;
 }
 EXPORT_SYMBOL(udp_gro_receive);
-- 
cgit v1.2.3


From fc9c2029e37c3ae9efc28bf47045e0b87e09660c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 30 Jun 2018 15:26:56 -0700
Subject: ipv6: sr: fix passing wrong flags to crypto_alloc_shash()

The 'mask' argument to crypto_alloc_shash() uses the CRYPTO_ALG_* flags,
not 'gfp_t'.  So don't pass GFP_KERNEL to it.

Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_hmac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac1..558fe8cc6d43 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -373,7 +373,7 @@ static int seg6_hmac_init_algo(void)
 			return -ENOMEM;
 
 		for_each_possible_cpu(cpu) {
-			tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+			tfm = crypto_alloc_shash(algo->name, 0, 0);
 			if (IS_ERR(tfm))
 				return PTR_ERR(tfm);
 			p_tfm = per_cpu_ptr(algo->tfms, cpu);
-- 
cgit v1.2.3


From 20b52a75166086a40d838397ef3db28a4f2c5998 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 29 Jun 2018 09:48:17 +0200
Subject: xsk: fix potential lost completion message in SKB path

The code in xskq_produce_addr erroneously checked if there
was up to LAZY_UPDATE_THRESHOLD amount of space in the completion
queue. It only needs to check if there is one slot left in the
queue. This bug could under some circumstances lead to a WARN_ON_ONCE
being triggered and the completion message to user space being lost.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Reported-by: Pavel Odintsov <pavel@fastnetmon.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/xdp/xsk_queue.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index ef6a6f0ec949..52ecaf770642 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 	return (entries > dcnt) ? dcnt : entries;
 }
 
-static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
-{
-	return q->nentries - (producer - q->cons_tail);
-}
-
 static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 {
-	u32 free_entries = xskq_nb_free_lazy(q, producer);
+	u32 free_entries = q->nentries - (producer - q->cons_tail);
 
 	if (free_entries >= dcnt)
 		return free_entries;
@@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-	if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+	if (xskq_nb_free(q, q->prod_tail, 1) == 0)
 		return -ENOSPC;
 
 	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
-- 
cgit v1.2.3


From fe5886852601fb2593cbc5a7549ef9fd2ef481ba Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 29 Jun 2018 09:48:18 +0200
Subject: xsk: frame could be completed more than once in SKB path

Fixed a bug in which a frame could be completed more than once
when an error was returned from dev_direct_xmit(). The code
erroneously retried sending the message leading to multiple
calls to the SKB destructor and therefore multiple completions
of the same buffer to user space.

The error code in this case has been changed from EAGAIN to EBUSY
in order to tell user space that the sending of the packet failed
and the buffer has been return to user space through the completion
queue.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Reported-by: Pavel Odintsov <pavel@fastnetmon.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/xdp/xsk.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 59fb7d3c36a3..15aca73805fc 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -268,15 +268,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		skb->destructor = xsk_destruct_skb;
 
 		err = dev_direct_xmit(skb, xs->queue_id);
+		xskq_discard_desc(xs->tx);
 		/* Ignore NET_XMIT_CN as packet might have been sent */
 		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
-			err = -EAGAIN;
-			/* SKB consumed by dev_direct_xmit() */
+			/* SKB completed but not sent */
+			err = -EBUSY;
 			goto out;
 		}
 
 		sent_frame = true;
-		xskq_discard_desc(xs->tx);
 	}
 
 out:
-- 
cgit v1.2.3


From a9744f7ca200c756e6f8c65b633770a2da711651 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Fri, 29 Jun 2018 09:48:20 +0200
Subject: xsk: fix potential race in SKB TX completion code

There is a potential race in the TX completion code for the SKB
case. One process enters the sendmsg code of an AF_XDP socket in order
to send a frame. The execution eventually trickles down to the driver
that is told to send the packet. However, it decides to drop the
packet due to some error condition (e.g., rings full) and frees the
SKB. This will trigger the SKB destructor and a completion will be
sent to the AF_XDP user space through its
single-producer/single-consumer queues.

At the same time a TX interrupt has fired on another core and it
dispatches the TX completion code in the driver. It does its HW
specific things and ends up freeing the SKB associated with the
transmitted packet. This will trigger the SKB destructor and a
completion will be sent to the AF_XDP user space through its
single-producer/single-consumer queues. With a pseudo call stack, it
would look like this:

Core 1:
sendmsg() being called in the application
  netdev_start_xmit()
    Driver entered through ndo_start_xmit
      Driver decides to free the SKB for some reason (e.g., rings full)
        Destructor of SKB called
          xskq_produce_addr() is called to signal completion to user space

Core 2:
TX completion irq
  NAPI loop
    Driver irq handler for TX completions
      Frees the SKB
        Destructor of SKB called
          xskq_produce_addr() is called to signal completion to user space

We now have a violation of the single-producer/single-consumer
principle for our queues as there are two threads trying to produce at
the same time on the same queue.

Fixed by introducing a spin_lock in the destructor. In regards to the
performance, I get around 1.74 Mpps for txonly before and after the
introduction of the spinlock. There is of course some impact due to
the spin lock but it is in the less significant digits that are too
noisy for me to measure. But let us say that the version without the
spin lock got 1.745 Mpps in the best case and the version with 1.735
Mpps in the worst case, then that would mean a maximum drop in
performance of 0.5%.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h | 4 ++++
 net/xdp/xsk.c          | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'net')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 9fe472f2ac95..7161856bcf9c 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -60,6 +60,10 @@ struct xdp_sock {
 	bool zc;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
+	/* Mutual exclusion of NAPI TX thread and sendmsg error paths
+	 * in the SKB destructor callback.
+	 */
+	spinlock_t tx_completion_lock;
 	u64 rx_dropped;
 };
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 15aca73805fc..7d220cbd09b6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 	struct xdp_sock *xs = xdp_sk(skb->sk);
+	unsigned long flags;
 
+	spin_lock_irqsave(&xs->tx_completion_lock, flags);
 	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
 
 	sock_wfree(skb);
 }
@@ -755,6 +758,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 
 	xs = xdp_sk(sk);
 	mutex_init(&xs->mutex);
+	spin_lock_init(&xs->tx_completion_lock);
 
 	local_bh_disable();
 	sock_prot_inuse_add(net, &xsk_proto, 1);
-- 
cgit v1.2.3


From 19103a4bfb42f320395daa5616ece3e89e759d63 Mon Sep 17 00:00:00 2001
From: "mpubbise@codeaurora.org" <mpubbise@codeaurora.org>
Date: Mon, 2 Jul 2018 15:40:14 +0530
Subject: mac80211: add stations tied to AP_VLANs during hw reconfig

As part of hw reconfig, only stations linked to AP interfaces are added
back to the driver ignoring those which are tied to AP_VLAN interfaces.

It is true that there could be stations tied to the AP_VLAN interface while
serving 4addr clients or when using AP_VLAN for VLAN operations; we should
be adding these stations back to the driver as part of hw reconfig, failing
to do so can cause functional issues.

In the case of ath10k driver, the following errors were observed.

ath10k_pci : failed to install key for non-existent peer XX:XX:XX:XX:XX:XX
Workqueue: events_freezable ieee80211_restart_work [mac80211]
(unwind_backtrace) from (show_stack+0x10/0x14)
(show_stack) (dump_stack+0x80/0xa0)
(dump_stack) (warn_slowpath_common+0x68/0x8c)
(warn_slowpath_common) (warn_slowpath_null+0x18/0x20)
(warn_slowpath_null) (ieee80211_enable_keys+0x88/0x154 [mac80211])
(ieee80211_enable_keys) (ieee80211_reconfig+0xc90/0x19c8 [mac80211])
(ieee80211_reconfig]) (ieee80211_restart_work+0x8c/0xa0 [mac80211])
(ieee80211_restart_work) (process_one_work+0x284/0x488)
(process_one_work) (worker_thread+0x228/0x360)
(worker_thread) (kthread+0xd8/0xec)
(kthread) (ret_from_fork+0x14/0x24)

Also while bringing down the AP VAP, WARN_ONs and errors related to peer
removal were observed.

ath10k_pci : failed to clear all peer wep keys for vdev 0: -2
ath10k_pci : failed to disassociate station: 8c:fd:f0:0a:8c:f5 vdev 0: -2
(unwind_backtrace) (show_stack+0x10/0x14)
(show_stack) (dump_stack+0x80/0xa0)
(dump_stack) (warn_slowpath_common+0x68/0x8c)
(warn_slowpath_common) (warn_slowpath_null+0x18/0x20)
(warn_slowpath_null) (sta_set_sinfo+0xb98/0xc9c [mac80211])
(sta_set_sinfo [mac80211]) (__sta_info_flush+0xf0/0x134 [mac80211])
(__sta_info_flush [mac80211]) (ieee80211_stop_ap+0xe8/0x390 [mac80211])
(ieee80211_stop_ap [mac80211]) (__cfg80211_stop_ap+0xe0/0x3dc [cfg80211])
(__cfg80211_stop_ap [cfg80211]) (cfg80211_stop_ap+0x30/0x44 [cfg80211])
(cfg80211_stop_ap [cfg80211]) (genl_rcv_msg+0x274/0x30c)
(genl_rcv_msg) (netlink_rcv_skb+0x58/0xac)
(netlink_rcv_skb) (genl_rcv+0x20/0x34)
(genl_rcv) (netlink_unicast+0x11c/0x204)
(netlink_unicast) (netlink_sendmsg+0x30c/0x370)
(netlink_sendmsg) (sock_sendmsg+0x70/0x84)
(sock_sendmsg) (___sys_sendmsg.part.3+0x188/0x228)
(___sys_sendmsg.part.3) (__sys_sendmsg+0x4c/0x70)
(__sys_sendmsg) (ret_fast_syscall+0x0/0x44)

These issues got fixed by adding the stations which are
tied to AP_VLANs back to the driver.

Signed-off-by: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/util.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 5e2e511c4a6f..d02fbfec3783 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2111,7 +2111,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		if (!sta->uploaded)
 			continue;
 
-		if (sta->sdata->vif.type != NL80211_IFTYPE_AP)
+		if (sta->sdata->vif.type != NL80211_IFTYPE_AP &&
+		    sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
 			continue;
 
 		for (state = IEEE80211_STA_NOTEXIST;
-- 
cgit v1.2.3


From 52ee6ef36ee10dd493cf2067311e56ca8015eb8d Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes <doronrk@fb.com>
Date: Mon, 2 Jul 2018 10:25:05 -0700
Subject: tls: fix skb_to_sgvec returning unhandled error.

The current code does not inspect the return value of skb_to_sgvec. This
can cause a nullptr kernel panic when the malformed sgvec is passed into
the crypto request.

Checking the return value of skb_to_sgvec and skipping decryption if it
is negative fixes this problem.

Fixes: c46234ebb4d1 ("tls: RX path for ktls")
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index d2380548f8f6..7818011fd250 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -701,6 +701,10 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 	nsg = skb_to_sgvec(skb, &sgin[1],
 			   rxm->offset + tls_ctx->rx.prepend_size,
 			   rxm->full_len - tls_ctx->rx.prepend_size);
+	if (nsg < 0) {
+		ret = nsg;
+		goto out;
+	}
 
 	tls_make_aad(ctx->rx_aad_ciphertext,
 		     rxm->full_len - tls_ctx->rx.overhead_size,
@@ -712,6 +716,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 				rxm->full_len - tls_ctx->rx.overhead_size,
 				skb, sk->sk_allocation);
 
+out:
 	if (sgin != &sgin_arr[0])
 		kfree(sgin);
 
-- 
cgit v1.2.3


From 410da1e12ffed61129d61df5b7adce4d08c7f17c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 3 Jul 2018 09:53:43 -0700
Subject: net/smc: fix up merge error with poll changes

My networking merge (commit 4e33d7d47943: "Pull networking fixes from
David Miller") got the poll() handling conflict wrong for af_smc.

The conflict between my a11e1d432b51 ("Revert changes to convert to
->poll_mask() and aio IOCB_CMD_POLL") and Ursula Braun's 24ac3a08e658
("net/smc: rebuild nonblocking connect") should have left the call to
sock_poll_wait() in place, just without the socket lock release/retake.

And I really should have realized that.  But happily, I at least asked
Ursula to double-check the merge, and she set me right.

This also fixes an incidental whitespace issue nearby that annoyed me
while looking at this.

Pointed-out-by: Ursula Braun <ubraun@linux.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/smc/af_smc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e017b6a4452b..3c1405df936c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1345,6 +1345,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 	} else {
+		if (sk->sk_state != SMC_CLOSED)
+			sock_poll_wait(file, sk_sleep(sk), wait);
 		if (sk->sk_err)
 			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1370,7 +1372,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 		}
 		if (smc->conn.urg_state == SMC_URG_VALID)
 			mask |= EPOLLPRI;
-
 	}
 
 	return mask;
-- 
cgit v1.2.3


From d5a672ac9f48f81b20b1cad1d9ed7bbf4e418d4c Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Mon, 2 Jul 2018 22:52:20 +0200
Subject: gen_stats: Fix netlink stats dumping in the presence of padding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gen_stats facility will add a header for the toplevel nlattr of type
TCA_STATS2 that contains all stats added by qdisc callbacks. A reference
to this header is stored in the gnet_dump struct, and when all the
per-qdisc callbacks have finished adding their stats, the length of the
containing header will be adjusted to the right value.

However, on architectures that need padding (i.e., that don't set
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS), the padding nlattr is added
before the stats, which means that the stored pointer will point to the
padding, and so when the header is fixed up, the result is just a very
big padding nlattr. Because most qdiscs also supply the legacy TCA_STATS
struct, this problem has been mostly invisible, but we exposed it with
the netlink attribute-based statistics in CAKE.

Fix the issue by fixing up the stored pointer if it points to a padding
nlattr.

Tested-by: Pete Heist <pete@heistp.net>
Tested-by: Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_stats.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index b2b2323bdc84..188d693cb251 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -77,8 +77,20 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 		d->lock = lock;
 		spin_lock_bh(lock);
 	}
-	if (d->tail)
-		return gnet_stats_copy(d, type, NULL, 0, padattr);
+	if (d->tail) {
+		int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
+
+		/* The initial attribute added in gnet_stats_copy() may be
+		 * preceded by a padding attribute, in which case d->tail will
+		 * end up pointing at the padding instead of the real attribute.
+		 * Fix this so gnet_stats_finish_copy() adjusts the length of
+		 * the right attribute.
+		 */
+		if (ret == 0 && d->tail->nla_type == padattr)
+			d->tail = (struct nlattr *)((char *)d->tail +
+						    NLA_ALIGN(d->tail->nla_len));
+		return ret;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 3 Jul 2018 14:36:21 -0700
Subject: net/ipv6: Revert attempt to simplify route replace and append

NetworkManager likes to manage linklocal prefix routes and does so with
the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route
code and by extension enable multipath routes with device only nexthops.

Revert f34436a43092 and these followup patches:
6eba08c3626b ("ipv6: Only emit append events for appended routes").
ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic")
53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes")

Update the fib_tests cases to reflect the old behavior.

Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  48 +++----
 include/net/ip6_route.h                            |   6 +
 net/ipv6/ip6_fib.c                                 | 156 ++++++++++++---------
 net/ipv6/route.c                                   |   3 +-
 tools/testing/selftests/net/fib_tests.sh           |  41 ------
 5 files changed, 117 insertions(+), 137 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 6aaaf3d9ba31..77b2adb29341 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4756,6 +4756,12 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 	kfree(mlxsw_sp_rt6);
 }
 
+static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
+{
+	/* RTF_CACHE routes are ignored */
+	return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
+}
+
 static struct fib6_info *
 mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 {
@@ -4765,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
-				 const struct fib6_info *nrt, bool append)
+				 const struct fib6_info *nrt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 
-	if (!append)
+	if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace)
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
@@ -4784,7 +4790,8 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			break;
 		if (rt->fib6_metric < nrt->fib6_metric)
 			continue;
-		if (rt->fib6_metric == nrt->fib6_metric)
+		if (rt->fib6_metric == nrt->fib6_metric &&
+		    mlxsw_sp_fib6_rt_can_mp(rt))
 			return fib6_entry;
 		if (rt->fib6_metric > nrt->fib6_metric)
 			break;
@@ -5163,7 +5170,7 @@ static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			      const struct fib6_info *nrt, bool replace)
 {
-	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
 		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
@@ -5172,13 +5179,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 			continue;
 		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
 			break;
-		if (replace && rt->fib6_metric == nrt->fib6_metric)
-			return fib6_entry;
+		if (replace && rt->fib6_metric == nrt->fib6_metric) {
+			if (mlxsw_sp_fib6_rt_can_mp(rt) ==
+			    mlxsw_sp_fib6_rt_can_mp(nrt))
+				return fib6_entry;
+			if (mlxsw_sp_fib6_rt_can_mp(nrt))
+				fallback = fallback ?: fib6_entry;
+		}
 		if (rt->fib6_metric > nrt->fib6_metric)
-			return fib6_entry;
+			return fallback ?: fib6_entry;
 	}
 
-	return NULL;
+	return fallback;
 }
 
 static int
@@ -5304,8 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
-				    struct fib6_info *rt, bool replace,
-				    bool append)
+				    struct fib6_info *rt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5331,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 	/* Before creating a new entry, try to append route to an existing
 	 * multipath entry.
 	 */
-	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append);
+	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace);
 	if (fib6_entry) {
 		err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt);
 		if (err)
@@ -5339,14 +5350,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 		return 0;
 	}
 
-	/* We received an append event, yet did not find any route to
-	 * append to.
-	 */
-	if (WARN_ON(append)) {
-		err = -EINVAL;
-		goto err_fib6_entry_append;
-	}
-
 	fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt);
 	if (IS_ERR(fib6_entry)) {
 		err = PTR_ERR(fib6_entry);
@@ -5364,7 +5367,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 err_fib6_node_entry_link:
 	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
 err_fib6_entry_create:
-err_fib6_entry_append:
 err_fib6_entry_nexthop_add:
 	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
 	return err;
@@ -5715,7 +5717,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	struct mlxsw_sp_fib_event_work *fib_work =
 		container_of(work, struct mlxsw_sp_fib_event_work, work);
 	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
-	bool replace, append;
+	bool replace;
 	int err;
 
 	rtnl_lock();
@@ -5726,10 +5728,8 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	case FIB_EVENT_ENTRY_APPEND: /* fall through */
 	case FIB_EVENT_ENTRY_ADD:
 		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
-		append = fib_work->event == FIB_EVENT_ENTRY_APPEND;
 		err = mlxsw_sp_router_fib6_add(mlxsw_sp,
-					       fib_work->fen6_info.rt, replace,
-					       append);
+					       fib_work->fen6_info.rt, replace);
 		if (err)
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
 		mlxsw_sp_rt6_release(fib_work->fen6_info.rt);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 59656fc580df..7b9c82de11cc 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
+{
+	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	       RTF_GATEWAY;
+}
+
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1fb2f3118d60..d212738e9d10 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 {
 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
-	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
-	struct fib6_info *iter = NULL, *match = NULL;
+	struct fib6_info *iter = NULL;
 	struct fib6_info __rcu **ins;
+	struct fib6_info __rcu **fallback_ins = NULL;
 	int replace = (info->nlh &&
 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
-	int append = (info->nlh &&
-		       (info->nlh->nlmsg_flags & NLM_F_APPEND));
 	int add = (!info->nlh ||
 		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
 	int found = 0;
+	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
 	u16 nlflags = NLM_F_EXCL;
 	int err;
 
-	if (append)
+	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
 		nlflags |= NLM_F_APPEND;
 
 	ins = &fn->leaf;
@@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
 			nlflags &= ~NLM_F_EXCL;
 			if (replace) {
-				found++;
-				break;
+				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+					found++;
+					break;
+				}
+				if (rt_can_ecmp)
+					fallback_ins = fallback_ins ?: ins;
+				goto next_iter;
 			}
 
 			if (rt6_duplicate_nexthop(iter, rt)) {
@@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 				fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
 				return -EEXIST;
 			}
-
-			/* first route that matches */
-			if (!match)
-				match = iter;
+			/* If we have the same destination and the same metric,
+			 * but not the same gateway, then the route we try to
+			 * add is sibling to this route, increment our counter
+			 * of siblings, and later we will add our route to the
+			 * list.
+			 * Only static routes (which don't have flag
+			 * RTF_EXPIRES) are used for ECMPv6.
+			 *
+			 * To avoid long list, we only had siblings if the
+			 * route have a gateway.
+			 */
+			if (rt_can_ecmp &&
+			    rt6_qualify_for_ecmp(iter))
+				rt->fib6_nsiblings++;
 		}
 
 		if (iter->fib6_metric > rt->fib6_metric)
 			break;
 
+next_iter:
 		ins = &iter->fib6_next;
 	}
 
+	if (fallback_ins && !found) {
+		/* No ECMP-able route found, replace first non-ECMP one */
+		ins = fallback_ins;
+		iter = rcu_dereference_protected(*ins,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+		found++;
+	}
+
 	/* Reset round-robin state, if necessary */
 	if (ins == &fn->leaf)
 		fn->rr_ptr = NULL;
 
 	/* Link this route to others same route. */
-	if (append && match) {
+	if (rt->fib6_nsiblings) {
+		unsigned int fib6_nsiblings;
 		struct fib6_info *sibling, *temp_sibling;
 
-		if (rt->fib6_flags & RTF_REJECT) {
-			NL_SET_ERR_MSG(extack,
-				       "Can not append a REJECT route");
-			return -EINVAL;
-		} else if (match->fib6_flags & RTF_REJECT) {
-			NL_SET_ERR_MSG(extack,
-				       "Can not append to a REJECT route");
-			return -EINVAL;
+		/* Find the first route that have the same metric */
+		sibling = leaf;
+		while (sibling) {
+			if (sibling->fib6_metric == rt->fib6_metric &&
+			    rt6_qualify_for_ecmp(sibling)) {
+				list_add_tail(&rt->fib6_siblings,
+					      &sibling->fib6_siblings);
+				break;
+			}
+			sibling = rcu_dereference_protected(sibling->fib6_next,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 		}
-		event = FIB_EVENT_ENTRY_APPEND;
-		rt->fib6_nsiblings = match->fib6_nsiblings;
-		list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
-		match->fib6_nsiblings++;
-
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
 		 */
+		fib6_nsiblings = 0;
 		list_for_each_entry_safe(sibling, temp_sibling,
-					 &match->fib6_siblings, fib6_siblings) {
+					 &rt->fib6_siblings, fib6_siblings) {
 			sibling->fib6_nsiblings++;
-			BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
+			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+			fib6_nsiblings++;
 		}
-
-		rt6_multipath_rebalance(match);
+		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+		rt6_multipath_rebalance(temp_sibling);
 	}
 
 	/*
@@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 add:
 		nlflags |= NLM_F_CREATE;
 
-		err = call_fib6_entry_notifiers(info->nl_net, event, rt,
-						extack);
+		err = call_fib6_entry_notifiers(info->nl_net,
+						FIB_EVENT_ENTRY_ADD,
+						rt, extack);
 		if (err)
 			return err;
 
@@ -1062,7 +1087,7 @@ add:
 		}
 
 	} else {
-		struct fib6_info *tmp;
+		int nsiblings;
 
 		if (!found) {
 			if (add)
@@ -1077,57 +1102,48 @@ add:
 		if (err)
 			return err;
 
-		/* if route being replaced has siblings, set tmp to
-		 * last one, otherwise tmp is current route. this is
-		 * used to set fib6_next for new route
-		 */
-		if (iter->fib6_nsiblings)
-			tmp = list_last_entry(&iter->fib6_siblings,
-					      struct fib6_info,
-					      fib6_siblings);
-		else
-			tmp = iter;
-
-		/* insert new route */
 		atomic_inc(&rt->fib6_ref);
 		rcu_assign_pointer(rt->fib6_node, fn);
-		rt->fib6_next = tmp->fib6_next;
+		rt->fib6_next = iter->fib6_next;
 		rcu_assign_pointer(*ins, rt);
-
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
+		nsiblings = iter->fib6_nsiblings;
+		iter->fib6_node = NULL;
+		fib6_purge_rt(iter, fn, info->nl_net);
+		if (rcu_access_pointer(fn->rr_ptr) == iter)
+			fn->rr_ptr = NULL;
+		fib6_info_release(iter);
 
-		/* delete old route */
-		rt = iter;
-
-		if (rt->fib6_nsiblings) {
-			struct fib6_info *tmp;
-
+		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
-			list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
-						 fib6_siblings) {
-				iter->fib6_node = NULL;
-				fib6_purge_rt(iter, fn, info->nl_net);
-				if (rcu_access_pointer(fn->rr_ptr) == iter)
-					fn->rr_ptr = NULL;
-				fib6_info_release(iter);
-
-				rt->fib6_nsiblings--;
-				info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+			ins = &rt->fib6_next;
+			iter = rcu_dereference_protected(*ins,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+			while (iter) {
+				if (iter->fib6_metric > rt->fib6_metric)
+					break;
+				if (rt6_qualify_for_ecmp(iter)) {
+					*ins = iter->fib6_next;
+					iter->fib6_node = NULL;
+					fib6_purge_rt(iter, fn, info->nl_net);
+					if (rcu_access_pointer(fn->rr_ptr) == iter)
+						fn->rr_ptr = NULL;
+					fib6_info_release(iter);
+					nsiblings--;
+					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+				} else {
+					ins = &iter->fib6_next;
+				}
+				iter = rcu_dereference_protected(*ins,
+					lockdep_is_held(&rt->fib6_table->tb6_lock));
 			}
+			WARN_ON(nsiblings != 0);
 		}
-
-		WARN_ON(rt->fib6_nsiblings != 0);
-
-		rt->fib6_node = NULL;
-		fib6_purge_rt(rt, fn, info->nl_net);
-		if (rcu_access_pointer(fn->rr_ptr) == rt)
-			fn->rr_ptr = NULL;
-		fib6_info_release(rt);
 	}
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 86a0e4333d42..63f99411f0de 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3842,7 +3842,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 			lockdep_is_held(&rt->fib6_table->tb6_lock));
 	while (iter) {
 		if (iter->fib6_metric == rt->fib6_metric &&
-		    iter->fib6_nsiblings)
+		    rt6_qualify_for_ecmp(iter))
 			return iter;
 		iter = rcu_dereference_protected(iter->fib6_next,
 				lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4439,7 +4439,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 		 */
 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
 						     NLM_F_REPLACE);
-		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
 		nhn++;
 	}
 
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 78245d60d8bc..0f45633bd634 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -740,13 +740,6 @@ ipv6_rt_add()
 	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
 	log_test $? 2 "Attempt to add duplicate route - reject route"
 
-	# iproute2 prepend only sets NLM_F_CREATE
-	# - adds a new route; does NOT convert existing route to ECMP
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2"
-	check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
-	log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)"
-
 	# route append with same prefix adds a new route
 	# - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
 	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
@@ -754,27 +747,6 @@ ipv6_rt_add()
 	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
 	log_test $? 0 "Append nexthop to existing route - gw"
 
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
-	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1"
-	log_test $? 0 "Append nexthop to existing route - dev only"
-
-	# multipath route can not have a nexthop that is a reject route
-	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
-	run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64"
-	log_test $? 2 "Append nexthop to existing route - reject route"
-
-	# reject route can not be converted to multipath route
-	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
-	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
-	log_test $? 2 "Append nexthop to existing reject route - gw"
-
-	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
-	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
-	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
-	log_test $? 2 "Append nexthop to existing reject route - dev only"
-
 	# insert mpath directly
 	add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
 	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
@@ -819,13 +791,6 @@ ipv6_rt_replace_single()
 	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
 	log_test $? 0 "Single path with multipath"
 
-	# single path with reject
-	#
-	add_initial_route6 "nexthop via 2001:db8:101::2"
-	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
-	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
-	log_test $? 0 "Single path with reject route"
-
 	# single path with single path using MULTIPATH attribute
 	#
 	add_initial_route6 "via 2001:db8:101::2"
@@ -873,12 +838,6 @@ ipv6_rt_replace_mpath()
 	check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
 	log_test $? 0 "Multipath with single path via multipath attribute"
 
-	# multipath with reject
-	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
-	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
-	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
-	log_test $? 0 "Multipath with reject route"
-
 	# route replace fails - invalid nexthop 1
 	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
 	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"
-- 
cgit v1.2.3


From a65925475571953da12a9bc2082aec29d4e2c0e7 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 3 Jul 2018 16:30:47 +0800
Subject: sctp: fix the issue that pathmtu may be set lower than MINSEGMENT

After commit b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed
for too small MTUs"), sctp_transport_update_pmtu would refetch pathmtu
from the dst and set it to transport's pathmtu without any check.

The new pathmtu may be lower than MINSEGMENT if the dst is obsolete and
updated by .get_dst() in sctp_transport_update_pmtu. In this case, it
could have a smaller MTU as well, and thus we should validate it
against MINSEGMENT instead.

Syzbot reported a warning in sctp_mtu_payload caused by this.

This patch refetches the pathmtu by calling sctp_dst_mtu where it does
the check against MINSEGMENT.

v1->v2:
  - refetch the pathmtu by calling sctp_dst_mtu instead as Marcelo's
    suggestion.

Fixes: b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too small MTUs")
Reported-by: syzbot+f0d9d7cba052f9344b03@syzkaller.appspotmail.com
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 445b7ef61677..12cac85da994 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 
 	if (dst) {
 		/* Re-fetch, as under layers may have a higher minimum size */
-		pmtu = SCTP_TRUNC4(dst_mtu(dst));
+		pmtu = sctp_dst_mtu(dst);
 		change = t->pathmtu != pmtu;
 	}
 	t->pathmtu = pmtu;
-- 
cgit v1.2.3


From b30c122c0bbb0a1dc413085e177ea09467e65fdb Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Mon, 2 Jul 2018 11:21:47 +0200
Subject: ieee802154: 6lowpan: set IFLA_LINK

Otherwise NetworkManager (and iproute alike) is not able to identify the
parent IEEE 802.15.4 interface of a 6LoWPAN link.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Acked-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 net/ieee802154/6lowpan/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 275449b0d633..3297e7fa9945 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -90,12 +90,18 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
 	return 0;
 }
 
+static int lowpan_get_iflink(const struct net_device *dev)
+{
+	return lowpan_802154_dev(dev)->wdev->ifindex;
+}
+
 static const struct net_device_ops lowpan_netdev_ops = {
 	.ndo_init		= lowpan_dev_init,
 	.ndo_start_xmit		= lowpan_xmit,
 	.ndo_open		= lowpan_open,
 	.ndo_stop		= lowpan_stop,
 	.ndo_neigh_construct    = lowpan_neigh_construct,
+	.ndo_get_iflink         = lowpan_get_iflink,
 };
 
 static void lowpan_setup(struct net_device *ldev)
-- 
cgit v1.2.3


From d376bef9c29b3c65aeee4e785fffcd97ef0a9a81 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 4 Jul 2018 20:25:32 +0200
Subject: netfilter: x_tables: set module owner for icmp(6) matches

nft_compat relies on xt_request_find_match to increment
refcount of the module that provides the match/target.

The (builtin) icmp matches did't set the module owner so it
was possible to rmmod ip(6)tables while icmp extensions were still in use.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
 		.checkentry = icmp_checkentry,
 		.proto      = IPPROTO_ICMP,
 		.family     = NFPROTO_IPV4,
+		.me	    = THIS_MODULE,
 	},
 };
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7eab959734bc..daf2e9e9193d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
 		.checkentry = icmp6_checkentry,
 		.proto      = IPPROTO_ICMPV6,
 		.family     = NFPROTO_IPV6,
+		.me	    = THIS_MODULE,
 	},
 };
 
-- 
cgit v1.2.3


From a9ba23d48dbc6ffd08426bb10f05720e0b9f5c14 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Wed, 4 Jul 2018 09:58:05 -0400
Subject: ipv6: make ipv6_renew_options() interrupt/kernel safe

At present the ipv6_renew_options_kern() function ends up calling into
access_ok() which is problematic if done from inside an interrupt as
access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures
(x86-64 is affected).  Example warning/backtrace is shown below:

 WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90
 ...
 Call Trace:
  <IRQ>
  ipv6_renew_option+0xb2/0xf0
  ipv6_renew_options+0x26a/0x340
  ipv6_renew_options_kern+0x2c/0x40
  calipso_req_setattr+0x72/0xe0
  netlbl_req_setattr+0x126/0x1b0
  selinux_netlbl_inet_conn_request+0x80/0x100
  selinux_inet_conn_request+0x6d/0xb0
  security_inet_conn_request+0x32/0x50
  tcp_conn_request+0x35f/0xe00
  ? __lock_acquire+0x250/0x16c0
  ? selinux_socket_sock_rcv_skb+0x1ae/0x210
  ? tcp_rcv_state_process+0x289/0x106b
  tcp_rcv_state_process+0x289/0x106b
  ? tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_rcv+0xc82/0xcf0
  ip6_input_finish+0x10d/0x690
  ip6_input+0x45/0x1e0
  ? ip6_rcv_finish+0x1d0/0x1d0
  ipv6_rcv+0x32b/0x880
  ? ip6_make_skb+0x1e0/0x1e0
  __netif_receive_skb_core+0x6f2/0xdf0
  ? process_backlog+0x85/0x250
  ? process_backlog+0x85/0x250
  ? process_backlog+0xec/0x250
  process_backlog+0xec/0x250
  net_rx_action+0x153/0x480
  __do_softirq+0xd9/0x4f7
  do_softirq_own_stack+0x2a/0x40
  </IRQ>
  ...

While not present in the backtrace, ipv6_renew_option() ends up calling
access_ok() via the following chain:

  access_ok()
  _copy_from_user()
  copy_from_user()
  ipv6_renew_option()

The fix presented in this patch is to perform the userspace copy
earlier in the call chain such that it is only called when the option
data is actually coming from userspace; that place is
do_ipv6_setsockopt().  Not only does this solve the problem seen in
the backtrace above, it also allows us to simplify the code quite a
bit by removing ipv6_renew_options_kern() completely.  We also take
this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option()
a small amount as well.

This patch is heavily based on a rough patch by Al Viro.  I've taken
his original patch, converted a kmemdup() call in do_ipv6_setsockopt()
to a memdup_user() call, made better use of the e_inval jump target in
the same function, and cleaned up the use ipv6_renew_option() by
ipv6_renew_options().

CC: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       |   9 +---
 net/ipv6/calipso.c       |   9 ++--
 net/ipv6/exthdrs.c       | 111 +++++++++++++----------------------------------
 net/ipv6/ipv6_sockglue.c |  27 ++++++++----
 4 files changed, 53 insertions(+), 103 deletions(-)

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 16475c269749..d02881e4ad1f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
 struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
 					  struct ipv6_txoptions *opt,
 					  int newtype,
-					  struct ipv6_opt_hdr __user *newopt,
-					  int newoptlen);
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk,
-			struct ipv6_txoptions *opt,
-			int newtype,
-			struct ipv6_opt_hdr *newopt,
-			int newoptlen);
+					  struct ipv6_opt_hdr *newopt);
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt);
 
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
 {
 	struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
 
-	txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
-					 hop, hop ? ipv6_optlen(hop) : 0);
+	txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
 	txopt_put(old);
 	if (IS_ERR(txopts))
 		return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	kfree(new);
 
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
 	if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
 		return; /* Nothing to do */
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	if (!IS_ERR(txopts)) {
 		txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..20291c2036fc 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
 }
 EXPORT_SYMBOL_GPL(ipv6_dup_options);
 
-static int ipv6_renew_option(void *ohdr,
-			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
-			     int inherit,
-			     struct ipv6_opt_hdr **hdr,
-			     char **p)
+static void ipv6_renew_option(int renewtype,
+			      struct ipv6_opt_hdr **dest,
+			      struct ipv6_opt_hdr *old,
+			      struct ipv6_opt_hdr *new,
+			      int newtype, char **p)
 {
-	if (inherit) {
-		if (ohdr) {
-			memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			*p += CMSG_ALIGN(ipv6_optlen(*hdr));
-		}
-	} else {
-		if (newopt) {
-			if (copy_from_user(*p, newopt, newoptlen))
-				return -EFAULT;
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			if (ipv6_optlen(*hdr) > newoptlen)
-				return -EINVAL;
-			*p += CMSG_ALIGN(newoptlen);
-		}
-	}
-	return 0;
+	struct ipv6_opt_hdr *src;
+
+	src = (renewtype == newtype ? new : old);
+	if (!src)
+		return;
+
+	memcpy(*p, src, ipv6_optlen(src));
+	*dest = (struct ipv6_opt_hdr *)*p;
+	*p += CMSG_ALIGN(ipv6_optlen(*dest));
 }
 
 /**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
  */
 struct ipv6_txoptions *
 ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
-		   int newtype,
-		   struct ipv6_opt_hdr __user *newopt, int newoptlen)
+		   int newtype, struct ipv6_opt_hdr *newopt)
 {
 	int tot_len = 0;
 	char *p;
 	struct ipv6_txoptions *opt2;
-	int err;
 
 	if (opt) {
 		if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
 	}
 
-	if (newopt && newoptlen)
-		tot_len += CMSG_ALIGN(newoptlen);
+	if (newopt)
+		tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
 
 	if (!tot_len)
 		return NULL;
@@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->tot_len = tot_len;
 	p = (char *)(opt2 + 1);
 
-	err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
-				newtype != IPV6_HOPOPTS,
-				&opt2->hopopt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDRDSTOPTS,
-				&opt2->dst0opt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDR,
-				(struct ipv6_opt_hdr **)&opt2->srcrt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
-				newtype != IPV6_DSTOPTS,
-				&opt2->dst1opt, &p);
-	if (err)
-		goto out;
+	ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt,
+			  (opt ? opt->hopopt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
+			  (opt ? opt->dst0opt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDR,
+			  (struct ipv6_opt_hdr **)&opt2->srcrt,
+			  (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL),
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
+			  (opt ? opt->dst1opt : NULL),
+			  newopt, newtype, &p);
 
 	opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
 			  (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
 
 	return opt2;
-out:
-	sock_kfree_s(sk, opt2, opt2->tot_len);
-	return ERR_PTR(err);
-}
-
-/**
- * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
- *
- * @sk: sock from which to allocate memory
- * @opt: original options
- * @newtype: option type to replace in @opt
- * @newopt: new option of type @newtype to replace (kernel-mem)
- * @newoptlen: length of @newopt
- *
- * See ipv6_renew_options().  The difference is that @newopt is
- * kernel memory, rather than user memory.
- */
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
-			int newtype, struct ipv6_opt_hdr *newopt,
-			int newoptlen)
-{
-	struct ipv6_txoptions *ret_val;
-	const mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret_val = ipv6_renew_options(sk, opt, newtype,
-				     (struct ipv6_opt_hdr __user *)newopt,
-				     newoptlen);
-	set_fs(old_fs);
-	return ret_val;
 }
 
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..c95c3486d904 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DSTOPTS:
 	{
 		struct ipv6_txoptions *opt;
+		struct ipv6_opt_hdr *new = NULL;
+
+		/* hop-by-hop / destination options are privileged option */
+		retv = -EPERM;
+		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+			break;
 
 		/* remove any sticky options header with a zero option
 		 * length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		else if (optlen < sizeof(struct ipv6_opt_hdr) ||
 			 optlen & 0x7 || optlen > 8 * 255)
 			goto e_inval;
-
-		/* hop-by-hop / destination options are privileged option */
-		retv = -EPERM;
-		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
-			break;
+		else {
+			new = memdup_user(optval, optlen);
+			if (IS_ERR(new)) {
+				retv = PTR_ERR(new);
+				break;
+			}
+			if (unlikely(ipv6_optlen(new) > optlen)) {
+				kfree(new);
+				goto e_inval;
+			}
+		}
 
 		opt = rcu_dereference_protected(np->opt,
 						lockdep_sock_is_held(sk));
-		opt = ipv6_renew_options(sk, opt, optname,
-					 (struct ipv6_opt_hdr __user *)optval,
-					 optlen);
+		opt = ipv6_renew_options(sk, opt, optname, new);
+		kfree(new);
 		if (IS_ERR(opt)) {
 			retv = PTR_ERR(opt);
 			break;
-- 
cgit v1.2.3


From fdf5fd3975666804118e62c69de25dc85cc0909c Mon Sep 17 00:00:00 2001
From: Arun Kumar Neelakantam <aneela@codeaurora.org>
Date: Wed, 4 Jul 2018 19:49:32 +0530
Subject: net: qrtr: Broadcast messages only from control port

The broadcast node id should only be sent with the control port id.

Signed-off-by: Arun Kumar Neelakantam <aneela@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/qrtr/qrtr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 2aa07b547b16..7ffc9a3a7dd4 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -764,6 +764,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	node = NULL;
 	if (addr->sq_node == QRTR_NODE_BCAST) {
 		enqueue_fn = qrtr_bcast_enqueue;
+		if (addr->sq_port != QRTR_PORT_CTRL) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
 	} else if (addr->sq_node == ipc->us.sq_node) {
 		enqueue_fn = qrtr_local_enqueue;
 	} else {
-- 
cgit v1.2.3


From d27e77a3de2866b0a772803fd03cd667b5ff8a9a Mon Sep 17 00:00:00 2001
From: Arun Kumar Neelakantam <aneela@codeaurora.org>
Date: Wed, 4 Jul 2018 19:49:33 +0530
Subject: net: qrtr: Reset the node and port ID of broadcast messages

All the control messages broadcast to remote routers are using
QRTR_NODE_BCAST instead of using local router NODE ID which cause
the packets to be dropped on remote router due to invalid NODE ID.

Signed-off-by: Arun Kumar Neelakantam <aneela@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/qrtr/qrtr.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 7ffc9a3a7dd4..86e1e37eb4e8 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -191,8 +191,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
 	hdr->type = cpu_to_le32(type);
 	hdr->src_node_id = cpu_to_le32(from->sq_node);
 	hdr->src_port_id = cpu_to_le32(from->sq_port);
-	hdr->dst_node_id = cpu_to_le32(to->sq_node);
-	hdr->dst_port_id = cpu_to_le32(to->sq_port);
+	if (to->sq_port == QRTR_PORT_CTRL) {
+		hdr->dst_node_id = cpu_to_le32(node->nid);
+		hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST);
+	} else {
+		hdr->dst_node_id = cpu_to_le32(to->sq_node);
+		hdr->dst_port_id = cpu_to_le32(to->sq_port);
+	}
 
 	hdr->size = cpu_to_le32(len);
 	hdr->confirm_rx = 0;
-- 
cgit v1.2.3


From 70ba5b6db96ff7324b8cfc87e0d0383cf59c9677 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 5 Jul 2018 18:49:23 +0000
Subject: ipv4: Return EINVAL when ping_group_range sysctl doesn't map to user
 ns

The low and high values of the net.ipv4.ping_group_range sysctl were
being silently forced to the default disabled state when a write to the
sysctl contained GIDs that didn't map to the associated user namespace.
Confusingly, the sysctl's write operation would return success and then
a subsequent read of the sysctl would indicate that the low and high
values are the overflowgid.

This patch changes the behavior by clearly returning an error when the
sysctl write operation receives a GID range that doesn't map to the
associated user namespace. In such a situation, the previous value of
the sysctl is preserved and that range will be returned in a subsequent
read of the sysctl.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index af0a857d8352..5fa335fd3852 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
 	if (write && ret == 0) {
 		low = make_kgid(user_ns, urange[0]);
 		high = make_kgid(user_ns, urange[1]);
-		if (!gid_valid(low) || !gid_valid(high) ||
-		    (urange[1] < urange[0]) || gid_lt(high, low)) {
+		if (!gid_valid(low) || !gid_valid(high))
+			return -EINVAL;
+		if (urange[1] < urange[0] || gid_lt(high, low)) {
 			low = make_kgid(&init_user_ns, 1);
 			high = make_kgid(&init_user_ns, 0);
 		}
-- 
cgit v1.2.3


From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Thu, 5 Jul 2018 12:01:53 +0200
Subject: netfilter: nf_tproxy: fix possible non-linear access to transport
 header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a silent out-of-bound read possibility that was present
because of the misuse of this function.

Mostly it was called with a struct udphdr *hp which had only the udphdr
part linearized by the skb_header_pointer, however
nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for
tcp specific attributes may be invalid.

Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy.h   |  4 ++--
 net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 ++++++++++++------
 net/ipv6/netfilter/nf_tproxy_ipv6.c | 18 ++++++++++++------
 net/netfilter/xt_TPROXY.c           |  8 ++++----
 4 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h
index 9754a50ecde9..4cc64c8446eb 100644
--- a/include/net/netfilter/nf_tproxy.h
+++ b/include/net/netfilter/nf_tproxy.h
@@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
  * belonging to established connections going through that one.
  */
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 			    struct sock *sk);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					    iph->saddr, laddr ? laddr : iph->daddr,
 					    hp->source, lport ? lport : hp->dest,
 					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
 
 struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
 		      const u8 protocol,
 		      const __be32 saddr, const __be32 daddr,
 		      const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
 						    ip_hdrlen(skb) +
-						      __tcp_hdrlen(tcph),
+						      __tcp_hdrlen(hp),
 						    saddr, sport,
 						    daddr, dport,
 						    in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index bf1d6c421e3b..5dfd33af6451 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
 					    &iph->saddr,
 					    nf_tproxy_laddr6(skb, laddr, &iph->daddr),
 					    hp->source,
@@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
 
 struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
 		      const u8 protocol,
 		      const struct in6_addr *saddr, const struct in6_addr *daddr,
 		      const __be16 sport, const __be16 dport,
@@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 		      const enum nf_tproxy_lookup_t lookup_type)
 {
 	struct sock *sk;
-	struct tcphdr *tcph;
 
 	switch (protocol) {
-	case IPPROTO_TCP:
+	case IPPROTO_TCP: {
+		struct tcphdr _hdr, *hp;
+
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(struct tcphdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+
 		switch (lookup_type) {
 		case NF_TPROXY_LOOKUP_LISTENER:
-			tcph = hp;
 			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
-						   thoff + __tcp_hdrlen(tcph),
+						   thoff + __tcp_hdrlen(hp),
 						   saddr, sport,
 						   daddr, ntohs(dport),
 						   in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
 			BUG();
 		}
 		break;
+		}
 	case IPPROTO_UDP:
 		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
 				     in->ifindex);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..d76550a8b642 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -61,7 +61,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
 				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -77,7 +77,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
 					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -150,7 +150,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
+	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
 				   xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -171,7 +171,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
+		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff,
 					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
 					   xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
-- 
cgit v1.2.3


From a948f713842ad5c23f125efc61dee6951893219c Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Tue, 3 Jul 2018 15:05:48 -0500
Subject: nl80211/mac80211: allow non-linear skb in rx_control_port

The current implementation of cfg80211_rx_control_port assumed that the
caller could provide a contiguous region of memory for the control port
frame to be sent up to userspace.  Unfortunately, many drivers produce
non-linear skbs, especially for data frames.  This resulted in userspace
getting notified of control port frames with correct metadata (from
address, port, etc) yet garbage / nonsense contents, resulting in bad
handshakes, disconnections, etc.

mac80211 linearizes skbs containing management frames.  But it didn't
seem worthwhile to do this for control port frames.  Thus the signature
of cfg80211_rx_control_port was changed to take the skb directly.
nl80211 then takes care of obtaining control port frame data directly
from the (linear | non-linear) skb.

The caller is still responsible for freeing the skb,
cfg80211_rx_control_port does not take ownership of it.

Fixes: 6a671a50f819 ("nl80211: Add CMD_CONTROL_PORT_FRAME API")
Signed-off-by: Denis Kenzior <denkenz@gmail.com>
[fix some kernel-doc formatting, add fixes tag]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 12 ++++++------
 net/mac80211/rx.c      |  5 +----
 net/wireless/nl80211.c | 24 +++++++++++++++---------
 net/wireless/trace.h   | 18 ++++++++++--------
 4 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5fbfe61f41c6..1beb3ead0385 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5835,10 +5835,11 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 /**
  * cfg80211_rx_control_port - notification about a received control port frame
  * @dev: The device the frame matched to
- * @buf: control port frame
- * @len: length of the frame data
- * @addr: The peer from which the frame was received
- * @proto: frame protocol, typically PAE or Pre-authentication
+ * @skb: The skbuf with the control port frame.  It is assumed that the skbuf
+ *	is 802.3 formatted (with 802.3 header).  The skb can be non-linear.
+ *	This function does not take ownership of the skb, so the caller is
+ *	responsible for any cleanup.  The caller must also ensure that
+ *	skb->protocol is set appropriately.
  * @unencrypted: Whether the frame was received unencrypted
  *
  * This function is used to inform userspace about a received control port
@@ -5851,8 +5852,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
  * Return: %true if the frame was passed to userspace
  */
 bool cfg80211_rx_control_port(struct net_device *dev,
-			      const u8 *buf, size_t len,
-			      const u8 *addr, u16 proto, bool unencrypted);
+			      struct sk_buff *skb, bool unencrypted);
 
 /**
  * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0a38cc1cbebc..932985ca4e66 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2254,11 +2254,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
 		     sdata->control_port_over_nl80211)) {
 		struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 		bool noencrypt = status->flag & RX_FLAG_DECRYPTED;
-		struct ethhdr *ehdr = eth_hdr(skb);
 
-		cfg80211_rx_control_port(dev, skb->data, skb->len,
-					 ehdr->h_source,
-					 be16_to_cpu(skb->protocol), noencrypt);
+		cfg80211_rx_control_port(dev, skb, noencrypt);
 		dev_kfree_skb(skb);
 	} else {
 		/* deliver to local stack */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4eece06be1e7..b6c700572755 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14923,20 +14923,24 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
 
 static int __nl80211_rx_control_port(struct net_device *dev,
-				     const u8 *buf, size_t len,
-				     const u8 *addr, u16 proto,
+				     struct sk_buff *skb,
 				     bool unencrypted, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct ethhdr *ehdr = eth_hdr(skb);
+	const u8 *addr = ehdr->h_source;
+	u16 proto = be16_to_cpu(skb->protocol);
 	struct sk_buff *msg;
 	void *hdr;
+	struct nlattr *frame;
+
 	u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid);
 
 	if (!nlportid)
 		return -ENOENT;
 
-	msg = nlmsg_new(100 + len, gfp);
+	msg = nlmsg_new(100 + skb->len, gfp);
 	if (!msg)
 		return -ENOMEM;
 
@@ -14950,13 +14954,17 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
 	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
 			      NL80211_ATTR_PAD) ||
-	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
 	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
 	    nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
 	    (unencrypted && nla_put_flag(msg,
 					 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
 		goto nla_put_failure;
 
+	frame = nla_reserve(msg, NL80211_ATTR_FRAME, skb->len);
+	if (!frame)
+		goto nla_put_failure;
+
+	skb_copy_bits(skb, 0, nla_data(frame), skb->len);
 	genlmsg_end(msg, hdr);
 
 	return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
@@ -14967,14 +14975,12 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 }
 
 bool cfg80211_rx_control_port(struct net_device *dev,
-			      const u8 *buf, size_t len,
-			      const u8 *addr, u16 proto, bool unencrypted)
+			      struct sk_buff *skb, bool unencrypted)
 {
 	int ret;
 
-	trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted);
-	ret = __nl80211_rx_control_port(dev, buf, len, addr, proto,
-					unencrypted, GFP_ATOMIC);
+	trace_cfg80211_rx_control_port(dev, skb, unencrypted);
+	ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC);
 	trace_cfg80211_return_bool(ret == 0);
 	return ret == 0;
 }
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2b417a2fe63f..7c73510b161f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2627,23 +2627,25 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
 );
 
 TRACE_EVENT(cfg80211_rx_control_port,
-	TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len,
-		 const u8 *addr, u16 proto, bool unencrypted),
-	TP_ARGS(netdev, buf, len, addr, proto, unencrypted),
+	TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
+		 bool unencrypted),
+	TP_ARGS(netdev, skb, unencrypted),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
-		MAC_ENTRY(addr)
+		__field(int, len)
+		MAC_ENTRY(from)
 		__field(u16, proto)
 		__field(bool, unencrypted)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
-		MAC_ASSIGN(addr, addr);
-		__entry->proto = proto;
+		__entry->len = skb->len;
+		MAC_ASSIGN(from, eth_hdr(skb)->h_source);
+		__entry->proto = be16_to_cpu(skb->protocol);
 		__entry->unencrypted = unencrypted;
 	),
-	TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s",
-		  NETDEV_PR_ARG, MAC_PR_ARG(addr),
+	TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s",
+		  NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from),
 		  __entry->proto, BOOL_TO_STR(__entry->unencrypted))
 );
 
-- 
cgit v1.2.3


From e240cd0df48185a28c153f83a39ba3940e3e9b86 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 6 Jul 2018 19:06:43 +0200
Subject: netfilter: nf_tables: place all set backends in one single module

This patch disallows rbtree with single elements, which is causing
problems with the recent timeout support. Before this patch, you
could opt out individual set representations per module, which is
just adding extra complexity.

Fixes: 8d8540c4f5e0("netfilter: nft_set_rbtree: add timeout support")
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  6 ++++++
 net/netfilter/Kconfig                  | 25 +++++++------------------
 net/netfilter/Makefile                 |  7 ++++---
 net/netfilter/nf_tables_set_core.c     | 28 ++++++++++++++++++++++++++++
 net/netfilter/nft_set_bitmap.c         | 19 +------------------
 net/netfilter/nft_set_hash.c           | 29 +++--------------------------
 net/netfilter/nft_set_rbtree.c         | 19 +------------------
 7 files changed, 50 insertions(+), 83 deletions(-)
 create mode 100644 net/netfilter/nf_tables_set_core.c

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index e0c0c2558ec4..a05134507e7b 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -65,4 +65,10 @@ extern const struct nft_expr_ops nft_payload_fast_ops;
 extern struct static_key_false nft_counters_enabled;
 extern struct static_key_false nft_trace_enabled;
 
+extern struct nft_set_type nft_set_rhash_type;
+extern struct nft_set_type nft_set_hash_type;
+extern struct nft_set_type nft_set_hash_fast_type;
+extern struct nft_set_type nft_set_rbtree_type;
+extern struct nft_set_type nft_set_bitmap_type;
+
 #endif /* _NET_NF_TABLES_CORE_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dbd7d1fad277..f0a1c536ef15 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -460,6 +460,13 @@ config NF_TABLES
 
 if NF_TABLES
 
+config NF_TABLES_SET
+	tristate "Netfilter nf_tables set infrastructure"
+	help
+	  This option enables the nf_tables set infrastructure that allows to
+	  look up for elements in a set and to build one-way mappings between
+	  matchings and actions.
+
 config NF_TABLES_INET
 	depends on IPV6
 	select NF_TABLES_IPV4
@@ -493,24 +500,6 @@ config NFT_FLOW_OFFLOAD
 	  This option adds the "flow_offload" expression that you can use to
 	  choose what flows are placed into the hardware.
 
-config NFT_SET_RBTREE
-	tristate "Netfilter nf_tables rbtree set module"
-	help
-	  This option adds the "rbtree" set type (Red Black tree) that is used
-	  to build interval-based sets.
-
-config NFT_SET_HASH
-	tristate "Netfilter nf_tables hash set module"
-	help
-	  This option adds the "hash" set type that is used to build one-way
-	  mappings between matchings and actions.
-
-config NFT_SET_BITMAP
-	tristate "Netfilter nf_tables bitmap set module"
-	help
-	  This option adds the "bitmap" set type that is used to build sets
-	  whose keys are smaller or equal to 16 bits.
-
 config NFT_COUNTER
 	tristate "Netfilter nf_tables counter module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..8a76dced974d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -78,7 +78,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
 		  nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
 
+nf_tables_set-objs := nf_tables_set_core.o \
+		      nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
+
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NF_TABLES_SET)	+= nf_tables_set.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_CONNLIMIT)	+= nft_connlimit.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
@@ -91,9 +95,6 @@ obj-$(CONFIG_NFT_QUEUE)		+= nft_queue.o
 obj-$(CONFIG_NFT_QUOTA)		+= nft_quota.o
 obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o
-obj-$(CONFIG_NFT_SET_RBTREE)	+= nft_set_rbtree.o
-obj-$(CONFIG_NFT_SET_HASH)	+= nft_set_hash.o
-obj-$(CONFIG_NFT_SET_BITMAP)	+= nft_set_bitmap.o
 obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
 obj-$(CONFIG_NFT_MASQ)		+= nft_masq.o
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
new file mode 100644
index 000000000000..814789644bd3
--- /dev/null
+++ b/net/netfilter/nf_tables_set_core.c
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <net/netfilter/nf_tables_core.h>
+
+static int __init nf_tables_set_module_init(void)
+{
+	nft_register_set(&nft_set_hash_fast_type);
+	nft_register_set(&nft_set_hash_type);
+	nft_register_set(&nft_set_rhash_type);
+	nft_register_set(&nft_set_bitmap_type);
+	nft_register_set(&nft_set_rbtree_type);
+
+	return 0;
+}
+
+static void __exit nf_tables_set_module_exit(void)
+{
+	nft_unregister_set(&nft_set_rbtree_type);
+	nft_unregister_set(&nft_set_bitmap_type);
+	nft_unregister_set(&nft_set_rhash_type);
+	nft_unregister_set(&nft_set_hash_type);
+	nft_unregister_set(&nft_set_hash_fast_type);
+}
+
+module_init(nf_tables_set_module_init);
+module_exit(nf_tables_set_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index d6626e01c7ee..128bc16f52dd 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-static struct nft_set_type nft_bitmap_type __read_mostly = {
+struct nft_set_type nft_set_bitmap_type __read_mostly = {
 	.owner		= THIS_MODULE,
 	.ops		= {
 		.privsize	= nft_bitmap_privsize,
@@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = {
 		.get		= nft_bitmap_get,
 	},
 };
-
-static int __init nft_bitmap_module_init(void)
-{
-	return nft_register_set(&nft_bitmap_type);
-}
-
-static void __exit nft_bitmap_module_exit(void)
-{
-	nft_unregister_set(&nft_bitmap_type);
-}
-
-module_init(nft_bitmap_module_init);
-module_exit(nft_bitmap_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 6f9a1365a09f..72ef35b51cac 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -654,7 +654,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
 	return true;
 }
 
-static struct nft_set_type nft_rhash_type __read_mostly = {
+struct nft_set_type nft_set_rhash_type __read_mostly = {
 	.owner		= THIS_MODULE,
 	.features	= NFT_SET_MAP | NFT_SET_OBJECT |
 			  NFT_SET_TIMEOUT | NFT_SET_EVAL,
@@ -677,7 +677,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = {
 	},
 };
 
-static struct nft_set_type nft_hash_type __read_mostly = {
+struct nft_set_type nft_set_hash_type __read_mostly = {
 	.owner		= THIS_MODULE,
 	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
 	.ops		= {
@@ -697,7 +697,7 @@ static struct nft_set_type nft_hash_type __read_mostly = {
 	},
 };
 
-static struct nft_set_type nft_hash_fast_type __read_mostly = {
+struct nft_set_type nft_set_hash_fast_type __read_mostly = {
 	.owner		= THIS_MODULE,
 	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
 	.ops		= {
@@ -716,26 +716,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = {
 		.get		= nft_hash_get,
 	},
 };
-
-static int __init nft_hash_module_init(void)
-{
-	if (nft_register_set(&nft_hash_fast_type) ||
-	    nft_register_set(&nft_hash_type) ||
-	    nft_register_set(&nft_rhash_type))
-		return 1;
-	return 0;
-}
-
-static void __exit nft_hash_module_exit(void)
-{
-	nft_unregister_set(&nft_rhash_type);
-	nft_unregister_set(&nft_hash_type);
-	nft_unregister_set(&nft_hash_fast_type);
-}
-
-module_init(nft_hash_module_init);
-module_exit(nft_hash_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7f3a9a211034..1f8f257cb518 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -462,7 +462,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-static struct nft_set_type nft_rbtree_type __read_mostly = {
+struct nft_set_type nft_set_rbtree_type __read_mostly = {
 	.owner		= THIS_MODULE,
 	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
 	.ops		= {
@@ -481,20 +481,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = {
 		.get		= nft_rbtree_get,
 	},
 };
-
-static int __init nft_rbtree_module_init(void)
-{
-	return nft_register_set(&nft_rbtree_type);
-}
-
-static void __exit nft_rbtree_module_exit(void)
-{
-	nft_unregister_set(&nft_rbtree_type);
-}
-
-module_init(nft_rbtree_module_init);
-module_exit(nft_rbtree_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
-- 
cgit v1.2.3


From 2a57f182420174c7fd4b19db979a2d135231a963 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 6 Jul 2018 20:10:03 +0200
Subject: tipc: fix wrong return value from function tipc_node_try_addr()

The function for checking if there is an node address conflict is
supposed to return a suggestion for a new address if it finds a
conflict, and zero otherwise. But in case the peer being checked
is previously unknown it does instead return a "suggestion" for
the checked address itself. This results in a DSC_TRIAL_FAIL_MSG
being sent unecessarily to the peer, and sometimes makes the trial
period starting over again.

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 6a44eb812baf..0453bd451ce8 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -797,6 +797,7 @@ static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
 }
 
 /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
+ * Returns suggested address if any, otherwise 0
  */
 u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
 {
@@ -819,12 +820,14 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
 	if (n) {
 		addr = n->addr;
 		tipc_node_put(n);
+		return addr;
 	}
-	/* Even this node may be in trial phase */
+
+	/* Even this node may be in conflict */
 	if (tn->trial_addr == addr)
 		return tipc_node_suggest_addr(net, addr);
 
-	return addr;
+	return 0;
 }
 
 void tipc_node_check_dest(struct net *net, u32 addr,
-- 
cgit v1.2.3


From e415577f57f4452150642500364cbe5fa6112813 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 6 Jul 2018 20:10:04 +0200
Subject: tipc: correct discovery message handling during address trial period

With the duplicate address discovery protocol for tipc nodes addresses
we introduced a one second trial period before a node is allocated a
hash number to use as address.

Unfortunately, we miss to handle the case when a regular LINK REQUEST/
RESPONSE arrives from a cluster node during the trial period. Such
messages are not ignored as they should be, leading to links setup
attempts while the node still has no address.

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/discover.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 9f666e0650e2..dcadc10dffd1 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -133,6 +133,8 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
 }
 
 /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer
+ * Returns true if message should be dropped by caller, i.e., if it is a
+ * trial message or we are inside trial period. Otherwise false.
  */
 static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
 				     struct tipc_media_addr *maddr,
@@ -168,8 +170,9 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
 		msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
 	}
 
+	/* Accept regular link requests/responses only after trial period */
 	if (mtyp != DSC_TRIAL_MSG)
-		return false;
+		return trial;
 
 	sugg_addr = tipc_node_try_addr(net, peer_id, src);
 	if (sugg_addr)
-- 
cgit v1.2.3


From 92018c7ca959ccd346d6235dac03cf7fc1ba51f7 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 6 Jul 2018 20:10:05 +0200
Subject: tipc: fix correct setting of message type in second discoverer

The duplicate address discovery protocol is not safe against two
discoverers running in parallel. The one executing first after the
trial period is over will set the node address and change its own
message type to DSC_REQ_MSG. The one executing last may find that the
node address is already set, and never change message type, with the
result that its links may never be established.

In this commmit we ensure that the message type always is set correctly
after the trial period is over.

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/discover.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index dcadc10dffd1..2830709957bd 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -287,7 +287,6 @@ static void tipc_disc_timeout(struct timer_list *t)
 {
 	struct tipc_discoverer *d = from_timer(d, t, timer);
 	struct tipc_net *tn = tipc_net(d->net);
-	u32 self = tipc_own_addr(d->net);
 	struct tipc_media_addr maddr;
 	struct sk_buff *skb = NULL;
 	struct net *net = d->net;
@@ -301,12 +300,14 @@ static void tipc_disc_timeout(struct timer_list *t)
 		goto exit;
 	}
 
-	/* Did we just leave the address trial period ? */
-	if (!self && !time_before(jiffies, tn->addr_trial_end)) {
-		self = tn->trial_addr;
-		tipc_net_finalize(net, self);
-		msg_set_prevnode(buf_msg(d->skb), self);
+	/* Trial period over ? */
+	if (!time_before(jiffies, tn->addr_trial_end)) {
+		/* Did we just leave it ? */
+		if (!tipc_own_addr(net))
+			tipc_net_finalize(net, tn->trial_addr);
+
 		msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+		msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net));
 	}
 
 	/* Adjust timeout interval according to discovery phase */
-- 
cgit v1.2.3


From 9faa89d4ed9d7d326f4763d262842270450f9b1f Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Fri, 6 Jul 2018 20:10:06 +0200
Subject: tipc: make function tipc_net_finalize() thread safe

The setting of the node address is not thread safe, meaning that
two discoverers may decide to set it simultanously, with a duplicate
entry in the name table as result. We fix that with this commit.

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/net.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/tipc/net.c b/net/tipc/net.c
index 4fbaa0464405..a7f6964c3a4b 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -121,12 +121,17 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
 
 void tipc_net_finalize(struct net *net, u32 addr)
 {
-	tipc_set_node_addr(net, addr);
-	smp_mb();
-	tipc_named_reinit(net);
-	tipc_sk_reinit(net);
-	tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
-			     TIPC_CLUSTER_SCOPE, 0, addr);
+	struct tipc_net *tn = tipc_net(net);
+
+	spin_lock_bh(&tn->node_list_lock);
+	if (!tipc_own_addr(net)) {
+		tipc_set_node_addr(net, addr);
+		tipc_named_reinit(net);
+		tipc_sk_reinit(net);
+		tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
+				     TIPC_CLUSTER_SCOPE, 0, addr);
+	}
+	spin_unlock_bh(&tn->node_list_lock);
 }
 
 void tipc_net_stop(struct net *net)
-- 
cgit v1.2.3


From e1bbdd57047454dad068dc36612dd60a57f4c58f Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 5 Jul 2018 16:15:30 +0200
Subject: net/smc: reduce sock_put() for fallback sockets

smc_release() calls a sock_put() for smc fallback sockets to cover
the passive closing sock_hold() in __smc_connect() and
smc_tcp_listen_work(). This does not make sense for sockets in state
SMC_LISTEN and SMC_INIT.
An SMC socket stays in state SMC_INIT if connect fails. The sock_put
in smc_connect_abort() does not cover all failures. Move it into
smc_connect_decline_fallback().

Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback")
Reported-by: syzbot+3a0748c8f2f210c0ef9b@syzkaller.appspotmail.com
Reported-by: syzbot+9e60d2428a42049a592a@syzkaller.appspotmail.com
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c    | 15 ++++++++++-----
 net/smc/smc_close.c |  2 ++
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e017b6a4452b..5334157f5065 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -147,7 +147,8 @@ static int smc_release(struct socket *sock)
 		smc->clcsock = NULL;
 	}
 	if (smc->use_fallback) {
-		sock_put(sk); /* passive closing */
+		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
+			sock_put(sk); /* passive closing */
 		sk->sk_state = SMC_CLOSED;
 		sk->sk_state_change(sk);
 	}
@@ -417,12 +418,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
 {
 	int rc;
 
-	if (reason_code < 0) /* error, fallback is not possible */
+	if (reason_code < 0) { /* error, fallback is not possible */
+		if (smc->sk.sk_state == SMC_INIT)
+			sock_put(&smc->sk); /* passive closing */
 		return reason_code;
+	}
 	if (reason_code != SMC_CLC_DECL_REPLY) {
 		rc = smc_clc_send_decline(smc, reason_code);
-		if (rc < 0)
+		if (rc < 0) {
+			if (smc->sk.sk_state == SMC_INIT)
+				sock_put(&smc->sk); /* passive closing */
 			return rc;
+		}
 	}
 	return smc_connect_fallback(smc);
 }
@@ -435,8 +442,6 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code,
 		smc_lgr_forget(smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_conn_free(&smc->conn);
-	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
-		sock_put(&smc->sk); /* passive closing */
 	return reason_code;
 }
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index fa41d9881741..ac961dfb1ea1 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -107,6 +107,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	}
 	switch (sk->sk_state) {
 	case SMC_INIT:
+		sk->sk_state = SMC_PEERABORTWAIT;
+		break;
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_PEERABORTWAIT;
 		release_sock(sk);
-- 
cgit v1.2.3


From 11a245e2f7bf25fc21f47e4c9c8491841b128890 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 6 Jul 2018 21:01:05 +0200
Subject: net/sched: act_csum: fix NULL dereference when 'goto chain' is used

the control action in the common member of struct tcf_csum must be a valid
value, as it can contain the chain index when 'goto chain' is used. Ensure
that the control action can be read as x->tcfa_action, when x is a pointer
to struct tc_action and x->ops->type is TCA_ACT_CSUM, to prevent the
following command:

  # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
  > $tcflags dst_mac $h2mac action csum ip or tcp or udp or sctp goto chain 1

from triggering a NULL pointer dereference when a matching packet is
received.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
 PGD 800000010416b067 P4D 800000010416b067 PUD 1041be067 PMD 0
 Oops: 0000 [#1] SMP PTI
 CPU: 0 PID: 3072 Comm: mausezahn Tainted: G            E     4.18.0-rc2.auguri+ #421
 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013
 RIP: 0010:tcf_action_exec+0xb8/0x100
 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3
 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246
 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054
 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00
 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c
 R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00
 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300
 FS:  00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0
 Call Trace:
  <IRQ>
  fl_classify+0x1ad/0x1c0 [cls_flower]
  ? arp_rcv+0x121/0x1b0
  ? __x2apic_send_IPI_dest+0x40/0x40
  ? smp_reschedule_interrupt+0x1c/0xd0
  ? reschedule_interrupt+0xf/0x20
  ? reschedule_interrupt+0xa/0x20
  ? device_is_rmrr_locked+0xe/0x50
  ? iommu_should_identity_map+0x49/0xd0
  ? __intel_map_single+0x30/0x140
  ? e1000e_update_rdt_wa.isra.52+0x22/0xb0 [e1000e]
  ? e1000_alloc_rx_buffers+0x233/0x250 [e1000e]
  ? kmem_cache_alloc+0x38/0x1c0
  tcf_classify+0x89/0x140
  __netif_receive_skb_core+0x5ea/0xb70
  ? enqueue_task_fair+0xb6/0x7d0
  ? process_backlog+0x97/0x150
  process_backlog+0x97/0x150
  net_rx_action+0x14b/0x3e0
  __do_softirq+0xde/0x2b4
  do_softirq_own_stack+0x2a/0x40
  </IRQ>
  do_softirq.part.18+0x49/0x50
  __local_bh_enable_ip+0x49/0x50
  __dev_queue_xmit+0x4ab/0x8a0
  ? wait_woken+0x80/0x80
  ? packet_sendmsg+0x38f/0x810
  ? __dev_queue_xmit+0x8a0/0x8a0
  packet_sendmsg+0x38f/0x810
  sock_sendmsg+0x36/0x40
  __sys_sendto+0x10e/0x140
  ? do_vfs_ioctl+0xa4/0x630
  ? syscall_trace_enter+0x1df/0x2e0
  ? __audit_syscall_exit+0x22a/0x290
  __x64_sys_sendto+0x24/0x30
  do_syscall_64+0x5b/0x180
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7f5a45cbec93
 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24
 RSP: 002b:00007ffd0ee6d748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000001161010 RCX: 00007f5a45cbec93
 RDX: 0000000000000062 RSI: 0000000001161322 RDI: 0000000000000003
 RBP: 00007ffd0ee6d780 R08: 00007ffd0ee6d760 R09: 0000000000000014
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062
 R13: 0000000001161322 R14: 00007ffd0ee6d760 R15: 0000000000000003
 Modules linked in: act_csum act_gact cls_flower sch_ingress vrf veth act_tunnel_key(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_hdmi snd_hda_codec_realtek kvm snd_hda_codec_generic hp_wmi iTCO_wdt sparse_keymap rfkill mei_wdt iTCO_vendor_support wmi_bmof gpio_ich irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel snd_hda_intel crypto_simd cryptd snd_hda_codec glue_helper snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm pcspkr i2c_i801 snd_timer snd sg lpc_ich soundcore wmi mei_me
  mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom sd_mod ahci libahci crc32c_intel i915 ixgbe serio_raw libata video dca i2c_algo_bit sfc drm_kms_helper syscopyarea mtd sysfillrect mdio sysimgblt fb_sys_fops drm e1000e i2c_core
 CR2: 0000000000000000
 ---[ end trace 3c9e9d1a77df4026 ]---
 RIP: 0010:tcf_action_exec+0xb8/0x100
 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3
 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246
 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054
 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00
 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c
 R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00
 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300
 FS:  00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0
 Kernel panic - not syncing: Fatal exception in interrupt
 Kernel Offset: 0x26400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
 ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Fixes: 9c5f69bbd75a ("net/sched: act_csum: don't use spinlock in the fast path")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_csum.h | 1 -
 net/sched/act_csum.c         | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 9470fd7e4350..32d2454c0479 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -7,7 +7,6 @@
 #include <linux/tc_act/tc_csum.h>
 
 struct tcf_csum_params {
-	int action;
 	u32 update_flags;
 	struct rcu_head rcu;
 };
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 526a8e491626..6e7124e57918 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -91,7 +91,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	}
 	params_old = rtnl_dereference(p->params);
 
-	params_new->action = parm->action;
+	p->tcf_action = parm->action;
 	params_new->update_flags = parm->update_flags;
 	rcu_assign_pointer(p->params, params_new);
 	if (params_old)
@@ -561,7 +561,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
 	tcf_lastuse_update(&p->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
 
-	action = params->action;
+	action = READ_ONCE(p->tcf_action);
 	if (unlikely(action == TC_ACT_SHOT))
 		goto drop_stats;
 
@@ -599,11 +599,11 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 		.index   = p->tcf_index,
 		.refcnt  = p->tcf_refcnt - ref,
 		.bindcnt = p->tcf_bindcnt - bind,
+		.action  = p->tcf_action,
 	};
 	struct tcf_t t;
 
 	params = rtnl_dereference(p->params);
-	opt.action = params->action;
 	opt.update_flags = params->update_flags;
 
 	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
-- 
cgit v1.2.3


From 38230a3e0e0933bbcf5df6fa469ba0667f667568 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 6 Jul 2018 21:01:06 +0200
Subject: net/sched: act_tunnel_key: fix NULL dereference when 'goto chain' is
 used

the control action in the common member of struct tcf_tunnel_key must be a
valid value, as it can contain the chain index when 'goto chain' is used.
Ensure that the control action can be read as x->tcfa_action, when x is a
pointer to struct tc_action and x->ops->type is TCA_ACT_TUNNEL_KEY, to
prevent the following command:

 # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
 > $tcflags dst_mac $h2mac action tunnel_key unset goto chain 1

from causing a NULL dereference when a matching packet is received:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
 PGD 80000001097ac067 P4D 80000001097ac067 PUD 103b0a067 PMD 0
 Oops: 0000 [#1] SMP PTI
 CPU: 0 PID: 3491 Comm: mausezahn Tainted: G            E     4.18.0-rc2.auguri+ #421
 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013
 RIP: 0010:tcf_action_exec+0xb8/0x100
 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3
 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246
 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001
 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000
 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c
 R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800
 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40
 FS:  00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0
 Call Trace:
  <IRQ>
  fl_classify+0x1ad/0x1c0 [cls_flower]
  ? __update_load_avg_se.isra.47+0x1ca/0x1d0
  ? __update_load_avg_se.isra.47+0x1ca/0x1d0
  ? update_load_avg+0x665/0x690
  ? update_load_avg+0x665/0x690
  ? kmem_cache_alloc+0x38/0x1c0
  tcf_classify+0x89/0x140
  __netif_receive_skb_core+0x5ea/0xb70
  ? enqueue_entity+0xd0/0x270
  ? process_backlog+0x97/0x150
  process_backlog+0x97/0x150
  net_rx_action+0x14b/0x3e0
  __do_softirq+0xde/0x2b4
  do_softirq_own_stack+0x2a/0x40
  </IRQ>
  do_softirq.part.18+0x49/0x50
  __local_bh_enable_ip+0x49/0x50
  __dev_queue_xmit+0x4ab/0x8a0
  ? wait_woken+0x80/0x80
  ? packet_sendmsg+0x38f/0x810
  ? __dev_queue_xmit+0x8a0/0x8a0
  packet_sendmsg+0x38f/0x810
  sock_sendmsg+0x36/0x40
  __sys_sendto+0x10e/0x140
  ? do_vfs_ioctl+0xa4/0x630
  ? syscall_trace_enter+0x1df/0x2e0
  ? __audit_syscall_exit+0x22a/0x290
  __x64_sys_sendto+0x24/0x30
  do_syscall_64+0x5b/0x180
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7fd67e18dc93
 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24
 RSP: 002b:00007ffe0189b748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 00000000020ca010 RCX: 00007fd67e18dc93
 RDX: 0000000000000062 RSI: 00000000020ca322 RDI: 0000000000000003
 RBP: 00007ffe0189b780 R08: 00007ffe0189b760 R09: 0000000000000014
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062
 R13: 00000000020ca322 R14: 00007ffe0189b760 R15: 0000000000000003
 Modules linked in: act_tunnel_key act_gact cls_flower sch_ingress vrf veth act_csum(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl snd_hda_codec_hdmi x86_pkg_temp_thermal intel_powerclamp snd_hda_codec_realtek coretemp snd_hda_codec_generic kvm_intel kvm irqbypass snd_hda_intel crct10dif_pclmul crc32_pclmul hp_wmi ghash_clmulni_intel pcbc snd_hda_codec aesni_intel sparse_keymap rfkill snd_hda_core snd_hwdep snd_seq crypto_simd iTCO_wdt gpio_ich iTCO_vendor_support wmi_bmof cryptd mei_wdt glue_helper snd_seq_device snd_pcm pcspkr snd_timer snd i2c_i801 lpc_ich sg soundcore wmi mei_me
  mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod sr_mod cdrom i915 video i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ahci crc32c_intel libahci serio_raw sfc libata mtd drm ixgbe mdio i2c_core e1000e dca
 CR2: 0000000000000000
 ---[ end trace 1ab8b5b5d4639dfc ]---
 RIP: 0010:tcf_action_exec+0xb8/0x100
 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3
 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246
 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001
 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000
 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c
 R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800
 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40
 FS:  00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0
 Kernel panic - not syncing: Fatal exception in interrupt
 Kernel Offset: 0x11400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
 ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_tunnel_key.h | 1 -
 net/sched/act_tunnel_key.c         | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index efef0b4b1b2b..46b8c7f1c8d5 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -18,7 +18,6 @@
 struct tcf_tunnel_key_params {
 	struct rcu_head		rcu;
 	int			tcft_action;
-	int			action;
 	struct metadata_dst     *tcft_enc_metadata;
 };
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 626dac81a48a..9bc6c2ae98a5 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -36,7 +36,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
 
 	tcf_lastuse_update(&t->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
-	action = params->action;
+	action = READ_ONCE(t->tcf_action);
 
 	switch (params->tcft_action) {
 	case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -182,7 +182,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 
 	params_old = rtnl_dereference(t->params);
 
-	params_new->action = parm->action;
+	t->tcf_action = parm->action;
 	params_new->tcft_action = parm->t_action;
 	params_new->tcft_enc_metadata = metadata;
 
@@ -254,13 +254,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		.index    = t->tcf_index,
 		.refcnt   = t->tcf_refcnt - ref,
 		.bindcnt  = t->tcf_bindcnt - bind,
+		.action   = t->tcf_action,
 	};
 	struct tcf_t tm;
 
 	params = rtnl_dereference(t->params);
 
 	opt.t_action = params->tcft_action;
-	opt.action = params->action;
 
 	if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 0c6bc6e531a6db36f49622f1f115770160f7afb0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:49:59 -0700
Subject: bpf: fix sk_skb programs without skb->dev assigned

Multiple BPF helpers in use by sk_skb programs calculate the max
skb length using the __bpf_skb_max_len function. However, this
calculates the max length using the skb->dev pointer which can be
NULL when an sk_skb program is paired with an sk_msg program.

To force this a sk_msg program needs to redirect into the ingress
path of a sock with an attach sk_skb program. Then the the sk_skb
program would need to call one of the helpers that adjust the skb
size.

To fix the null ptr dereference use SKB_MAX_ALLOC size if no dev
is available.

Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 0ca6907d7efe..3095f1ba7015 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2779,7 +2779,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
 
 static u32 __bpf_skb_max_len(const struct sk_buff *skb)
 {
-	return skb->dev->mtu + skb->dev->hard_header_len;
+	return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+			  SKB_MAX_ALLOC;
 }
 
 static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
-- 
cgit v1.2.3


From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 5 Jul 2018 08:50:15 -0700
Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb

In commit

  'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512)

we added the routine bpf_compute_data_end_sk_skb() to compute the
correct data_end values, but this has since been lost. In kernel
v4.14 this was correct and the above patch was applied in it
entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree
we lost the piece that renamed bpf_compute_data_pointers to the
new function bpf_compute_data_end_sk_skb. This was done here,

e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")

When it conflicted with the following rename patch,

6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")

Finally, after a refactor I thought even the function
bpf_compute_data_end_sk_skb() was no longer needed and it was
erroneously removed.

However, we never reverted the sk_skb_convert_ctx_access() usage of
tcp_skb_cb which had been committed and survived the merge conflict.
Here we fix this by adding back the helper and *_data_end_sk_skb()
usage. Using the bpf_skc_data_end mapping is not correct because it
expects a qdisc_skb_cb object but at the sock layer this is not the
case. Even though it happens to work here because we don't overwrite
any data in-use at the socket layer and the cb structure is cleared
later this has potential to create some subtle issues. But, even
more concretely the filter.c access check uses tcp_skb_cb.

And by some act of chance though,

struct bpf_skb_data_end {
        struct qdisc_skb_cb        qdisc_cb;             /*     0    28 */

        /* XXX 4 bytes hole, try to pack */

        void *                     data_meta;            /*    32     8 */
        void *                     data_end;             /*    40     8 */

        /* size: 48, cachelines: 1, members: 3 */
        /* sum members: 44, holes: 1, sum holes: 4 */
        /* last cacheline: 48 bytes */
};

and then tcp_skb_cb,

struct tcp_skb_cb {
	[...]
                struct {
                        __u32      flags;                /*    24     4 */
                        struct sock * sk_redir;          /*    32     8 */
                        void *     data_end;             /*    40     8 */
                } bpf;                                   /*          24 */
        };

So when we use offset_of() to track down the byte offset we get 40 in
either case and everything continues to work. Fix this mess and use
correct structures its unclear how long this might actually work for
until someone moves the structs around.

Reported-by: Martin KaFai Lau <kafai@fb.com>
Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")
Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h    |  4 +++
 kernel/bpf/sockmap.c |  4 +--
 net/core/filter.c    | 98 ++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 97 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 800582b5dd54..af3ec72d5d41 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -828,6 +828,10 @@ struct tcp_skb_cb {
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
 
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dfc8a8a07c1f..98fb7938beea 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	preempt_enable();
@@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_pointers(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/net/core/filter.c b/net/core/filter.c
index 3095f1ba7015..470268024a40 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+					   unsigned int write_len)
+{
+	int err = __bpf_try_make_writable(skb, write_len);
+
+	bpf_compute_data_end_sk_skb(skb);
+	return err;
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+	/* Idea is the following: should the needed direct read/write
+	 * test fail during runtime, we can pull in more data and redo
+	 * again, since implicitly, we invalidate previous checks here.
+	 *
+	 * Or, since we know how much we need to make read/writeable,
+	 * this can be done once at the program beginning for direct
+	 * access case. By this we overcome limitations of only current
+	 * headroom being accessible.
+	 */
+	return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto sk_skb_pull_data_proto = {
+	.func		= sk_skb_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
 	   u64, from, u64, to, u64, flags)
 {
@@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
 	return __skb_trim_rcsum(skb, new_len);
 }
 
-BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
-	   u64, flags)
+static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
+					u64 flags)
 {
 	u32 max_len = __bpf_skb_max_len(skb);
 	u32 min_len = __bpf_skb_min_len(skb);
@@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 		if (!ret && skb_is_gso(skb))
 			skb_gso_reset(skb);
 	}
+	return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+	   u64, flags)
+{
+	int ret = __bpf_skb_change_tail(skb, new_len, flags);
 
 	bpf_compute_data_pointers(skb);
 	return ret;
@@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 	   u64, flags)
+{
+	int ret = __bpf_skb_change_tail(skb, new_len, flags);
+
+	bpf_compute_data_end_sk_skb(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_tail_proto = {
+	.func		= sk_skb_change_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
+					u64 flags)
 {
 	u32 max_len = __bpf_skb_max_len(skb);
 	u32 new_len = skb->len + head_room;
@@ -2942,8 +2998,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
 		skb_reset_mac_header(skb);
 	}
 
+	return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+	   u64, flags)
+{
+	int ret = __bpf_skb_change_head(skb, head_room, flags);
+
 	bpf_compute_data_pointers(skb);
-	return 0;
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_skb_change_head_proto = {
@@ -2955,6 +3019,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
+	   u64, flags)
+{
+	int ret = __bpf_skb_change_head(skb, head_room, flags);
+
+	bpf_compute_data_end_sk_skb(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_head_proto = {
+	.func		= sk_skb_change_head,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
 static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
 {
 	return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -4618,9 +4699,12 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_skb_store_bytes ||
 	    func == bpf_skb_change_proto ||
 	    func == bpf_skb_change_head ||
+	    func == sk_skb_change_head ||
 	    func == bpf_skb_change_tail ||
+	    func == sk_skb_change_tail ||
 	    func == bpf_skb_adjust_room ||
 	    func == bpf_skb_pull_data ||
+	    func == sk_skb_pull_data ||
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
@@ -4872,11 +4956,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
 	case BPF_FUNC_skb_pull_data:
-		return &bpf_skb_pull_data_proto;
+		return &sk_skb_pull_data_proto;
 	case BPF_FUNC_skb_change_tail:
-		return &bpf_skb_change_tail_proto;
+		return &sk_skb_change_tail_proto;
 	case BPF_FUNC_skb_change_head:
-		return &bpf_skb_change_head_proto;
+		return &sk_skb_change_head_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
-- 
cgit v1.2.3


From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Fri, 6 Jul 2018 11:49:00 +0900
Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU

Otherwise we end up with attempting to send packets from down devices
or to send oversized packets, which may cause unexpected driver/device
behaviour. Generic XDP has already done this check, so reuse the logic
in native XDP.

Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function")
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 6 +++---
 kernel/bpf/devmap.c    | 7 ++++++-
 net/core/filter.c      | 9 +++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 300baad62c88..c73dd7396886 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
-static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb,
-					   struct net_device *fwd)
+static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
+				 unsigned int pktlen)
 {
 	unsigned int len;
 
@@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb,
 		return -ENETDOWN;
 
 	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len)
+	if (pktlen > len)
 		return -EMSGSIZE;
 
 	return 0;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 642c97f6d1b8..d361fc1e3bf3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
+	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit)
 		return -EOPNOTSUPP;
 
+	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+	if (unlikely(err))
+		return err;
+
 	xdpf = convert_to_xdp_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
@@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 {
 	int err;
 
-	err = __xdp_generic_ok_fwd_dev(skb, dst->dev);
+	err = xdp_ok_fwd_dev(dst->dev, skb->len);
 	if (unlikely(err))
 		return err;
 	skb->dev = dst->dev;
diff --git a/net/core/filter.c b/net/core/filter.c
index 470268024a40..5fa66a33927f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			u32 index)
 {
 	struct xdp_frame *xdpf;
-	int sent;
+	int err, sent;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
 	}
 
+	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+	if (unlikely(err))
+		return err;
+
 	xdpf = convert_to_xdp_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
@@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 		goto err;
 	}
 
-	if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+	err = xdp_ok_fwd_dev(fwd, skb->len);
+	if (unlikely(err))
 		goto err;
 
 	skb->dev = fwd;
-- 
cgit v1.2.3


From e7372197e15856ec4ee66b668020a662994db103 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 7 Jul 2018 16:15:26 -0700
Subject: net/ipv4: Set oif in fib_compute_spec_dst

Xin reported that icmp replies may not use the address on the device the
echo request is received if the destination address is broadcast. Instead
a route lookup is done without considering VRF context. Fix by setting
oif in flow struct to the master device if it is enslaved. That directs
the lookup to the VRF table. If the device is not enslaved, oif is still
0 so no affect.

Fixes: cd2fbe1b6b51 ("net: Use VRF device index for lookups on RX")
Reported-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..e46cdd310e5f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 		struct flowi4 fl4 = {
 			.flowi4_iif = LOOPBACK_IFINDEX,
+			.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 			.daddr = ip_hdr(skb)->saddr,
 			.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 			.flowi4_scope = scope,
-- 
cgit v1.2.3


From acc2cf4e37174646a24cba42fa53c668b2338d4e Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Sat, 7 Jul 2018 16:31:40 +0900
Subject: net: diag: Don't double-free TCP_NEW_SYN_RECV sockets in tcp_abort

When tcp_diag_destroy closes a TCP_NEW_SYN_RECV socket, it first
frees it by calling inet_csk_reqsk_queue_drop_and_and_put in
tcp_abort, and then frees it again by calling sock_gen_put.

Since tcp_abort only has one caller, and all the other codepaths
in tcp_abort don't free the socket, just remove the free in that
function.

Cc: David Ahern <dsa@cumulusnetworks.com>
Tested: passes Android sock_diag_test.py, which exercises this codepath
Fixes: d7226c7a4dd1 ("net: diag: Fix refcnt leak in error path destroying socket")
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e7b53d2a971f..c959bb6ea4ed 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3720,8 +3720,7 @@ int tcp_abort(struct sock *sk, int err)
 			struct request_sock *req = inet_reqsk(sk);
 
 			local_bh_disable();
-			inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
-							  req);
+			inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 			local_bh_enable();
 			return 0;
 		}
-- 
cgit v1.2.3


From f6f2a4a2eb92bc73671204198bb2f8ab53ff59fb Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 6 Jul 2018 12:30:20 +0200
Subject: ipfrag: really prevent allocation on netns exit

Setting the low threshold to 0 has no effect on frags allocation,
we need to clear high_thresh instead.

The code was pre-existent to commit 648700f76b03 ("inet: frags:
use rhashtables for reassembly units"), but before the above,
such assignment had a different role: prevent concurrent eviction
from the worker and the netns cleanup helper.

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..1e4cf3ab560f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -90,7 +90,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 
 void inet_frags_exit_net(struct netns_frags *nf)
 {
-	nf->low_thresh = 0; /* prevent creation of new frags */
+	nf->high_thresh = 0; /* prevent creation of new frags */
 
 	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
-- 
cgit v1.2.3


From 6508b6781be076f889e3077a1a5fadf1930a569d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 7 Jul 2018 23:00:01 -0700
Subject: tcp: cleanup copied_seq and urg_data in tcp_disconnect

tcp_zerocopy_receive() relies on tcp_inq() to limit number of bytes
requested by user.

syzbot found that after tcp_disconnect(), tcp_inq() was returning
a stale value (number of bytes in queue before the disconnect).

Note that after this patch, ioctl(fd, SIOCINQ, &val) is also fixed
and returns 0, so this might be a candidate for all known linux kernels.

While we are at this, we probably also should clear urg_data to
avoid other syzkaller reports after it discovers how to deal with
urgent data.

syzkaller repro :

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("224.0.0.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
send(3, ..., 4096, 0) = 4096
connect(3, {sa_family=AF_UNSPEC, sa_data="\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 128) = 0
getsockopt(3, SOL_TCP, TCP_ZEROCOPY_RECEIVE, ..., [16]) = 0 // CRASH

Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c959bb6ea4ed..0d43705dd001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2562,6 +2562,8 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	tp->copied_seq = tp->rcv_nxt;
+	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
 	tcp_fastopen_active_disable_ofo_check(sk);
 	skb_rbtree_purge(&tp->out_of_order_queue);
-- 
cgit v1.2.3


From 5cf3006cc81d9aa09a10aa781fc065546b12919d Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Sun, 8 Jul 2018 09:57:22 +0000
Subject: nl80211: Add a missing break in parse_station_flags

I was looking at usually suppressed gcc warnings,
[-Wimplicit-fallthrough=] in this case:

The code definitely looks like a break is missing here.
However I am not able to test the NL80211_IFTYPE_MESH_POINT,
nor do I actually know what might be :)
So please use this patch with caution and only if you are
able to do some testing.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
[johannes: looks obvious enough to apply as is, interesting
 though that it never seems to have been a problem]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b6c700572755..80bc986c79e5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4409,6 +4409,7 @@ static int parse_station_flags(struct genl_info *info,
 		params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
 					 BIT(NL80211_STA_FLAG_MFP) |
 					 BIT(NL80211_STA_FLAG_AUTHORIZED);
+		break;
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 21d5e078192d244df3d6049f9464fff2f72cfd68 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 6 Jul 2018 20:06:05 +0200
Subject: netfilter: nft_compat: explicitly reject ERROR and standard target

iptables-nft never requests these, but make this explicitly illegal.
If it were quested, kernel could oops as ->eval is NULL, furthermore,
the builtin targets have no owning module so its possible to rmmod
eb/ip/ip6_tables module even if they would be loaded.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8d1ff654e5af..32535eea51b2 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -832,10 +832,18 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
 	family = ctx->family;
 
+	if (strcmp(tg_name, XT_ERROR_TARGET) == 0 ||
+	    strcmp(tg_name, XT_STANDARD_TARGET) == 0 ||
+	    strcmp(tg_name, "standard") == 0)
+		return ERR_PTR(-EINVAL);
+
 	/* Re-use the existing target if it's already loaded. */
 	list_for_each_entry(nft_target, &nft_target_list, head) {
 		struct xt_target *target = nft_target->ops.data;
 
+		if (!target->target)
+			continue;
+
 		if (nft_target_cmp(target, tg_name, rev, family))
 			return &nft_target->ops;
 	}
@@ -844,6 +852,11 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	if (IS_ERR(target))
 		return ERR_PTR(-ENOENT);
 
+	if (!target->target) {
+		err = -EINVAL;
+		goto err;
+	}
+
 	if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) {
 		err = -EINVAL;
 		goto err;
-- 
cgit v1.2.3


From 2045cdfa1b40d66f126f3fd05604fc7c754f0022 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 6 Jul 2018 16:38:53 +0300
Subject: netfilter: nf_conntrack: Fix possible possible crash on module
 loading.

Loading the nf_conntrack module with doubled hashsize parameter, i.e.
	  modprobe nf_conntrack hashsize=12345 hashsize=12345
causes NULL-ptr deref.

If 'hashsize' specified twice, the nf_conntrack_set_hashsize() function
will be called also twice.
The first nf_conntrack_set_hashsize() call will set the
'nf_conntrack_htable_size' variable:

	nf_conntrack_set_hashsize()
		...
		/* On boot, we can set this without any fancy locking. */
		if (!nf_conntrack_htable_size)
			return param_set_uint(val, kp);

But on the second invocation, the nf_conntrack_htable_size is already set,
so the nf_conntrack_set_hashsize() will take a different path and call
the nf_conntrack_hash_resize() function. Which will crash on the attempt
to dereference 'nf_conntrack_hash' pointer:

	BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
	RIP: 0010:nf_conntrack_hash_resize+0x255/0x490 [nf_conntrack]
	Call Trace:
	 nf_conntrack_set_hashsize+0xcd/0x100 [nf_conntrack]
	 parse_args+0x1f9/0x5a0
	 load_module+0x1281/0x1a50
	 __se_sys_finit_module+0xbe/0xf0
	 do_syscall_64+0x7c/0x390
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fix this, by checking !nf_conntrack_hash instead of
!nf_conntrack_htable_size. nf_conntrack_hash will be initialized only
after the module loaded, so the second invocation of the
nf_conntrack_set_hashsize() won't crash, it will just reinitialize
nf_conntrack_htable_size again.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3465da2a98bd..3d5280425027 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2043,7 +2043,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
 		return -EOPNOTSUPP;
 
 	/* On boot, we can set this without any fancy locking. */
-	if (!nf_conntrack_htable_size)
+	if (!nf_conntrack_hash)
 		return param_set_uint(val, kp);
 
 	rc = kstrtouint(val, 0, &hashsize);
-- 
cgit v1.2.3


From 84379c9afe011020e797e3f50a662b08a6355dcf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 9 Jul 2018 13:43:38 +0200
Subject: netfilter: ipv6: nf_defrag: drop skb dst before queueing

Eric Dumazet reports:
 Here is a reproducer of an annoying bug detected by syzkaller on our production kernel
 [..]
 ./b78305423 enable_conntrack
 Then :
 sleep 60
 dmesg | tail -10
 [  171.599093] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  181.631024] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  191.687076] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  201.703037] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  211.711072] unregister_netdevice: waiting for lo to become free. Usage count = 2
 [  221.959070] unregister_netdevice: waiting for lo to become free. Usage count = 2

Reproducer sends ipv6 fragment that hits nfct defrag via LOCAL_OUT hook.
skb gets queued until frag timer expiry -- 1 minute.

Normally nf_conntrack_reasm gets called during prerouting, so skb has
no dst yet which might explain why this wasn't spotted earlier.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: John Sperbeck <jsperbeck@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Tested-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index a452d99c9f52..e4d9e6976d3c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -585,6 +585,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	    fq->q.meat == fq->q.len &&
 	    nf_ct_frag6_reasm(fq, skb, dev))
 		ret = 0;
+	else
+		skb_dst_drop(skb);
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-- 
cgit v1.2.3


From 59ee4129a279070d8e2f9dc1660330f6593c7808 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 10 Jul 2018 00:43:22 +0200
Subject: bpf: fix ldx in ld_abs rewrite for large offsets

Mark reported that syzkaller triggered a KASAN detected slab-out-of-bounds
bug in ___bpf_prog_run() with a BPF_LD | BPF_ABS word load at offset 0x8001.
After further investigation it became clear that the issue was the
BPF_LDX_MEM() which takes offset as an argument whereas it cannot encode
larger than S16_MAX offsets into it. For this synthetical case we need to
move the full address into tmp register instead and do the LDX without
immediate value.

Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
Reported-by: syzbot <syzkaller@googlegroups.com>
Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 5fa66a33927f..a13f5b1f1636 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
 	     (!unaligned_ok && offset >= 0 &&
 	      offset + ip_align >= 0 &&
 	      offset + ip_align % size == 0))) {
+		bool ldx_off_ok = offset <= S16_MAX;
+
 		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
 		*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
-		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
-		*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
-				      offset);
+		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+				      size, 2 + endian + (!ldx_off_ok * 2));
+		if (ldx_off_ok) {
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+					      BPF_REG_D, offset);
+		} else {
+			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+					      BPF_REG_TMP, 0);
+		}
 		if (endian)
 			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
 		*insn++ = BPF_JMP_A(8);
-- 
cgit v1.2.3


From 61d769807f273fda962866f3d4c677cda9974d3c Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Tue, 10 Jul 2018 16:54:02 +0000
Subject: bpf: fix availability probing for seg6 helpers

bpf_lwt_seg6_* helpers require CONFIG_IPV6_SEG6_BPF, and currently
return -EOPNOTSUPP to indicate unavailability. This patch forces the
BPF verifier to reject programs using these helpers when
!CONFIG_IPV6_SEG6_BPF, allowing users to more easily probe if they are
available or not.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index a13f5b1f1636..06da770f543f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4536,10 +4536,10 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
 	.arg4_type	= ARG_CONST_SIZE
 };
 
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
 	   const void *, from, u32, len)
 {
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	struct seg6_bpf_srh_state *srh_state =
 		this_cpu_ptr(&seg6_bpf_srh_states);
 	void *srh_tlvs, *srh_end, *ptr;
@@ -4565,9 +4565,6 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
 
 	memcpy(skb->data + offset, from, len);
 	return 0;
-#else /* CONFIG_IPV6_SEG6_BPF */
-	return -EOPNOTSUPP;
-#endif
 }
 
 static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
@@ -4583,7 +4580,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
 BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
 	   u32, action, void *, param, u32, param_len)
 {
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	struct seg6_bpf_srh_state *srh_state =
 		this_cpu_ptr(&seg6_bpf_srh_states);
 	struct ipv6_sr_hdr *srh;
@@ -4631,9 +4627,6 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
 	default:
 		return -EINVAL;
 	}
-#else /* CONFIG_IPV6_SEG6_BPF */
-	return -EOPNOTSUPP;
-#endif
 }
 
 static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
@@ -4649,7 +4642,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
 BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
 	   s32, len)
 {
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	struct seg6_bpf_srh_state *srh_state =
 		this_cpu_ptr(&seg6_bpf_srh_states);
 	void *srh_end, *srh_tlvs, *ptr;
@@ -4693,9 +4685,6 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
 	srh_state->hdrlen += len;
 	srh_state->valid = 0;
 	return 0;
-#else /* CONFIG_IPV6_SEG6_BPF */
-	return -EOPNOTSUPP;
-#endif
 }
 
 static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
@@ -4706,6 +4695,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 	.arg2_type	= ARG_ANYTHING,
 	.arg3_type	= ARG_ANYTHING,
 };
+#endif /* CONFIG_IPV6_SEG6_BPF */
 
 bool bpf_helper_changes_pkt_data(void *func)
 {
@@ -4727,11 +4717,12 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
 	    func == bpf_xdp_adjust_tail ||
-	    func == bpf_lwt_push_encap ||
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
 	    func == bpf_lwt_seg6_adjust_srh ||
-	    func == bpf_lwt_seg6_action
-	    )
+	    func == bpf_lwt_seg6_action ||
+#endif
+	    func == bpf_lwt_push_encap)
 		return true;
 
 	return false;
@@ -5066,12 +5057,14 @@ static const struct bpf_func_proto *
 lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	case BPF_FUNC_lwt_seg6_store_bytes:
 		return &bpf_lwt_seg6_store_bytes_proto;
 	case BPF_FUNC_lwt_seg6_action:
 		return &bpf_lwt_seg6_action_proto;
 	case BPF_FUNC_lwt_seg6_adjust_srh:
 		return &bpf_lwt_seg6_adjust_srh_proto;
+#endif
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
cgit v1.2.3


From 6e6fddc78323533be570873abb728b7e0ba7e024 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 11 Jul 2018 15:30:14 +0200
Subject: bpf: fix panic due to oob in bpf_prog_test_run_skb

sykzaller triggered several panics similar to the below:

  [...]
  [  248.851531] BUG: KASAN: use-after-free in _copy_to_user+0x5c/0x90
  [  248.857656] Read of size 985 at addr ffff8808017ffff2 by task a.out/1425
  [...]
  [  248.865902] CPU: 1 PID: 1425 Comm: a.out Not tainted 4.18.0-rc4+ #13
  [  248.865903] Hardware name: Supermicro SYS-5039MS-H12TRF/X11SSE-F, BIOS 2.1a 03/08/2018
  [  248.865905] Call Trace:
  [  248.865910]  dump_stack+0xd6/0x185
  [  248.865911]  ? show_regs_print_info+0xb/0xb
  [  248.865913]  ? printk+0x9c/0xc3
  [  248.865915]  ? kmsg_dump_rewind_nolock+0xe4/0xe4
  [  248.865919]  print_address_description+0x6f/0x270
  [  248.865920]  kasan_report+0x25b/0x380
  [  248.865922]  ? _copy_to_user+0x5c/0x90
  [  248.865924]  check_memory_region+0x137/0x190
  [  248.865925]  kasan_check_read+0x11/0x20
  [  248.865927]  _copy_to_user+0x5c/0x90
  [  248.865930]  bpf_test_finish.isra.8+0x4f/0xc0
  [  248.865932]  bpf_prog_test_run_skb+0x6a0/0xba0
  [...]

After scrubbing the BPF prog a bit from the noise, turns out it called
bpf_skb_change_head() for the lwt_xmit prog with headroom of 2. Nothing
wrong in that, however, this was run with repeat >> 0 in bpf_prog_test_run_skb()
and the same skb thus keeps changing until the pskb_expand_head() called
from skb_cow() keeps bailing out in atomic alloc context with -ENOMEM.
So upon return we'll basically have 0 headroom left yet blindly do the
__skb_push() of 14 bytes and keep copying data from there in bpf_test_finish()
out of bounds. Fix to check if we have enough headroom and if pskb_expand_head()
fails, bail out with error.

Another bug independent of this fix (but related in triggering above) is
that BPF_PROG_TEST_RUN should be reworked to reset the skb/xdp buffer to
it's original state from input as otherwise repeating the same test in a
loop won't work for benchmarking when underlying input buffer is getting
changed by the prog each time and reused for the next run leading to
unexpected results.

Fixes: 1cf1cae963c2 ("bpf: introduce BPF_PROG_TEST_RUN command")
Reported-by: syzbot+709412e651e55ed96498@syzkaller.appspotmail.com
Reported-by: syzbot+54f39d6ab58f39720a55@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/bpf/test_run.c                          | 17 ++++++++++++++---
 tools/testing/selftests/bpf/test_verifier.c | 23 ++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 68c3578343b4..22a78eedf4b1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -96,6 +96,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	u32 size = kattr->test.data_size_in;
 	u32 repeat = kattr->test.repeat;
 	u32 retval, duration;
+	int hh_len = ETH_HLEN;
 	struct sk_buff *skb;
 	void *data;
 	int ret;
@@ -131,12 +132,22 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	skb_reset_network_header(skb);
 
 	if (is_l2)
-		__skb_push(skb, ETH_HLEN);
+		__skb_push(skb, hh_len);
 	if (is_direct_pkt_access)
 		bpf_compute_data_pointers(skb);
 	retval = bpf_test_run(prog, skb, repeat, &duration);
-	if (!is_l2)
-		__skb_push(skb, ETH_HLEN);
+	if (!is_l2) {
+		if (skb_headroom(skb) < hh_len) {
+			int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+			if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
+				kfree_skb(skb);
+				return -ENOMEM;
+			}
+		}
+		memset(__skb_push(skb, hh_len), 0, hh_len);
+	}
+
 	size = skb->len;
 	/* bpf program can never convert linear skb to non-linear */
 	if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 2ecd27b670d7..f5f7bcc96046 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -4974,6 +4974,24 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_LWT_XMIT,
 	},
+	{
+		"make headroom for LWT_XMIT",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_IMM(BPF_REG_2, 34),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_skb_change_head),
+			/* split for s390 to succeed */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, 42),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_skb_change_head),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_LWT_XMIT,
+	},
 	{
 		"invalid access of tc_classid for LWT_IN",
 		.insns = {
@@ -12554,8 +12572,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	}
 
 	if (fd_prog >= 0) {
+		__u8 tmp[TEST_DATA_LEN << 2];
+		__u32 size_tmp = sizeof(tmp);
+
 		err = bpf_prog_test_run(fd_prog, 1, test->data,
-					sizeof(test->data), NULL, NULL,
+					sizeof(test->data), tmp, &size_tmp,
 					&retval, NULL);
 		if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) {
 			printf("Unexpected bpf_prog_test_run error\n");
-- 
cgit v1.2.3


From 83fe6b8709f65bc505b10235bd82ece12c4c5099 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Tue, 10 Jul 2018 14:22:27 -0700
Subject: sch_fq_codel: zero q->flows_cnt when fq_codel_init fails

When fq_codel_init fails, qdisc_create_dflt will cleanup by using
qdisc_destroy. This function calls the ->reset() op prior to calling the
->destroy() op.

Unfortunately, during the failure flow for sch_fq_codel, the ->flows
parameter is not initialized, so the fq_codel_reset function will null
pointer dereference.

   kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
   kernel: IP: fq_codel_reset+0x58/0xd0 [sch_fq_codel]
   kernel: PGD 0 P4D 0
   kernel: Oops: 0000 [#1] SMP PTI
   kernel: Modules linked in: i40iw i40e(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack tun bridge stp llc devlink ebtable_filter ebtables ip6table_filter ip6_tables rpcrdma ib_isert iscsi_target_mod sunrpc ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt iTCO_vendor_support intel_uncore ib_core intel_rapl_perf mei_me mei joydev i2c_i801 lpc_ich ioatdma shpchp wmi sch_fq_codel xfs libcrc32c mgag200 ixgbe drm_kms_helper isci ttm firewire_ohci
   kernel:  mdio drm igb libsas crc32c_intel firewire_core ptp pps_core scsi_transport_sas crc_itu_t dca i2c_algo_bit ipmi_si ipmi_devintf ipmi_msghandler [last unloaded: i40e]
   kernel: CPU: 10 PID: 4219 Comm: ip Tainted: G           OE    4.16.13custom-fq-codel-test+ #3
   kernel: Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.05.0004.051120151007 05/11/2015
   kernel: RIP: 0010:fq_codel_reset+0x58/0xd0 [sch_fq_codel]
   kernel: RSP: 0018:ffffbfbf4c1fb620 EFLAGS: 00010246
   kernel: RAX: 0000000000000400 RBX: 0000000000000000 RCX: 00000000000005b9
   kernel: RDX: 0000000000000000 RSI: ffff9d03264a60c0 RDI: ffff9cfd17b31c00
   kernel: RBP: 0000000000000001 R08: 00000000000260c0 R09: ffffffffb679c3e9
   kernel: R10: fffff1dab06a0e80 R11: ffff9cfd163af800 R12: ffff9cfd17b31c00
   kernel: R13: 0000000000000001 R14: ffff9cfd153de600 R15: 0000000000000001
   kernel: FS:  00007fdec2f92800(0000) GS:ffff9d0326480000(0000) knlGS:0000000000000000
   kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
   kernel: CR2: 0000000000000008 CR3: 0000000c1956a006 CR4: 00000000000606e0
   kernel: Call Trace:
   kernel:  qdisc_destroy+0x56/0x140
   kernel:  qdisc_create_dflt+0x8b/0xb0
   kernel:  mq_init+0xc1/0xf0
   kernel:  qdisc_create_dflt+0x5a/0xb0
   kernel:  dev_activate+0x205/0x230
   kernel:  __dev_open+0xf5/0x160
   kernel:  __dev_change_flags+0x1a3/0x210
   kernel:  dev_change_flags+0x21/0x60
   kernel:  do_setlink+0x660/0xdf0
   kernel:  ? down_trylock+0x25/0x30
   kernel:  ? xfs_buf_trylock+0x1a/0xd0 [xfs]
   kernel:  ? rtnl_newlink+0x816/0x990
   kernel:  ? _xfs_buf_find+0x327/0x580 [xfs]
   kernel:  ? _cond_resched+0x15/0x30
   kernel:  ? kmem_cache_alloc+0x20/0x1b0
   kernel:  ? rtnetlink_rcv_msg+0x200/0x2f0
   kernel:  ? rtnl_calcit.isra.30+0x100/0x100
   kernel:  ? netlink_rcv_skb+0x4c/0x120
   kernel:  ? netlink_unicast+0x19e/0x260
   kernel:  ? netlink_sendmsg+0x1ff/0x3c0
   kernel:  ? sock_sendmsg+0x36/0x40
   kernel:  ? ___sys_sendmsg+0x295/0x2f0
   kernel:  ? ebitmap_cmp+0x6d/0x90
   kernel:  ? dev_get_by_name_rcu+0x73/0x90
   kernel:  ? skb_dequeue+0x52/0x60
   kernel:  ? __inode_wait_for_writeback+0x7f/0xf0
   kernel:  ? bit_waitqueue+0x30/0x30
   kernel:  ? fsnotify_grab_connector+0x3c/0x60
   kernel:  ? __sys_sendmsg+0x51/0x90
   kernel:  ? do_syscall_64+0x74/0x180
   kernel:  ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2
   kernel: Code: 00 00 48 89 87 00 02 00 00 8b 87 a0 01 00 00 85 c0 0f 84 84 00 00 00 31 ed 48 63 dd 83 c5 01 48 c1 e3 06 49 03 9c 24 90 01 00 00 <48> 8b 73 08 48 8b 3b e8 6c 9a 4f f6 48 8d 43 10 48 c7 03 00 00
   kernel: RIP: fq_codel_reset+0x58/0xd0 [sch_fq_codel] RSP: ffffbfbf4c1fb620
   kernel: CR2: 0000000000000008
   kernel: ---[ end trace e81a62bede66274e ]---

This is caused because flows_cnt is non-zero, but flows hasn't been
initialized. fq_codel_init has left the private data in a partially
initialized state.

To fix this, reset flows_cnt to 0 when we fail to initialize.
Additionally, to make the state more consistent, also cleanup the flows
pointer when the allocation of backlogs fails.

This fixes the NULL pointer dereference, since both the for-loop and
memset in fq_codel_reset will be no-ops when flow_cnt is zero.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq_codel.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index cd2e0e342fb6..6c0a9d5dbf94 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -479,24 +479,28 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
 	q->cparams.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
-		int err = fq_codel_change(sch, opt, extack);
+		err = fq_codel_change(sch, opt, extack);
 		if (err)
-			return err;
+			goto init_failure;
 	}
 
 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
 	if (err)
-		return err;
+		goto init_failure;
 
 	if (!q->flows) {
 		q->flows = kvcalloc(q->flows_cnt,
 				    sizeof(struct fq_codel_flow),
 				    GFP_KERNEL);
-		if (!q->flows)
-			return -ENOMEM;
+		if (!q->flows) {
+			err = -ENOMEM;
+			goto init_failure;
+		}
 		q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL);
-		if (!q->backlogs)
-			return -ENOMEM;
+		if (!q->backlogs) {
+			err = -ENOMEM;
+			goto alloc_failure;
+		}
 		for (i = 0; i < q->flows_cnt; i++) {
 			struct fq_codel_flow *flow = q->flows + i;
 
@@ -509,6 +513,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
 	else
 		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
+
+alloc_failure:
+	kvfree(q->flows);
+	q->flows = NULL;
+init_failure:
+	q->flows_cnt = 0;
+	return err;
 }
 
 static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From 21684dc46c598e477707487c009f9773f7c0382d Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Tue, 10 Jul 2018 17:25:20 -0400
Subject: tcp: fix sequence numbers for repaired sockets re-using TIME-WAIT
 sockets

This patch fixes a bug where the sequence numbers of a socket created using
TCP repair functionality are lower than set after connect is called.
This occurs when the repair socket overlaps with a TIME-WAIT socket and
triggers the re-use code. The amount lower is equal to the number of times
that a particular IP/port set is re-used and then put back into TIME-WAIT.
Re-using the first time the sequence number is 1 lower, closing that socket
and then re-opening (with repair) a new socket with the same addresses/ports
puts the sequence number 2 lower than set via setsockopt. The third time is
3 lower, etc. I have not tested what the limit of this acrewal is, if any.

The fix is, if a socket is in repair mode, to respect the already set
sequence number and timestamp when it would have already re-used the
TIME-WAIT socket.

Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bea17f1e8302..3b2711e33e4c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -156,11 +156,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
 	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
-		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-		if (tp->write_seq == 0)
-			tp->write_seq = 1;
-		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		/* In case of repair and re-using TIME-WAIT sockets we still
+		 * want to be sure that it is safe as above but honor the
+		 * sequence numbers and time stamps set as part of the repair
+		 * process.
+		 *
+		 * Without this check re-using a TIME-WAIT socket with TCP
+		 * repair would accumulate a -1 on the repair assigned
+		 * sequence number. The first time it is reused the sequence
+		 * is -1, the second time -2, etc. This fixes that issue
+		 * without appearing to create any others.
+		 */
+		if (likely(!tp->repair)) {
+			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+			if (tp->write_seq == 0)
+				tp->write_seq = 1;
+			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		}
 		sock_hold(sktw);
 		return 1;
 	}
-- 
cgit v1.2.3


From 70b7ff130224d2d22a158c7f4aa5e7fb1c95949d Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Tue, 10 Jul 2018 17:31:10 -0400
Subject: tcp: allow user to create repair socket without window probes

Under rare conditions where repair code may be used it is possible that
window probes are either unnecessary or undesired. If the user knows that
window probes are not wanted or needed this change allows them to skip
sending them when a socket comes out of repair.

Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d43705dd001..8e5e2ca9ab1b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2823,14 +2823,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_REPAIR:
 		if (!tcp_can_repair_sock(sk))
 			err = -EPERM;
-		else if (val == 1) {
-			tp->repair = 1;
+		/* 1 for normal repair, 2 for no window probes */
+		else if (val == 1 || val == 2) {
+			tp->repair = val;
 			sk->sk_reuse = SK_FORCE_REUSE;
 			tp->repair_queue = TCP_NO_QUEUE;
 		} else if (val == 0) {
 			tp->repair = 0;
 			sk->sk_reuse = SK_NO_REUSE;
-			tcp_send_window_probe(sk);
+			if (tp->repair == 1)
+				tcp_send_window_probe(sk);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From 8b7008620b8452728cadead460a36f64ed78c460 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Wed, 11 Jul 2018 14:39:42 +0200
Subject: net: Don't copy pfmemalloc flag in __copy_skb_header()

The pfmemalloc flag indicates that the skb was allocated from
the PFMEMALLOC reserves, and the flag is currently copied on skb
copy and clone.

However, an skb copied from an skb flagged with pfmemalloc
wasn't necessarily allocated from PFMEMALLOC reserves, and on
the other hand an skb allocated that way might be copied from an
skb that wasn't.

So we should not copy the flag on skb copy, and rather decide
whether to allow an skb to be associated with sockets unrelated
to page reclaim depending only on how it was allocated.

Move the pfmemalloc flag before headers_start[0] using an
existing 1-bit hole, so that __copy_skb_header() doesn't copy
it.

When cloning, we'll now take care of this flag explicitly,
contravening to the warning comment of __skb_clone().

While at it, restore the newline usage introduced by commit
b19372273164 ("net: reorganize sk_buff for faster
__copy_skb_header()") to visually separate bytes used in
bitfields after headers_start[0], that was gone after commit
a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage
area"), and describe the pfmemalloc flag in the kernel-doc
structure comment.

This doesn't change the size of sk_buff or cacheline boundaries,
but consolidates the 15 bits hole before tc_index into a 2 bytes
hole before csum, that could now be filled more easily.

Reported-by: Patrick Talbert <ptalbert@redhat.com>
Fixes: c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 10 +++++-----
 net/core/skbuff.c      |  2 ++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 164cdedf6012..610a201126ee 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
+ *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -735,7 +736,7 @@ struct sk_buff {
 				peeked:1,
 				head_frag:1,
 				xmit_more:1,
-				__unused:1; /* one bit hole */
+				pfmemalloc:1;
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
@@ -754,31 +755,30 @@ struct sk_buff {
 
 	__u8			__pkt_type_offset[0];
 	__u8			pkt_type:3;
-	__u8			pfmemalloc:1;
 	__u8			ignore_df:1;
-
 	__u8			nf_trace:1;
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;
+
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
-
 	__u8			no_fcs:1;
 	/* Indicates the inner headers are valid in the skbuff. */
 	__u8			encapsulation:1;
 	__u8			encap_hdr_csum:1;
 	__u8			csum_valid:1;
+
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
 	__u8			csum_not_inet:1;
-
 	__u8			dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
 	__u8			ipvs_property:1;
+
 	__u8			inner_protocol_type:1;
 	__u8			remcsum_offload:1;
 #ifdef CONFIG_NET_SWITCHDEV
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index eba8dae22c25..4df3164bb5fc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -858,6 +858,8 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	n->cloned = 1;
 	n->nohdr = 0;
 	n->peeked = 0;
+	if (skb->pfmemalloc)
+		n->pfmemalloc = 1;
 	n->destructor = NULL;
 	C(tail);
 	C(end);
-- 
cgit v1.2.3


From bab2c80e5a6c855657482eac9e97f5f3eedb509a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 11 Jul 2018 12:00:44 -0400
Subject: nsh: set mac len based on inner packet

When pulling the NSH header in nsh_gso_segment, set the mac length
based on the encapsulated packet type.

skb_reset_mac_len computes an offset to the network header, which
here still points to the outer packet:

  >     skb_reset_network_header(skb);
  >     [...]
  >     __skb_pull(skb, nsh_len);
  >     skb_reset_mac_header(skb);    // now mac hdr starts nsh_len == 8B after net hdr
  >     skb_reset_mac_len(skb);       // mac len = net hdr - mac hdr == (u16) -8 == 65528
  >     [..]
  >     skb_mac_gso_segment(skb, ..)

Link: http://lkml.kernel.org/r/CAF=yD-KeAcTSOn4AxirAxL8m7QAS8GBBe1w09eziYwvPbbUeYA@mail.gmail.com
Reported-by: syzbot+7b9ed9872dab8c32305d@syzkaller.appspotmail.com
Fixes: c411ed854584 ("nsh: add GSO support")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/nsh/nsh.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 9696ef96b719..1a30e165eeb4 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -104,7 +104,7 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
 	__skb_pull(skb, nsh_len);
 
 	skb_reset_mac_header(skb);
-	skb_reset_mac_len(skb);
+	skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0;
 	skb->protocol = proto;
 
 	features &= NETIF_F_SG;
-- 
cgit v1.2.3


From 993675a3100b16a4c80dfd70cbcde8ea7127b31d Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 11 Jul 2018 12:00:45 -0400
Subject: packet: reset network header if packet shorter than ll reserved space

If variable length link layer headers result in a packet shorter
than dev->hard_header_len, reset the network header offset. Else
skb->mac_len may exceed skb->len after skb_mac_reset_len.

packet_sendmsg_spkt already has similar logic.

Fixes: b84bbaf7a6c8 ("packet: in packet_snd start writing at link layer allocation")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 57634bc3da74..9b27d0cd766d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2878,6 +2878,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 			goto out_free;
 	} else if (reserve) {
 		skb_reserve(skb, -reserve);
+		if (len < reserve)
+			skb_reset_network_header(skb);
 	}
 
 	/* Returns -EFAULT on error */
-- 
cgit v1.2.3


From 509d7648135f914a3dd64c17484b33df5dd0a19c Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jul 2018 10:12:49 +0200
Subject: xsk: do not return ENXIO from TX copy mode

This patch removes the ENXIO return code from TX copy-mode when
someone has forcefully changed the number of queues on the device so
that the queue bound to the socket is no longer available. Just
silently stop sending anything as in zero-copy mode so the error
reporting gets consistent between the two modes.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7d220cbd09b6..08d09115093e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -244,10 +244,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}
 
-		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
-			err = -ENXIO;
+		if (xs->queue_id >= xs->dev->real_num_tx_queues)
 			goto out;
-		}
 
 		skb = sock_alloc_send_skb(sk, len, 1, &err);
 		if (unlikely(!skb)) {
-- 
cgit v1.2.3


From 9684f5e7c8cdf076aeec81344d4893a30f7aa6a1 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jul 2018 10:12:50 +0200
Subject: xsk: do not return EAGAIN from sendmsg when completion queue is full

This patch stops returning EAGAIN in TX copy mode when the completion
queue is full as zero-copy does not do this. Instead this situation
can be detected by comparing the head and tail pointers of the
completion queue in both modes. In any case, EAGAIN was not the
correct error code here since no amount of calling sendmsg will solve
the problem. Only consuming one or more messages on the completion
queue will fix this.

With this patch, the error reporting becomes consistent between copy
mode and zero-copy mode.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 08d09115093e..87567232d0f8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -233,10 +233,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}
 
-		if (xskq_reserve_addr(xs->umem->cq)) {
-			err = -EAGAIN;
+		if (xskq_reserve_addr(xs->umem->cq))
 			goto out;
-		}
 
 		len = desc.len;
 		if (unlikely(len > xs->dev->mtu)) {
-- 
cgit v1.2.3


From 6efb4436f7fcc50cc3fb9a113d0f16e3968172b1 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jul 2018 10:12:51 +0200
Subject: xsk: always return ENOBUFS from sendmsg if there is no TX queue

This patch makes sure ENOBUFS is always returned from sendmsg if there
is no TX queue configured. This was not the case for zero-copy
mode. With this patch this error reporting is consistent between copy
mode and zero-copy mode.

Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 87567232d0f8..9c784307f7b0 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -218,9 +218,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 	struct sk_buff *skb;
 	int err = 0;
 
-	if (unlikely(!xs->tx))
-		return -ENOBUFS;
-
 	mutex_lock(&xs->mutex);
 
 	while (xskq_peek_desc(xs->tx, &desc)) {
@@ -296,6 +293,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
 		return -ENETDOWN;
+	if (unlikely(!xs->tx))
+		return -ENOBUFS;
 	if (need_wait)
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3


From 09210c4bcc065d9d91ef3c051902ad18252cd3c0 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 11 Jul 2018 10:12:52 +0200
Subject: xsk: do not return EMSGSIZE in copy mode for packets larger than MTU

This patch stops returning EMSGSIZE from sendmsg in copy mode when the
size of the packet is larger than the MTU. Just send it to the device
so that it will drop it as in zero-copy mode. This makes the error
reporting consistent between copy mode and zero-copy mode.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c784307f7b0..72335c2e8108 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -233,15 +233,10 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		if (xskq_reserve_addr(xs->umem->cq))
 			goto out;
 
-		len = desc.len;
-		if (unlikely(len > xs->dev->mtu)) {
-			err = -EMSGSIZE;
-			goto out;
-		}
-
 		if (xs->queue_id >= xs->dev->real_num_tx_queues)
 			goto out;
 
+		len = desc.len;
 		skb = sock_alloc_send_skb(sk, len, 1, &err);
 		if (unlikely(!skb)) {
 			err = -EAGAIN;
-- 
cgit v1.2.3


From e78bfb0751d4e312699106ba7efbed2bab1a53ca Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 13 Jul 2018 13:21:07 +0200
Subject: skbuff: Unconditionally copy pfmemalloc in __skb_clone()

Commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in
__copy_skb_header()") introduced a different handling for the
pfmemalloc flag in copy and clone paths.

In __skb_clone(), now, the flag is set only if it was set in the
original skb, but not cleared if it wasn't. This is wrong and
might lead to socket buffers being flagged with pfmemalloc even
if the skb data wasn't allocated from pfmemalloc reserves. Copy
the flag instead of ORing it.

Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Fixes: 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Tested-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4df3164bb5fc..8e51f8555e11 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -858,8 +858,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	n->cloned = 1;
 	n->nohdr = 0;
 	n->peeked = 0;
-	if (skb->pfmemalloc)
-		n->pfmemalloc = 1;
+	C(pfmemalloc);
 	n->destructor = NULL;
 	C(tail);
 	C(end);
-- 
cgit v1.2.3


From b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jul 2018 06:04:52 -0700
Subject: tcp: fix dctcp delayed ACK schedule

Previously, when a data segment was sent an ACK was piggybacked
on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
event to notify congestion control modules. So the DCTCP
ca->delayed_ack_reserved flag could incorrectly stay set when
in fact there were no delayed ACKs being reserved. This could result
in sending a special ECN notification ACK that carries an older
ACK sequence, when in fact there was no need for such an ACK.
DCTCP keeps track of the delayed ACK status with its own separate
state ca->delayed_ack_reserved. Previously it may accidentally cancel
the delayed ACK without updating this field upon sending a special
ACK that carries a older ACK sequence. This inconsistency would
lead to DCTCP receiver never acknowledging the latest data until the
sender times out and retry in some cases.

Packetdrill script (provided by Larry Brakmo)

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 2:3(1) ack 2001

0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
0.200 > [ect01] . 3:3(0) ack 4001

0.210 < [ce] P. 4001:4501(500) ack 3 win 257

+0.001 read(4, ..., 4500) = 4500
+0 write(4, ..., 1) = 1
+0 > [ect01] PE. 3:4(1) ack 4501

+0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
// Previously the ACK sequence below would be 4501, causing a long RTO
+0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack

+0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
+0 > [ect01] . 4:4(0) ack 6501     // now acks everything

+0.500 < F. 9501:9501(0) ack 4 win 257

Reported-by: Larry Brakmo <brakmo@fb.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_dctcp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..89f88b0d8167 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	/* State has changed from CE=0 to CE=1 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (!ca->ce_state && ca->delayed_ack_reserved) {
+	if (!ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	/* State has changed from CE=1 to CE=0 and delayed
 	 * ACK has not sent yet.
 	 */
-	if (ca->ce_state && ca->delayed_ack_reserved) {
+	if (ca->ce_state &&
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 		u32 tmp_rcv_nxt;
 
 		/* Save current rcv_nxt. */
-- 
cgit v1.2.3


From a69258f7aa2623e0930212f09c586fd06674ad79 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 12 Jul 2018 06:04:53 -0700
Subject: tcp: remove DELAYED ACK events in DCTCP

After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK
related callbacks are no longer needed

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 --
 net/ipv4/tcp_dctcp.c  | 25 -------------------------
 net/ipv4/tcp_output.c |  4 ----
 3 files changed, 31 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af3ec72d5d41..3482d13d655b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -912,8 +912,6 @@ enum tcp_ca_event {
 	CA_EVENT_LOSS,		/* loss timeout */
 	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
 	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
-	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
-	CA_EVENT_NON_DELAYED_ACK,
 };
 
 /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 89f88b0d8167..5869f89ca656 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
 	u32 dctcp_alpha;
 	u32 next_seq;
 	u32 ce_state;
-	u32 delayed_ack_reserved;
 	u32 loss_cwnd;
 };
 
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
 
 		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
 
-		ca->delayed_ack_reserved = 0;
 		ca->loss_cwnd = 0;
 		ca->ce_state = 0;
 
@@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
 	}
 }
 
-static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
-{
-	struct dctcp *ca = inet_csk_ca(sk);
-
-	switch (ev) {
-	case CA_EVENT_DELAYED_ACK:
-		if (!ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 1;
-		break;
-	case CA_EVENT_NON_DELAYED_ACK:
-		if (ca->delayed_ack_reserved)
-			ca->delayed_ack_reserved = 0;
-		break;
-	default:
-		/* Don't care for the rest. */
-		break;
-	}
-}
-
 static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 {
 	switch (ev) {
@@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	case CA_EVENT_ECN_NO_CE:
 		dctcp_ce_state_1_to_0(sk);
 		break;
-	case CA_EVENT_DELAYED_ACK:
-	case CA_EVENT_NON_DELAYED_ACK:
-		dctcp_update_ack_reserved(sk, ev);
-		break;
 	default:
 		/* Don't care for the rest. */
 		break;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8e08b409c71e..00e5a300ddb9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk)
 	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
-	tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
 	if (ato > TCP_DELACK_MIN) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ / 2;
@@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk)
 	if (sk->sk_state == TCP_CLOSE)
 		return;
 
-	tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
 	/* We are not putting this on the write queue, so
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
-- 
cgit v1.2.3


From c290fba8c4ce6530cd941ea14db5a4ac2f77183f Mon Sep 17 00:00:00 2001
From: piaojun <piaojun@huawei.com>
Date: Fri, 13 Jul 2018 16:59:06 -0700
Subject: net/9p/client.c: put refcount of trans_mod in error case in
 parse_opts()

In my testing, the second mount will fail after umounting successfully.
The reason is that we put refcount of trans_mod in the correct case
rather than the error case in parse_opts() at last.  That will cause the
refcount decrease to -1, and when we try to get trans_mod again in
try_module_get(), we could only increase refcount to 0 which will cause
failure as follows:

parse_opts
  v9fs_get_trans_by_name
    try_module_get : return NULL to caller which cause error

So we should put refcount of trans_mod in error case.

Link: http://lkml.kernel.org/r/5B3F39A0.2030509@huawei.com
Fixes: 9421c3e64137ec ("net/9p/client.c: fix potential refcnt problem of trans module")
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Dominique Martinet <dominique.martinet@cea.fr>
Tested-by: Dominique Martinet <dominique.martinet@cea.fr>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/9p/client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/9p/client.c b/net/9p/client.c
index 18c5271910dc..5c1343195292 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt)
 	}
 
 free_and_return:
-	v9fs_put_trans(clnt->trans_mod);
+	if (ret)
+		v9fs_put_trans(clnt->trans_mod);
 	kfree(tmp_options);
 	return ret;
 }
-- 
cgit v1.2.3


From 6e2059b53f9885f202b086d7b4ca10a98926e974 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 10 Jul 2018 22:41:26 +0800
Subject: ipv4/igmp: init group mode as INCLUDE when join source group

Based on RFC3376 5.1
   If no interface
   state existed for that multicast address before the change (i.e., the
   change consisted of creating a new per-interface record), or if no
   state exists after the change (i.e., the change consisted of deleting
   a per-interface record), then the "non-existent" state is considered
   to have a filter mode of INCLUDE and an empty source list.

Which means a new multicast group should start with state IN().

Function ip_mc_join_group() works correctly for IGMP ASM(Any-Source Multicast)
mode. It adds a group with state EX() and inits crcount to mc_qrv,
so the kernel will send a TO_EX() report message after adding group.

But for IGMPv3 SSM(Source-specific multicast) JOIN_SOURCE_GROUP mode, we
split the group joining into two steps. First we join the group like ASM,
i.e. via ip_mc_join_group(). So the state changes from IN() to EX().

Then we add the source-specific address with INCLUDE mode. So the state
changes from EX() to IN(A).

Before the first step sends a group change record, we finished the second
step. So we will only send the second change record. i.e. TO_IN(A).

Regarding the RFC stands, we should actually send an ALLOW(A) message for
SSM JOIN_SOURCE_GROUP as the state should mimic the 'IN() to IN(A)'
transition.

The issue was exposed by commit a052517a8ff65 ("net/multicast: should not
send source list records when have filter mode change"). Before this change,
we used to send both ALLOW(A) and TO_IN(A). After this change we only send
TO_IN(A).

Fix it by adding a new parameter to init group mode. Also add new wrapper
functions so we don't need to change too much code.

v1 -> v2:
In my first version I only cleared the group change record. But this is not
enough. Because when a new group join, it will init as EXCLUDE and trigger
an filter mode change in ip/ip6_mc_add_src(), which will clear all source
addresses' sf_crcount. This will prevent early joined address sending state
change records if multi source addressed joined at the same time.

In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM
JOIN_SOURCE_GROUP. I also split the original patch into two separated patches
for IPv4 and IPv6.

Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h   |  2 ++
 net/ipv4/igmp.c        | 58 ++++++++++++++++++++++++++++++++++++--------------
 net/ipv4/ip_sockglue.c |  4 ++--
 3 files changed, 46 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index f8231854b5d6..119f53941c12 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -109,6 +109,8 @@ struct ip_mc_list {
 extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto);
 extern int igmp_rcv(struct sk_buff *);
 extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
+extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+				unsigned int mode);
 extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
 extern void ip_mc_drop_socket(struct sock *sk);
 extern int ip_mc_source(int add, int omode, struct sock *sk,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 85b617b655bc..b3c899a630a0 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,13 +1200,14 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_lock_bh(&im->lock);
 	if (pmc) {
 		im->interface = pmc->interface;
-		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		im->sfmode = pmc->sfmode;
 		if (pmc->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
 			im->sources = pmc->sources;
 			for (psf = im->sources; psf; psf = psf->sf_next)
-				psf->sf_crcount = im->crcount;
+				psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+		} else {
+			im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
 		}
 		in_dev_put(pmc->interface);
 		kfree(pmc);
@@ -1288,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #endif
 }
 
-static void igmp_group_added(struct ip_mc_list *im)
+static void igmp_group_added(struct ip_mc_list *im, unsigned int mode)
 {
 	struct in_device *in_dev = im->interface;
 #ifdef CONFIG_IP_MULTICAST
@@ -1316,7 +1317,13 @@ static void igmp_group_added(struct ip_mc_list *im)
 	}
 	/* else, v3 */
 
-	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+	/* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+	 * not send filter-mode change record as the mode should be from
+	 * IN() to IN(A).
+	 */
+	if (mode == MCAST_EXCLUDE)
+		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+
 	igmp_ifc_event(in_dev);
 #endif
 }
@@ -1381,8 +1388,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
 /*
  *	A socket has joined a multicast group on device dev.
  */
-
-void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode)
 {
 	struct ip_mc_list *im;
 #ifdef CONFIG_IP_MULTICAST
@@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == addr) {
 			im->users++;
-			ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+			ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
 			goto out;
 		}
 	}
@@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	in_dev_hold(in_dev);
 	im->multiaddr = addr;
 	/* initial mode is (EX, empty) */
-	im->sfmode = MCAST_EXCLUDE;
-	im->sfcount[MCAST_EXCLUDE] = 1;
+	im->sfmode = mode;
+	im->sfcount[mode] = 1;
 	refcount_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
@@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im);
 #endif
-	igmp_group_added(im);
+	igmp_group_added(im, mode);
 	if (!in_dev->dead)
 		ip_rt_multicast_event(in_dev);
 out:
 	return;
 }
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+	__ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ip_mc_inc_group);
 
 static int ip_mc_check_iphdr(struct sk_buff *skb)
@@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 		igmpv3_del_delrec(in_dev, pmc);
 #endif
-		igmp_group_added(pmc);
+		igmp_group_added(pmc, pmc->sfmode);
 	}
 }
 
@@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev)
 #ifdef CONFIG_IP_MULTICAST
 		igmpv3_del_delrec(in_dev, pmc);
 #endif
-		igmp_group_added(pmc);
+		igmp_group_added(pmc, pmc->sfmode);
 	}
 }
 
@@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
 
 /* Join a multicast group
  */
-
-int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+			      unsigned int mode)
 {
 	__be32 addr = imr->imr_multiaddr.s_addr;
 	struct ip_mc_socklist *iml, *i;
@@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next_rcu = inet->mc_list;
 	iml->sflist = NULL;
-	iml->sfmode = MCAST_EXCLUDE;
+	iml->sfmode = mode;
 	rcu_assign_pointer(inet->mc_list, iml);
-	ip_mc_inc_group(in_dev, addr);
+	__ip_mc_inc_group(in_dev, addr, mode);
 	err = 0;
 done:
 	return err;
 }
+
+/* Join ASM (Any-Source Multicast) group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ip_mc_join_group);
 
+/* Join SSM (Source-Specific Multicast) group
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+			 unsigned int mode)
+{
+	return __ip_mc_join_group(sk, imr, mode);
+}
+
 static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
 			   struct in_device *in_dev)
 {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc32fdbeefa6..64c76dcf7386 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -984,7 +984,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
 			mreq.imr_address.s_addr = mreqs.imr_interface;
 			mreq.imr_ifindex = 0;
-			err = ip_mc_join_group(sk, &mreq);
+			err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
 			if (err && err != -EADDRINUSE)
 				break;
 			omode = MCAST_INCLUDE;
@@ -1061,7 +1061,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			mreq.imr_multiaddr = psin->sin_addr;
 			mreq.imr_address.s_addr = 0;
 			mreq.imr_ifindex = greqs.gsr_interface;
-			err = ip_mc_join_group(sk, &mreq);
+			err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
 			if (err && err != -EADDRINUSE)
 				break;
 			greqs.gsr_interface = mreq.imr_ifindex;
-- 
cgit v1.2.3


From c7ea20c9da5b94e400c8dcc0adb99411f2e430a6 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 10 Jul 2018 22:41:27 +0800
Subject: ipv6/mcast: init as INCLUDE when join SSM INCLUDE group

This an IPv6 version patch of "ipv4/igmp: init group mode as INCLUDE when
join source group". From RFC3810, part 6.1:

   If no per-interface state existed for that
   multicast address before the change (i.e., the change consisted of
   creating a new per-interface record), or if no state exists after the
   change (i.e., the change consisted of deleting a per-interface
   record), then the "non-existent" state is considered to have an
   INCLUDE filter mode and an empty source list.

Which means a new multicast group should start with state IN(). Currently,
for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(),
then ip6_mc_source(), which will trigger a TO_IN() message instead of
ALLOW().

The issue was exposed by commit a052517a8ff65 ("net/multicast: should not
send source list records when have filter mode change"). Before this change,
we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A).

Fix it by adding a new parameter to init group mode. Also add some wrapper
functions to avoid changing too much code.

v1 -> v2:
In the first version I only cleared the group change record. But this is not
enough. Because when a new group join, it will init as EXCLUDE and trigger
a filter mode change in ip/ip6_mc_add_src(), which will clear all source
addresses sf_crcount. This will prevent early joined address sending state
change records if multi source addressed joined at the same time.

In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM
JOIN_SOURCE_GROUP. I also split the original patch into two separated patches
for IPv4 and IPv6.

There is also a difference between v4 and v6 version. For IPv6, when the
interface goes down and up, we will send correct state change record with
unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is
completed, we resend the change record TO_IN() in mld_send_initial_cr().
Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr().

Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       |  2 ++
 net/ipv6/ipv6_sockglue.c |  5 ++--
 net/ipv6/mcast.c         | 64 ++++++++++++++++++++++++++++++++++--------------
 3 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d02881e4ad1f..7528632bcf2a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1100,6 +1100,8 @@ void ipv6_sysctl_unregister(void);
 
 int ipv6_sock_mc_join(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+			  const struct in6_addr *addr, unsigned int mode);
 int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 #endif /* _NET_IPV6_H */
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c95c3486d904..568ca4187cd1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -729,8 +729,9 @@ done:
 			struct sockaddr_in6 *psin6;
 
 			psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
-			retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
-						 &psin6->sin6_addr);
+			retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+						     &psin6->sin6_addr,
+						     MCAST_INCLUDE);
 			/* prior join w/ different source is ok */
 			if (retv && retv != -EADDRINUSE)
 				break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c0c74088f2af..2699be7202be 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
 			  int delta);
 static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+			     const struct in6_addr *addr, unsigned int mode);
 
 #define MLD_QRV_DEFAULT		2
 /* RFC3810, 9.2. Query Interval */
@@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
 	return iv > 0 ? iv : 1;
 }
 
-int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
+			       const struct in6_addr *addr, unsigned int mode)
 {
 	struct net_device *dev = NULL;
 	struct ipv6_mc_socklist *mc_lst;
@@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	}
 
 	mc_lst->ifindex = dev->ifindex;
-	mc_lst->sfmode = MCAST_EXCLUDE;
+	mc_lst->sfmode = mode;
 	rwlock_init(&mc_lst->sflock);
 	mc_lst->sflist = NULL;
 
@@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	 *	now add/increase the group membership on the device
 	 */
 
-	err = ipv6_dev_mc_inc(dev, addr);
+	err = __ipv6_dev_mc_inc(dev, addr, mode);
 
 	if (err) {
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return 0;
 }
+
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+	return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
+}
 EXPORT_SYMBOL(ipv6_sock_mc_join);
 
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+			  const struct in6_addr *addr, unsigned int mode)
+{
+	return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
+}
+
 /*
  *	socket leave on multicast group
  */
@@ -646,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
 	return rv;
 }
 
-static void igmp6_group_added(struct ifmcaddr6 *mc)
+static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode)
 {
 	struct net_device *dev = mc->idev->dev;
 	char buf[MAX_ADDR_LEN];
@@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
 	}
 	/* else v2 */
 
-	mc->mca_crcount = mc->idev->mc_qrv;
+	/* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
+	 * should not send filter-mode change record as the mode
+	 * should be from IN() to IN(A).
+	 */
+	if (mode == MCAST_EXCLUDE)
+		mc->mca_crcount = mc->idev->mc_qrv;
+
 	mld_ifc_event(mc->idev);
 }
 
@@ -770,13 +790,14 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 	spin_lock_bh(&im->mca_lock);
 	if (pmc) {
 		im->idev = pmc->idev;
-		im->mca_crcount = idev->mc_qrv;
 		im->mca_sfmode = pmc->mca_sfmode;
 		if (pmc->mca_sfmode == MCAST_INCLUDE) {
 			im->mca_tomb = pmc->mca_tomb;
 			im->mca_sources = pmc->mca_sources;
 			for (psf = im->mca_sources; psf; psf = psf->sf_next)
-				psf->sf_crcount = im->mca_crcount;
+				psf->sf_crcount = idev->mc_qrv;
+		} else {
+			im->mca_crcount = idev->mc_qrv;
 		}
 		in6_dev_put(pmc->idev);
 		kfree(pmc);
@@ -831,7 +852,8 @@ static void ma_put(struct ifmcaddr6 *mc)
 }
 
 static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
-				   const struct in6_addr *addr)
+				   const struct in6_addr *addr,
+				   unsigned int mode)
 {
 	struct ifmcaddr6 *mc;
 
@@ -849,9 +871,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 	refcount_set(&mc->mca_refcnt, 1);
 	spin_lock_init(&mc->mca_lock);
 
-	/* initial mode is (EX, empty) */
-	mc->mca_sfmode = MCAST_EXCLUDE;
-	mc->mca_sfcount[MCAST_EXCLUDE] = 1;
+	mc->mca_sfmode = mode;
+	mc->mca_sfcount[mode] = 1;
 
 	if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
 	    IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
@@ -863,7 +884,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 /*
  *	device multicast group inc (add if not found)
  */
-int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+			     const struct in6_addr *addr, unsigned int mode)
 {
 	struct ifmcaddr6 *mc;
 	struct inet6_dev *idev;
@@ -887,14 +909,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 		if (ipv6_addr_equal(&mc->mca_addr, addr)) {
 			mc->mca_users++;
 			write_unlock_bh(&idev->lock);
-			ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
-				NULL, 0);
+			ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
 			in6_dev_put(idev);
 			return 0;
 		}
 	}
 
-	mc = mca_alloc(idev, addr);
+	mc = mca_alloc(idev, addr, mode);
 	if (!mc) {
 		write_unlock_bh(&idev->lock);
 		in6_dev_put(idev);
@@ -911,11 +932,16 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 
 	mld_del_delrec(idev, mc);
-	igmp6_group_added(mc);
+	igmp6_group_added(mc, mode);
 	ma_put(mc);
 	return 0;
 }
 
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+	return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
+}
+
 /*
  *	device multicast group del
  */
@@ -1751,7 +1777,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 
 		psf_next = psf->sf_next;
 
-		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
 			psf_prev = psf;
 			continue;
 		}
@@ -2066,7 +2092,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev)
 		if (pmc->mca_sfcount[MCAST_EXCLUDE])
 			type = MLD2_CHANGE_TO_EXCLUDE;
 		else
-			type = MLD2_CHANGE_TO_INCLUDE;
+			type = MLD2_ALLOW_NEW_SOURCES;
 		skb = add_grec(skb, pmc, type, 0, 0, 1);
 		spin_unlock_bh(&pmc->mca_lock);
 	}
@@ -2546,7 +2572,7 @@ void ipv6_mc_up(struct inet6_dev *idev)
 	ipv6_mc_reset(idev);
 	for (i = idev->mc_list; i; i = i->next) {
 		mld_del_delrec(idev, i);
-		igmp6_group_added(i);
+		igmp6_group_added(i, i->mca_sfmode);
 	}
 	read_unlock_bh(&idev->lock);
 }
-- 
cgit v1.2.3


From c604cb767049b78b3075497b80ebb8fd530ea2cc Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 11 Jul 2018 10:46:29 -0700
Subject: KEYS: DNS: fix parsing multiple options

My recent fix for dns_resolver_preparse() printing very long strings was
incomplete, as shown by syzbot which still managed to hit the
WARN_ONCE() in set_precision() by adding a crafted "dns_resolver" key:

    precision 50001 too large
    WARNING: CPU: 7 PID: 864 at lib/vsprintf.c:2164 vsnprintf+0x48a/0x5a0

The bug this time isn't just a printing bug, but also a logical error
when multiple options ("#"-separated strings) are given in the key
payload.  Specifically, when separating an option string into name and
value, if there is no value then the name is incorrectly considered to
end at the end of the key payload, rather than the end of the current
option.  This bypasses validation of the option length, and also means
that specifying multiple options is broken -- which presumably has gone
unnoticed as there is currently only one valid option anyway.

A similar problem also applied to option values, as the kstrtoul() when
parsing the "dnserror" option will read past the end of the current
option and into the next option.

Fix these bugs by correctly computing the length of the option name and
by copying the option value, null-terminated, into a temporary buffer.

Reproducer for the WARN_ONCE() that syzbot hit:

    perl -e 'print "#A#", "\0" x 50000' | keyctl padd dns_resolver desc @s

Reproducer for "dnserror" option being parsed incorrectly (expected
behavior is to fail when seeing the unknown option "foo", actual
behavior was to read the dnserror value as "1#foo" and fail there):

    perl -e 'print "#dnserror=1#foo\0"' | keyctl padd dns_resolver desc @s

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dns_resolver/dns_key.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 40c851693f77..0c9478b91fa5 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -86,35 +86,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 		opt++;
 		kdebug("options: '%s'", opt);
 		do {
+			int opt_len, opt_nlen;
 			const char *eq;
-			int opt_len, opt_nlen, opt_vlen, tmp;
+			char optval[128];
 
 			next_opt = memchr(opt, '#', end - opt) ?: end;
 			opt_len = next_opt - opt;
-			if (opt_len <= 0 || opt_len > 128) {
+			if (opt_len <= 0 || opt_len > sizeof(optval)) {
 				pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
 						    opt_len);
 				return -EINVAL;
 			}
 
-			eq = memchr(opt, '=', opt_len) ?: end;
-			opt_nlen = eq - opt;
-			eq++;
-			opt_vlen = next_opt - eq; /* will be -1 if no value */
+			eq = memchr(opt, '=', opt_len);
+			if (eq) {
+				opt_nlen = eq - opt;
+				eq++;
+				memcpy(optval, eq, next_opt - eq);
+				optval[next_opt - eq] = '\0';
+			} else {
+				opt_nlen = opt_len;
+				optval[0] = '\0';
+			}
 
-			tmp = opt_vlen >= 0 ? opt_vlen : 0;
-			kdebug("option '%*.*s' val '%*.*s'",
-			       opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+			kdebug("option '%*.*s' val '%s'",
+			       opt_nlen, opt_nlen, opt, optval);
 
 			/* see if it's an error number representing a DNS error
 			 * that's to be recorded as the result in this key */
 			if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
 			    memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
 				kdebug("dns error number option");
-				if (opt_vlen <= 0)
-					goto bad_option_value;
 
-				ret = kstrtoul(eq, 10, &derrno);
+				ret = kstrtoul(optval, 10, &derrno);
 				if (ret < 0)
 					goto bad_option_value;
 
-- 
cgit v1.2.3


From 32da12216e467dea70a09cd7094c30779ce0f9db Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 12 Jul 2018 08:03:43 -0700
Subject: tls: Stricter error checking in zerocopy sendmsg path

In the zerocopy sendmsg() path, there are error checks to revert
the zerocopy if we get any error code.  syzkaller has discovered
that tls_push_record can return -ECONNRESET, which is fatal, and
happens after the point at which it is safe to revert the iter,
as we've already passed the memory to do_tcp_sendpages.

Previously this code could return -ENOMEM and we would want to
revert the iter, but AFAIK this no longer returns ENOMEM after
a447da7d004 ("tls: fix waitall behavior in tls_sw_recvmsg"),
so we fail for all error codes.

Reported-by: syzbot+c226690f7b3126c5ee04@syzkaller.appspotmail.com
Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com
Signed-off-by: Dave Watson <davejwatson@fb.com>
Fixes: 3c4d7559159b ("tls: kernel TLS support")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7818011fd250..4618f1c31137 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -440,7 +440,7 @@ alloc_encrypted:
 			ret = tls_push_record(sk, msg->msg_flags, record_type);
 			if (!ret)
 				continue;
-			if (ret == -EAGAIN)
+			if (ret < 0)
 				goto send_end;
 
 			copied -= try_to_copy;
-- 
cgit v1.2.3


From b7ed879425be371905d856410d19e9a42a62bcf3 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Fri, 13 Jul 2018 14:40:50 +0900
Subject: net: ip6_gre: get ipv6hdr after skb_cow_head()

A KASAN:use-after-free bug was found related to ip6-erspan
while running selftests/net/ip6_gre_headroom.sh

It happens because of following sequence:
- ipv6hdr pointer is obtained from skb
- skb_cow_head() is called, skb->head memory is reallocated
- old data is accessed using ipv6hdr pointer

skb_cow_head() call was added in e41c7c68ea77 ("ip6erspan: make sure
enough headroom at xmit."), but looking at the history there was a
chance of similar bug because gre_handle_offloads() and pskb_trim()
can also reallocate skb->head memory. Fixes tag points to commit
which introduced possibility of this bug.

This patch moves ipv6hdr pointer assignment after skb_cow_head() call.

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Reviewed-by: Greg Rose <gvrose8192@gmail.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c8cf2fdbb13b..cd2cfb04e5d8 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -927,7 +927,6 @@ tx_err:
 static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
-	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device_stats *stats;
@@ -1010,6 +1009,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			goto tx_err;
 		}
 	} else {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
 		switch (skb->protocol) {
 		case htons(ETH_P_IP):
 			memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-- 
cgit v1.2.3


From e66515999b627368892ccc9b3a13a506f2ea1357 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 13 Jul 2018 17:21:42 +0200
Subject: ipv6: make DAD fail with enhanced DAD when nonce length differs

Commit adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)")
added enhanced DAD with a nonce length of 6 bytes. However, RFC7527
doesn't specify the length of the nonce, other than being 6 + 8*k bytes,
with integer k >= 0 (RFC3971 5.3.2). The current implementation simply
assumes that the nonce will always be 6 bytes, but others systems are
free to choose different sizes.

If another system sends a nonce of different length but with the same 6
bytes prefix, it shouldn't be considered as the same nonce. Thus, check
that the length of the received nonce is the same as the length we sent.

Ugly scapy test script running on veth0:

def loop():
    pkt=sniff(iface="veth0", filter="icmp6", count=1)
    pkt = pkt[0]
    b = bytearray(pkt[Raw].load)
    b[1] += 1
    b += b'\xde\xad\xbe\xef\xde\xad\xbe\xef'
    pkt[Raw].load = bytes(b)
    pkt[IPv6].plen += 8
    # fixup checksum after modifying the payload
    pkt[IPv6].payload.cksum -= 0x3b44
    if pkt[IPv6].payload.cksum < 0:
        pkt[IPv6].payload.cksum += 0xffff
    sendp(pkt, iface="veth0")

This should result in DAD failure for any address added to veth0's peer,
but is currently ignored.

Fixes: adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e640d2f3c55c..0ec273997d1d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -811,7 +811,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 			return;
 		}
 	}
-	if (ndopts.nd_opts_nonce)
+	if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
 		memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
 
 	inc = ipv6_addr_is_multicast(daddr);
-- 
cgit v1.2.3


From 31048d7aedf31bf0f69c54a662944632f29d82f2 Mon Sep 17 00:00:00 2001
From: Stefan Baranoff <sbaranoff@gmail.com>
Date: Sun, 15 Jul 2018 11:36:37 -0400
Subject: tcp: Fix broken repair socket window probe patch

Correct previous bad attempt at allowing sockets to come out of TCP
repair without sending window probes. To avoid changing size of
the repair variable in struct tcp_sock, this lets the decision for
sending probes or not to be made when coming out of repair by
introducing two ways to turn it off.

v2:
* Remove erroneous comment; defines now make behavior clear

Fixes: 70b7ff130224 ("tcp: allow user to create repair socket without window probes")
Signed-off-by: Stefan Baranoff <sbaranoff@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |  4 ++++
 net/ipv4/tcp.c           | 13 +++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..e3f6ed8a7064 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -127,6 +127,10 @@ enum {
 
 #define TCP_CM_INQ		TCP_INQ
 
+#define TCP_REPAIR_ON		1
+#define TCP_REPAIR_OFF		0
+#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
+
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e5e2ca9ab1b..ec2186e3087f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2823,16 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_REPAIR:
 		if (!tcp_can_repair_sock(sk))
 			err = -EPERM;
-		/* 1 for normal repair, 2 for no window probes */
-		else if (val == 1 || val == 2) {
-			tp->repair = val;
+		else if (val == TCP_REPAIR_ON) {
+			tp->repair = 1;
 			sk->sk_reuse = SK_FORCE_REUSE;
 			tp->repair_queue = TCP_NO_QUEUE;
-		} else if (val == 0) {
+		} else if (val == TCP_REPAIR_OFF) {
+			tp->repair = 0;
+			sk->sk_reuse = SK_NO_REUSE;
+			tcp_send_window_probe(sk);
+		} else if (val == TCP_REPAIR_OFF_NO_WP) {
 			tp->repair = 0;
 			sk->sk_reuse = SK_NO_REUSE;
-			if (tp->repair == 1)
-				tcp_send_window_probe(sk);
 		} else
 			err = -EINVAL;
 
-- 
cgit v1.2.3


From b5d2d75e079a918be686957b1a8d2f6c5cc95a0a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 15 Jul 2018 09:35:19 -0700
Subject: net/ipv6: Do not allow device only routes via the multipath API

Eric reported that reverting the patch that fixed and simplified IPv6
multipath routes means reverting back to invalid userspace notifications.
eg.,
$ ip -6 route add 2001:db8:1::/64 nexthop dev eth0 nexthop dev eth1

only generates a single notification:
2001:db8:1::/64 dev eth0 metric 1024 pref medium

While working on a fix for this problem I found another case that is just
broken completely - a multipath route with a gateway followed by device
followed by gateway:
    $ ip -6 ro add 2001:db8:103::/64
          nexthop via 2001:db8:1::64
          nexthop dev dummy2
          nexthop via 2001:db8:3::64

In this case the device only route is dropped completely - no notification
to userpsace but no addition to the FIB either:

$ ip -6 ro ls
2001:db8:1::/64 dev dummy1 proto kernel metric 256 pref medium
2001:db8:2::/64 dev dummy2 proto kernel metric 256 pref medium
2001:db8:3::/64 dev dummy3 proto kernel metric 256 pref medium
2001:db8:103::/64 metric 1024
	nexthop via 2001:db8:1::64 dev dummy1 weight 1
	nexthop via 2001:db8:3::64 dev dummy3 weight 1 pref medium
fe80::/64 dev dummy1 proto kernel metric 256 pref medium
fe80::/64 dev dummy2 proto kernel metric 256 pref medium
fe80::/64 dev dummy3 proto kernel metric 256 pref medium

Really, IPv6 multipath is just FUBAR'ed beyond repair when it comes to
device only routes, so do not allow it all.

This change will break any scripts relying on the mpath api for insert,
but I don't see any other way to handle the permutations. Besides, since
the routes are added to the FIB as standalone (non-multipath) routes the
kernel is not doing what the user requested, so it might as well tell the
user that.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 63f99411f0de..2ce0bd17de4f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4388,6 +4388,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			rt = NULL;
 			goto cleanup;
 		}
+		if (!rt6_qualify_for_ecmp(rt)) {
+			err = -EINVAL;
+			NL_SET_ERR_MSG(extack,
+				       "Device only routes can not be added for IPv6 using the multipath API.");
+			fib6_info_release(rt);
+			goto cleanup;
+		}
 
 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
 
-- 
cgit v1.2.3


From 1992d99882afda6dc17f9d49c06150856a91282f Mon Sep 17 00:00:00 2001
From: Ursula Braun <ursula.braun@linux.ibm.com>
Date: Mon, 16 Jul 2018 13:56:52 +0200
Subject: net/smc: take sock lock in smc_ioctl()

SMC ioctl processing requires the sock lock to work properly in
all thinkable scenarios.
Problem has been found with RaceFuzzer and fixes:
   KASAN: null-ptr-deref Read in smc_ioctl

Reported-by: Byoungyoung Lee <lifeasageek@gmail.com>
Reported-by: syzbot+35b2c5aa76fd398b9fd4@syzkaller.appspotmail.com
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5334157f5065..c12a7fc18f56 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1524,10 +1524,13 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 			return -EBADF;
 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
 	}
+	lock_sock(&smc->sk);
 	switch (cmd) {
 	case SIOCINQ: /* same as FIONREAD */
-		if (smc->sk.sk_state == SMC_LISTEN)
+		if (smc->sk.sk_state == SMC_LISTEN) {
+			release_sock(&smc->sk);
 			return -EINVAL;
+		}
 		if (smc->sk.sk_state == SMC_INIT ||
 		    smc->sk.sk_state == SMC_CLOSED)
 			answ = 0;
@@ -1536,8 +1539,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		break;
 	case SIOCOUTQ:
 		/* output queue size (not send + not acked) */
-		if (smc->sk.sk_state == SMC_LISTEN)
+		if (smc->sk.sk_state == SMC_LISTEN) {
+			release_sock(&smc->sk);
 			return -EINVAL;
+		}
 		if (smc->sk.sk_state == SMC_INIT ||
 		    smc->sk.sk_state == SMC_CLOSED)
 			answ = 0;
@@ -1547,8 +1552,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		break;
 	case SIOCOUTQNSD:
 		/* output queue size (not send only) */
-		if (smc->sk.sk_state == SMC_LISTEN)
+		if (smc->sk.sk_state == SMC_LISTEN) {
+			release_sock(&smc->sk);
 			return -EINVAL;
+		}
 		if (smc->sk.sk_state == SMC_INIT ||
 		    smc->sk.sk_state == SMC_CLOSED)
 			answ = 0;
@@ -1556,8 +1563,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
 	case SIOCATMARK:
-		if (smc->sk.sk_state == SMC_LISTEN)
+		if (smc->sk.sk_state == SMC_LISTEN) {
+			release_sock(&smc->sk);
 			return -EINVAL;
+		}
 		if (smc->sk.sk_state == SMC_INIT ||
 		    smc->sk.sk_state == SMC_CLOSED) {
 			answ = 0;
@@ -1573,8 +1582,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		}
 		break;
 	default:
+		release_sock(&smc->sk);
 		return -ENOIOCTLCMD;
 	}
+	release_sock(&smc->sk);
 
 	return put_user(answ, (int __user *)arg);
 }
-- 
cgit v1.2.3


From 26b2f552525cf98fad08515bd6faa427f2f22038 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 13 Jul 2018 01:38:08 +0900
Subject: netfilter: nf_tables: fix jumpstack depth validation

The level of struct nft_ctx is updated by nf_tables_check_loops().  That
is used to validate jumpstack depth. But jumpstack validation routine
doesn't update and validate recursively.  So, in some cases, chain depth
can be bigger than the NFT_JUMP_STACK_SIZE.

After this patch, The jumpstack validation routine is located in the
nft_chain_validate(). When new rules or new set elements are added, the
nft_table_validate() is called by the nf_tables_newrule and the
nf_tables_newsetelem. The nft_table_validate() calls the
nft_chain_validate() that visit all their children chains recursively.
So it can update depth of chain certainly.

Reproducer:
   %cat ./test.sh
   #!/bin/bash
   nft add table ip filter
   nft add chain ip filter input { type filter hook input priority 0\; }
   for ((i=0;i<20;i++)); do
	nft add chain ip filter a$i
   done

   nft add rule ip filter input jump a1

   for ((i=0;i<10;i++)); do
	nft add rule ip filter a$i jump a$((i+1))
   done

   for ((i=11;i<19;i++)); do
	nft add rule ip filter a$i jump a$((i+1))
   done

   nft add rule ip filter a10 jump a11

Result:
[  253.931782] WARNING: CPU: 1 PID: 0 at net/netfilter/nf_tables_core.c:186 nft_do_chain+0xacc/0xdf0 [nf_tables]
[  253.931915] Modules linked in: nf_tables nfnetlink ip_tables x_tables
[  253.932153] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.18.0-rc3+ #48
[  253.932153] RIP: 0010:nft_do_chain+0xacc/0xdf0 [nf_tables]
[  253.932153] Code: 83 f8 fb 0f 84 c7 00 00 00 e9 d0 00 00 00 83 f8 fd 74 0e 83 f8 ff 0f 84 b4 00 00 00 e9 bd 00 00 00 83 bd 64 fd ff ff 0f 76 09 <0f> 0b 31 c0 e9 bc 02 00 00 44 8b ad 64 fd
[  253.933807] RSP: 0018:ffff88011b807570 EFLAGS: 00010212
[  253.933807] RAX: 00000000fffffffd RBX: ffff88011b807660 RCX: 0000000000000000
[  253.933807] RDX: 0000000000000010 RSI: ffff880112b39d78 RDI: ffff88011b807670
[  253.933807] RBP: ffff88011b807850 R08: ffffed0023700ece R09: ffffed0023700ecd
[  253.933807] R10: ffff88011b80766f R11: ffffed0023700ece R12: ffff88011b807898
[  253.933807] R13: ffff880112b39d80 R14: ffff880112b39d60 R15: dffffc0000000000
[  253.933807] FS:  0000000000000000(0000) GS:ffff88011b800000(0000) knlGS:0000000000000000
[  253.933807] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  253.933807] CR2: 00000000014f1008 CR3: 000000006b216000 CR4: 00000000001006e0
[  253.933807] Call Trace:
[  253.933807]  <IRQ>
[  253.933807]  ? sched_clock_cpu+0x132/0x170
[  253.933807]  ? __nft_trace_packet+0x180/0x180 [nf_tables]
[  253.933807]  ? sched_clock_cpu+0x132/0x170
[  253.933807]  ? debug_show_all_locks+0x290/0x290
[  253.933807]  ? __lock_acquire+0x4835/0x4af0
[  253.933807]  ? inet_ehash_locks_alloc+0x1a0/0x1a0
[  253.933807]  ? unwind_next_frame+0x159e/0x1840
[  253.933807]  ? __read_once_size_nocheck.constprop.4+0x5/0x10
[  253.933807]  ? nft_do_chain_ipv4+0x197/0x1e0 [nf_tables]
[  253.933807]  ? nft_do_chain+0x5/0xdf0 [nf_tables]
[  253.933807]  nft_do_chain_ipv4+0x197/0x1e0 [nf_tables]
[  253.933807]  ? nft_do_chain_arp+0xb0/0xb0 [nf_tables]
[  253.933807]  ? __lock_is_held+0x9d/0x130
[  253.933807]  nf_hook_slow+0xc4/0x150
[  253.933807]  ip_local_deliver+0x28b/0x380
[  253.933807]  ? ip_call_ra_chain+0x3e0/0x3e0
[  253.933807]  ? ip_rcv_finish+0x1610/0x1610
[  253.933807]  ip_rcv+0xbcc/0xcc0
[  253.933807]  ? debug_show_all_locks+0x290/0x290
[  253.933807]  ? ip_local_deliver+0x380/0x380
[  253.933807]  ? __lock_is_held+0x9d/0x130
[  253.933807]  ? ip_local_deliver+0x380/0x380
[  253.933807]  __netif_receive_skb_core+0x1c9c/0x2240

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nf_tables_api.c     | 11 ++++-------
 net/netfilter/nft_immediate.c     |  3 +++
 net/netfilter/nft_lookup.c        | 13 +++++++++++--
 4 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 08c005ce56e9..4e82a4c49912 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -150,6 +150,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  *	@portid: netlink portID of the original message
  *	@seq: netlink sequence number
  *	@family: protocol family
+ *	@level: depth of the chains
  *	@report: notify via unicast netlink message
  */
 struct nft_ctx {
@@ -160,6 +161,7 @@ struct nft_ctx {
 	u32				portid;
 	u32				seq;
 	u8				family;
+	u8				level;
 	bool				report;
 };
 
@@ -865,7 +867,6 @@ enum nft_chain_flags {
  *	@table: table that this chain belongs to
  *	@handle: chain handle
  *	@use: number of jump references to this chain
- *	@level: length of longest path to this chain
  *	@flags: bitmask of enum nft_chain_flags
  *	@name: name of the chain
  */
@@ -878,7 +879,6 @@ struct nft_chain {
 	struct nft_table		*table;
 	u64				handle;
 	u32				use;
-	u16				level;
 	u8				flags:6,
 					genmask:2;
 	char				*name;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 896d4a36081d..d41fa2c82f14 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -75,6 +75,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 {
 	ctx->net	= net;
 	ctx->family	= family;
+	ctx->level	= 0;
 	ctx->table	= table;
 	ctx->chain	= chain;
 	ctx->nla   	= nla;
@@ -2384,6 +2385,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
 	struct nft_rule *rule;
 	int err;
 
+	if (ctx->level == NFT_JUMP_STACK_SIZE)
+		return -EMLINK;
+
 	list_for_each_entry(rule, &chain->rules, list) {
 		if (!nft_is_active_next(ctx->net, rule))
 			continue;
@@ -6837,13 +6841,6 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
 			err = nf_tables_check_loops(ctx, data->verdict.chain);
 			if (err < 0)
 				return err;
-
-			if (ctx->chain->level + 1 >
-			    data->verdict.chain->level) {
-				if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
-					return -EMLINK;
-				data->verdict.chain->level = ctx->chain->level + 1;
-			}
 		}
 
 		return 0;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 15adf8ca82c3..0777a93211e2 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -98,6 +98,7 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
 				  const struct nft_data **d)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
 	const struct nft_data *data;
 	int err;
 
@@ -109,9 +110,11 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
 	switch (data->verdict.code) {
 	case NFT_JUMP:
 	case NFT_GOTO:
+		pctx->level++;
 		err = nft_chain_validate(ctx, data->verdict.chain);
 		if (err < 0)
 			return err;
+		pctx->level--;
 		break;
 	default:
 		break;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 42e6fadf1417..c2a1d84cdfc4 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -155,7 +155,9 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
 				       struct nft_set_elem *elem)
 {
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	struct nft_ctx *pctx = (struct nft_ctx *)ctx;
 	const struct nft_data *data;
+	int err;
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
 	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
@@ -165,10 +167,17 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
 	switch (data->verdict.code) {
 	case NFT_JUMP:
 	case NFT_GOTO:
-		return nft_chain_validate(ctx, data->verdict.chain);
+		pctx->level++;
+		err = nft_chain_validate(ctx, data->verdict.chain);
+		if (err < 0)
+			return err;
+		pctx->level--;
+		break;
 	default:
-		return 0;
+		break;
 	}
+
+	return 0;
 }
 
 static int nft_lookup_validate(const struct nft_ctx *ctx,
-- 
cgit v1.2.3


From 9970a8e40d4c39e23d62d32540366d1d7d2cce9b Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 10 Jul 2018 23:21:08 +0900
Subject: netfilter: nft_set_hash: add rcu_barrier() in the nft_rhash_destroy()

GC of set uses call_rcu() to destroy elements.
So that elements would be destroyed after destroying sets and chains.
But, elements should be destroyed before destroying sets and chains.
In order to wait calling call_rcu(), a rcu_barrier() is added.

In order to test correctly, below patch should be applied.
https://patchwork.ozlabs.org/patch/940883/

test scripts:
   %cat test.nft
   table ip aa {
	   map map1 {
		   type ipv4_addr : verdict; flags timeout;
		   elements = {
			   0 : jump a0,
			   1 : jump a0,
			   2 : jump a0,
			   3 : jump a0,
			   4 : jump a0,
			   5 : jump a0,
			   6 : jump a0,
			   7 : jump a0,
			   8 : jump a0,
			   9 : jump a0,
		   }
		   timeout 1s;
	   }
	   chain a0 {
	   }
   }
   flush ruleset

   [ ... ]

   table ip aa {
	   map map1 {
		   type ipv4_addr : verdict; flags timeout;
		   elements = {
			   0 : jump a0,
			   1 : jump a0,
			   2 : jump a0,
			   3 : jump a0,
			   4 : jump a0,
			   5 : jump a0,
			   6 : jump a0,
			   7 : jump a0,
			   8 : jump a0,
			   9 : jump a0,
		   }
		   timeout 1s;
	   }
	   chain a0 {
	   }
   }
   flush ruleset

Splat looks like:
[  200.795603] kernel BUG at net/netfilter/nf_tables_api.c:1363!
[  200.806944] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[  200.812253] CPU: 1 PID: 1582 Comm: nft Not tainted 4.17.0+ #24
[  200.820297] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015
[  200.830309] RIP: 0010:nf_tables_chain_destroy.isra.34+0x62/0x240 [nf_tables]
[  200.838317] Code: 43 50 85 c0 74 26 48 8b 45 00 48 8b 4d 08 ba 54 05 00 00 48 c7 c6 60 6d 29 c0 48 c7 c7 c0 65 29 c0
4c 8b 40 08 e8 58 e5 fd f8 <0f> 0b 48 89 da 48 b8 00 00 00 00 00 fc ff
[  200.860366] RSP: 0000:ffff880118dbf4d0 EFLAGS: 00010282
[  200.866354] RAX: 0000000000000061 RBX: ffff88010cdeaf08 RCX: 0000000000000000
[  200.874355] RDX: 0000000000000061 RSI: 0000000000000008 RDI: ffffed00231b7e90
[  200.882361] RBP: ffff880118dbf4e8 R08: ffffed002373bcfb R09: ffffed002373bcfa
[  200.890354] R10: 0000000000000000 R11: ffffed002373bcfb R12: dead000000000200
[  200.898356] R13: dead000000000100 R14: ffffffffbb62af38 R15: dffffc0000000000
[  200.906354] FS:  00007fefc31fd700(0000) GS:ffff88011b800000(0000) knlGS:0000000000000000
[  200.915533] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  200.922355] CR2: 0000557f1c8e9128 CR3: 0000000106880000 CR4: 00000000001006e0
[  200.930353] Call Trace:
[  200.932351]  ? nf_tables_commit+0x26f6/0x2c60 [nf_tables]
[  200.939525]  ? nf_tables_setelem_notify.constprop.49+0x1a0/0x1a0 [nf_tables]
[  200.947525]  ? nf_tables_delchain+0x6e0/0x6e0 [nf_tables]
[  200.952383]  ? nft_add_set_elem+0x1700/0x1700 [nf_tables]
[  200.959532]  ? nla_parse+0xab/0x230
[  200.963529]  ? nfnetlink_rcv_batch+0xd06/0x10d0 [nfnetlink]
[  200.968384]  ? nfnetlink_net_init+0x130/0x130 [nfnetlink]
[  200.975525]  ? debug_show_all_locks+0x290/0x290
[  200.980363]  ? debug_show_all_locks+0x290/0x290
[  200.986356]  ? sched_clock_cpu+0x132/0x170
[  200.990352]  ? find_held_lock+0x39/0x1b0
[  200.994355]  ? sched_clock_local+0x10d/0x130
[  200.999531]  ? memset+0x1f/0x40

Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_hash.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 72ef35b51cac..90c3e7e6cacb 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -387,6 +387,7 @@ static void nft_rhash_destroy(const struct nft_set *set)
 	struct nft_rhash *priv = nft_set_priv(set);
 
 	cancel_delayed_work_sync(&priv->gc_work);
+	rcu_barrier();
 	rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
 				    (void *)set);
 }
-- 
cgit v1.2.3


From c293ac959f809ee1cd31609d9e62bccf6804b2e6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 10 Jul 2018 23:22:01 +0900
Subject: netfilter: nft_set_rbtree: fix panic when destroying set by GC

This patch fixes below.
1. check null pointer of rb_next.
 rb_next can return null. so null check routine should be added.
2. add rcu_barrier in destroy routine.
 GC uses call_rcu to remove elements. but all elements should be
 removed before destroying set and chains. so that rcu_barrier is added.

test script:
   %cat test.nft
   table inet aa {
	   map map1 {
		   type ipv4_addr : verdict; flags interval, timeout;
		   elements = {
			   0-1 : jump a0,
			   3-4 : jump a0,
			   6-7 : jump a0,
			   9-10 : jump a0,
			   12-13 : jump a0,
			   15-16 : jump a0,
			   18-19 : jump a0,
			   21-22 : jump a0,
			   24-25 : jump a0,
			   27-28 : jump a0,
		   }
		   timeout 1s;
	   }
	   chain a0 {
	   }
   }
   flush ruleset
   table inet aa {
	   map map1 {
		   type ipv4_addr : verdict; flags interval, timeout;
		   elements = {
			   0-1 : jump a0,
			   3-4 : jump a0,
			   6-7 : jump a0,
			   9-10 : jump a0,
			   12-13 : jump a0,
			   15-16 : jump a0,
			   18-19 : jump a0,
			   21-22 : jump a0,
			   24-25 : jump a0,
			   27-28 : jump a0,
		   }
		   timeout 1s;
	   }
	   chain a0 {
	   }
   }
   flush ruleset

splat looks like:
[ 2402.419838] kasan: GPF could be caused by NULL-ptr deref or user memory access
[ 2402.428433] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[ 2402.429343] CPU: 1 PID: 1350 Comm: kworker/1:1 Not tainted 4.18.0-rc2+ #1
[ 2402.429343] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 03/23/2017
[ 2402.429343] Workqueue: events_power_efficient nft_rbtree_gc [nft_set_rbtree]
[ 2402.429343] RIP: 0010:rb_next+0x1e/0x130
[ 2402.429343] Code: e9 de f2 ff ff 0f 1f 80 00 00 00 00 41 55 48 89 fa 41 54 55 53 48 c1 ea 03 48 b8 00 00 00 0
[ 2402.429343] RSP: 0018:ffff880105f77678 EFLAGS: 00010296
[ 2402.429343] RAX: dffffc0000000000 RBX: ffff8801143e3428 RCX: 1ffff1002287c69c
[ 2402.429343] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000000
[ 2402.429343] RBP: 0000000000000000 R08: ffffed0016aabc24 R09: ffffed0016aabc24
[ 2402.429343] R10: 0000000000000001 R11: ffffed0016aabc23 R12: 0000000000000000
[ 2402.429343] R13: ffff8800b6933388 R14: dffffc0000000000 R15: ffff8801143e3440
[ 2402.534486] kasan: CONFIG_KASAN_INLINE enabled
[ 2402.534212] FS:  0000000000000000(0000) GS:ffff88011b600000(0000) knlGS:0000000000000000
[ 2402.534212] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2402.534212] CR2: 0000000000863008 CR3: 00000000a3c16000 CR4: 00000000001006e0
[ 2402.534212] Call Trace:
[ 2402.534212]  nft_rbtree_gc+0x2b5/0x5f0 [nft_set_rbtree]
[ 2402.534212]  process_one_work+0xc1b/0x1ee0
[ 2402.540329] kasan: GPF could be caused by NULL-ptr deref or user memory access
[ 2402.534212]  ? _raw_spin_unlock_irq+0x29/0x40
[ 2402.534212]  ? pwq_dec_nr_in_flight+0x3e0/0x3e0
[ 2402.534212]  ? set_load_weight+0x270/0x270
[ 2402.534212]  ? __schedule+0x6ea/0x1fb0
[ 2402.534212]  ? __sched_text_start+0x8/0x8
[ 2402.534212]  ? save_trace+0x320/0x320
[ 2402.534212]  ? sched_clock_local+0xe2/0x150
[ 2402.534212]  ? find_held_lock+0x39/0x1c0
[ 2402.534212]  ? worker_thread+0x35f/0x1150
[ 2402.534212]  ? lock_contended+0xe90/0xe90
[ 2402.534212]  ? __lock_acquire+0x4520/0x4520
[ 2402.534212]  ? do_raw_spin_unlock+0xb1/0x350
[ 2402.534212]  ? do_raw_spin_trylock+0x111/0x1b0
[ 2402.534212]  ? do_raw_spin_lock+0x1f0/0x1f0
[ 2402.534212]  worker_thread+0x169/0x1150

Fixes: 8d8540c4f5e0("netfilter: nft_set_rbtree: add timeout support")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 1f8f257cb518..9873d734b494 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -381,7 +381,7 @@ static void nft_rbtree_gc(struct work_struct *work)
 
 		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
 		if (!gcb)
-			goto out;
+			break;
 
 		atomic_dec(&set->nelems);
 		nft_set_gc_batch_add(gcb, rbe);
@@ -390,10 +390,12 @@ static void nft_rbtree_gc(struct work_struct *work)
 			rbe = rb_entry(prev, struct nft_rbtree_elem, node);
 			atomic_dec(&set->nelems);
 			nft_set_gc_batch_add(gcb, rbe);
+			prev = NULL;
 		}
 		node = rb_next(node);
+		if (!node)
+			break;
 	}
-out:
 	if (gcb) {
 		for (i = 0; i < gcb->head.cnt; i++) {
 			rbe = gcb->elems[i];
@@ -440,6 +442,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
 	struct rb_node *node;
 
 	cancel_delayed_work_sync(&priv->gc_work);
+	rcu_barrier();
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
-- 
cgit v1.2.3


From 83ed7d1fe2d2d4a11b30660dec20168bb473d9c1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 18 Jul 2018 10:48:56 +0200
Subject: ipv6: ila: select CONFIG_DST_CACHE

My randconfig builds came across an old missing dependency for ILA:

ERROR: "dst_cache_set_ip6" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_get" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_init" [net/ipv6/ila/ila.ko] undefined!
ERROR: "dst_cache_destroy" [net/ipv6/ila/ila.ko] undefined!

We almost never run into this by accident because randconfig builds
end up selecting DST_CACHE from some other tunnel protocol, and this
one appears to be the only one missing the explicit 'select'.

>From all I can tell, this problem first appeared in linux-4.9
when dst_cache support got added to ILA.

Fixes: 79ff2fc31e0f ("ila: Cache a route to translated address")
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0eff75525da1..b3885ca22d6f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -108,6 +108,7 @@ config IPV6_MIP6
 config IPV6_ILA
 	tristate "IPv6: Identifier Locator Addressing (ILA)"
 	depends on NETFILTER
+	select DST_CACHE
 	select LWTUNNEL
 	---help---
 	  Support for IPv6 Identifier Locator Addressing (ILA).
-- 
cgit v1.2.3


From 3bc53be9db21040b5d2de4d455f023c8c494aa68 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 18 Jul 2018 18:57:27 +0900
Subject: net/nfc: Avoid stalls when nfc_alloc_send_skb() returned NULL.

syzbot is reporting stalls at nfc_llcp_send_ui_frame() [1]. This is
because nfc_llcp_send_ui_frame() is retrying the loop without any delay
when nonblocking nfc_alloc_send_skb() returned NULL.

Since there is no need to use MSG_DONTWAIT if we retry until
sock_alloc_send_pskb() succeeds, let's use blocking call.
Also, in case an unexpected error occurred, let's break the loop
if blocking nfc_alloc_send_skb() failed.

[1] https://syzkaller.appspot.com/bug?id=4a131cc571c3733e0eff6bc673f4e36ae48f19c6

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+d29d18215e477cfbfbdd@syzkaller.appspotmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/nfc/llcp_commands.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 2ceefa183cee..6a196e438b6c 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -752,11 +752,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
 		pr_debug("Fragment %zd bytes remaining %zd",
 			 frag_len, remaining_len);
 
-		pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT,
+		pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0,
 					 frag_len + LLCP_HEADER_SIZE, &err);
 		if (pdu == NULL) {
-			pr_err("Could not allocate PDU\n");
-			continue;
+			pr_err("Could not allocate PDU (error=%d)\n", err);
+			len -= remaining_len;
+			if (len == 0)
+				len = err;
+			break;
 		}
 
 		pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI);
-- 
cgit v1.2.3


From 99be51f11d51400f744632f3938445a8d4de8943 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ursula.braun@linux.ibm.com>
Date: Wed, 18 Jul 2018 15:22:49 +0200
Subject: net/smc: optimize consumer cursor updates

The SMC protocol requires to send a separate consumer cursor update,
if it cannot be piggybacked to updates of the producer cursor.
Currently the decision to send a separate consumer cursor update
just considers the amount of data already received by the socket
program. It does not consider the amount of data already arrived, but
not yet consumed by the receiver. Basing the decision on the
difference between already confirmed and already arrived data
(instead of difference between already confirmed and already consumed
data), may lead to a somewhat earlier consumer cursor update send in
fast unidirectional traffic scenarios, and thus to better throughput.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Suggested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index cee666400752..f82886b7d1d8 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -495,7 +495,8 @@ out:
 
 void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 {
-	union smc_host_cursor cfed, cons;
+	union smc_host_cursor cfed, cons, prod;
+	int sender_free = conn->rmb_desc->len;
 	int to_confirm;
 
 	smc_curs_write(&cons,
@@ -505,11 +506,18 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 		       smc_curs_read(&conn->rx_curs_confirmed, conn),
 		       conn);
 	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
+	if (to_confirm > conn->rmbe_update_limit) {
+		smc_curs_write(&prod,
+			       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+			       conn);
+		sender_free = conn->rmb_desc->len -
+			      smc_curs_diff(conn->rmb_desc->len, &prod, &cfed);
+	}
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
 	    force ||
 	    ((to_confirm > conn->rmbe_update_limit) &&
-	     ((to_confirm > (conn->rmb_desc->len / 2)) ||
+	     ((sender_free <= (conn->rmb_desc->len / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
 		if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
 		    conn->alert_token_local) { /* connection healthy */
-- 
cgit v1.2.3


From ac0107edba253a6e58e923f9e68825decef3e681 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ursula.braun@linux.ibm.com>
Date: Wed, 18 Jul 2018 15:22:50 +0200
Subject: net/smc: add error handling for get_user()

For security reasons the return code of get_user() should always be
checked.

Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c12a7fc18f56..6e5479067db0 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1456,7 +1456,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
-	get_user(val, (int __user *)optval);
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
 
 	lock_sock(sk);
 	switch (optname) {
-- 
cgit v1.2.3


From f6bdc42f021194ec095914b92c7a8b1a09789e6d Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 18 Jul 2018 15:22:51 +0200
Subject: net/smc: reset recv timeout after clc handshake

During clc handshake the receive timeout is set to CLC_WAIT_TIME.
Remember and reset the original timeout value after the receive calls,
and remove a duplicate assignment of CLC_WAIT_TIME.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_clc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 717449b1da0b..ae5d168653ce 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -250,6 +250,7 @@ out:
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type)
 {
+	long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
 	struct sock *clc_sk = smc->clcsock->sk;
 	struct smc_clc_msg_hdr *clcm = buf;
 	struct msghdr msg = {NULL, 0};
@@ -306,7 +307,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	memset(&msg, 0, sizeof(struct msghdr));
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen);
 	krflags = MSG_WAITALL;
-	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
 	len = sock_recvmsg(smc->clcsock, &msg, krflags);
 	if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
 		smc->sk.sk_err = EPROTO;
@@ -322,6 +322,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 
 out:
+	smc->clcsock->sk->sk_rcvtimeo = rcvtimeo;
 	return reason_code;
 }
 
-- 
cgit v1.2.3


From 53189183909f392c2aff1177565eabbfc48b8524 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 17 Jul 2018 20:58:14 +0800
Subject: net: sched: Using NULL instead of plain integer

Fixes the following sparse warnings:

net/sched/cls_api.c:1101:43: warning: Using plain integer as NULL pointer
net/sched/cls_api.c:1492:75: warning: Using plain integer as NULL pointer

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index cdc3c87c53e6..f74513a7c7a8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1053,7 +1053,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 	for (tp = rtnl_dereference(chain->filter_chain);
 	     tp; tp = rtnl_dereference(tp->next))
 		tfilter_notify(net, oskb, n, tp, block,
-			       q, parent, 0, event, false);
+			       q, parent, NULL, event, false);
 }
 
 static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1444,7 +1444,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
 			memset(&cb->args[1], 0,
 			       sizeof(cb->args) - sizeof(cb->args[0]));
 		if (cb->args[1] == 0) {
-			if (tcf_fill_node(net, skb, tp, block, q, parent, 0,
+			if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
 					  NETLINK_CB(cb->skb).portid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					  RTM_NEWTFILTER) <= 0)
-- 
cgit v1.2.3


From 3ee593adbbb46d9e1bb1320915943926f4744483 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 17 Jul 2018 16:52:54 +0100
Subject: ipv6: sr: fix useless rol32 call on hash

The rol32 call is currently rotating hash but the rol'd value is
being discarded. I believe the current code is incorrect and hash
should be assigned the rotated value returned from rol32.

Detected by CoverityScan, CID#1468411 ("Useless call")

Fixes: b5facfdba14c ("ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: dlebrun@google.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 19ccf0dc996c..a8854dd3e9c5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
 
 	if (do_flowlabel > 0) {
 		hash = skb_get_hash(skb);
-		rol32(hash, 16);
+		hash = rol32(hash, 16);
 		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
 	} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
 		flowlabel = ip6_flowlabel(inner_hdr);
-- 
cgit v1.2.3


From e56b8ce363a36fb7b74b80aaa5cc9084f2c908b4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 17 Jul 2018 18:27:45 -0700
Subject: tcp: identify cryptic messages as TCP seq # bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Attempt to make cryptic TCP seq number error messages clearer by
(1) identifying the source of the message as "TCP", (2) identifying the
errors as "seq # bug", and (3) grouping the field identifiers and values
by separating them with commas.

E.g., the following message is changed from:

recvmsg bug 2: copied 73BCB6CD seq 70F17CBE rcvnxt 73BCB9AA fl 0
WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:1881 tcp_recvmsg+0x649/0xb90

to:

TCP recvmsg seq # bug 2: copied 73BCB6CD, seq 70F17CBE, rcvnxt 73BCB9AA, fl 0
WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:2011 tcp_recvmsg+0x694/0xba0

Suggested-by: 積丹尼 Dan Jacobson <jidanni@jidanni.org>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ec2186e3087f..4491faf83f4f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1998,7 +1998,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			 * shouldn't happen.
 			 */
 			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
-				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
 				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
 				 flags))
 				break;
@@ -2013,7 +2013,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
-			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
 		}
 
-- 
cgit v1.2.3


From 4905bd9a42271bbf7dbef06b5e1edb18f33ac27c Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Tue, 17 Jul 2018 18:10:37 +0300
Subject: net/page_pool: Fix inconsistent lock state warning

Fix the warning below by calling the ptr_ring_consume_bh,
which uses spin_[un]lock_bh.

[  179.064300] ================================
[  179.069073] WARNING: inconsistent lock state
[  179.073846] 4.18.0-rc2+ #18 Not tainted
[  179.078133] --------------------------------
[  179.082907] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  179.089637] swapper/21/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[  179.095478] 00000000963d1995 (&(&r->consumer_lock)->rlock){+.?.}, at:
__page_pool_empty_ring+0x61/0x100
[  179.105988] {SOFTIRQ-ON-W} state was registered at:
[  179.111443]   _raw_spin_lock+0x35/0x50
[  179.115634]   __page_pool_empty_ring+0x61/0x100
[  179.120699]   page_pool_destroy+0x32/0x50
[  179.125204]   mlx5e_free_rq+0x38/0xc0 [mlx5_core]
[  179.130471]   mlx5e_close_channel+0x20/0x120 [mlx5_core]
[  179.136418]   mlx5e_close_channels+0x26/0x40 [mlx5_core]
[  179.142364]   mlx5e_close_locked+0x44/0x50 [mlx5_core]
[  179.148509]   mlx5e_close+0x42/0x60 [mlx5_core]
[  179.153936]   __dev_close_many+0xb1/0x120
[  179.158749]   dev_close_many+0xa2/0x170
[  179.163364]   rollback_registered_many+0x148/0x460
[  179.169047]   rollback_registered+0x56/0x90
[  179.174043]   unregister_netdevice_queue+0x7e/0x100
[  179.179816]   unregister_netdev+0x18/0x20
[  179.184623]   mlx5e_remove+0x2a/0x50 [mlx5_core]
[  179.190107]   mlx5_remove_device+0xe5/0x110 [mlx5_core]
[  179.196274]   mlx5_unregister_interface+0x39/0x90 [mlx5_core]
[  179.203028]   cleanup+0x5/0xbfc [mlx5_core]
[  179.208031]   __x64_sys_delete_module+0x16b/0x240
[  179.213640]   do_syscall_64+0x5a/0x210
[  179.218151]   entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  179.224218] irq event stamp: 334398
[  179.228438] hardirqs last  enabled at (334398): [<ffffffffa511d8b7>]
rcu_process_callbacks+0x1c7/0x790
[  179.239178] hardirqs last disabled at (334397): [<ffffffffa511d872>]
rcu_process_callbacks+0x182/0x790
[  179.249931] softirqs last  enabled at (334386): [<ffffffffa509732e>] irq_enter+0x5e/0x70
[  179.259306] softirqs last disabled at (334387): [<ffffffffa509741c>] irq_exit+0xdc/0xf0
[  179.268584]
[  179.268584] other info that might help us debug this:
[  179.276572]  Possible unsafe locking scenario:
[  179.276572]
[  179.283877]        CPU0
[  179.286954]        ----
[  179.290033]   lock(&(&r->consumer_lock)->rlock);
[  179.295546]   <Interrupt>
[  179.298830]     lock(&(&r->consumer_lock)->rlock);
[  179.304550]
[  179.304550]  *** DEADLOCK ***

Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/page_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 68bf07206744..43a932cb609b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -269,7 +269,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	struct page *page;
 
 	/* Empty recycle ring */
-	while ((page = ptr_ring_consume(&pool->ring))) {
+	while ((page = ptr_ring_consume_bh(&pool->ring))) {
 		/* Verify the refcnt invariant of cached pages */
 		if (!(page_ref_count(page) == 1))
 			pr_crit("%s() page_pool refcnt %d violation\n",
-- 
cgit v1.2.3


From b8088dda98b9064a2b3007fe54b03ede70a15602 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 17 Jul 2018 07:17:53 +0200
Subject: netfilter: nf_tables: use dev->name directly

no need to store the name in separate area.

Furthermore, it uses kmalloc but not kfree and most accesses seem to treat
it as char[IFNAMSIZ] not char *.

Remove this and use dev->name instead.

In case event zeroed dev, just omit the name in the dump.

Fixes: d92191aa84e5f1 ("netfilter: nf_tables: cache device name in flowtable object")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  1 -
 net/netfilter/nf_tables_api.c     | 14 +++++---------
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 4e82a4c49912..dc417ef0a0c5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1124,7 +1124,6 @@ struct nft_flowtable {
 	u32				genmask:2,
 					use:30;
 	u64				handle;
-	char				*dev_name[NFT_FLOWTABLE_DEVICE_MAX];
 	/* runtime data below here */
 	struct nf_hook_ops		*ops ____cacheline_aligned;
 	struct nf_flowtable		data;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d41fa2c82f14..54a4f75ff9da 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5324,8 +5324,6 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		flowtable->ops[i].priv		= &flowtable->data;
 		flowtable->ops[i].hook		= flowtable->data.type->hook;
 		flowtable->ops[i].dev		= dev_array[i];
-		flowtable->dev_name[i]		= kstrdup(dev_array[i]->name,
-							  GFP_KERNEL);
 	}
 
 	return err;
@@ -5483,10 +5481,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 err6:
 	i = flowtable->ops_len;
 err5:
-	for (k = i - 1; k >= 0; k--) {
-		kfree(flowtable->dev_name[k]);
+	for (k = i - 1; k >= 0; k--)
 		nf_unregister_net_hook(net, &flowtable->ops[k]);
-	}
 
 	kfree(flowtable->ops);
 err4:
@@ -5585,9 +5581,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 		goto nla_put_failure;
 
 	for (i = 0; i < flowtable->ops_len; i++) {
-		if (flowtable->dev_name[i][0] &&
-		    nla_put_string(skb, NFTA_DEVICE_NAME,
-				   flowtable->dev_name[i]))
+		const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
+
+		if (dev &&
+		    nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
 			goto nla_put_failure;
 	}
 	nla_nest_end(skb, nest_devs);
@@ -5829,7 +5826,6 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev,
 			continue;
 
 		nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
-		flowtable->dev_name[i][0] = '\0';
 		flowtable->ops[i].dev = NULL;
 		break;
 	}
-- 
cgit v1.2.3


From a12486ebe104190a6c10557134014290afa98370 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 17 Jul 2018 07:17:54 +0200
Subject: netfilter: nf_tables: free flow table struct too

Fixes: 3b49e2e94e6ebb ("netfilter: nf_tables: add flow table netlink frontend")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 54a4f75ff9da..200da08524ae 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5784,6 +5784,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 	kfree(flowtable->name);
 	flowtable->data.type->free(&flowtable->data);
 	module_put(flowtable->data.type->owner);
+	kfree(flowtable);
 }
 
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
-- 
cgit v1.2.3


From 9f8aac0be21ed5f99bd5ba0ff315d710737d1794 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 17 Jul 2018 07:17:55 +0200
Subject: netfilter: nf_tables: fix memory leaks on chain rename

The new name is stored in the transaction metadata, on commit,
the pointers to the old and new names are swapped.

Therefore in abort and commit case we have to free the
pointer in the chain_trans container.

In commit case, the pointer can be used by another cpu that
is currently dumping the renamed chain, thus kfree needs to
happen after waiting for rcu readers to complete.

Fixes: b7263e071a ("netfilter: nf_tables: Allow chain name of up to 255 chars")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 200da08524ae..91230d713190 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6087,6 +6087,9 @@ static void nft_commit_release(struct nft_trans *trans)
 	case NFT_MSG_DELTABLE:
 		nf_tables_table_destroy(&trans->ctx);
 		break;
+	case NFT_MSG_NEWCHAIN:
+		kfree(nft_trans_chain_name(trans));
+		break;
 	case NFT_MSG_DELCHAIN:
 		nf_tables_chain_destroy(&trans->ctx);
 		break;
@@ -6316,13 +6319,15 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
 			break;
 		case NFT_MSG_NEWCHAIN:
-			if (nft_trans_chain_update(trans))
+			if (nft_trans_chain_update(trans)) {
 				nft_chain_commit_update(trans);
-			else
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				/* trans destroyed after rcu grace period */
+			} else {
 				nft_clear(net, trans->ctx.chain);
-
-			nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
-			nft_trans_destroy(trans);
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nft_trans_destroy(trans);
+			}
 			break;
 		case NFT_MSG_DELCHAIN:
 			nft_chain_del(trans->ctx.chain);
@@ -6472,7 +6477,7 @@ static int __nf_tables_abort(struct net *net)
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
 				free_percpu(nft_trans_chain_stats(trans));
-
+				kfree(nft_trans_chain_name(trans));
 				nft_trans_destroy(trans);
 			} else {
 				trans->ctx.table->use--;
-- 
cgit v1.2.3


From c6cc94df65c3174be92afbee638f11cbb5e606a7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 17 Jul 2018 07:17:56 +0200
Subject: netfilter: nf_tables: don't allow to rename to already-pending name

Its possible to rename two chains to the same name in one
transaction:

nft add chain t c1
nft add chain t c2
nft 'rename chain t c1 c3;rename chain t c2 c3'

This creates two chains named 'c3'.

Appears to be harmless, both chains can still be deleted both
by name or handle, but, nevertheless, its a bug.

Walk transaction log and also compare vs. the pending renames.

Both chains can still be deleted, but nevertheless it is a bug as
we don't allow to create chains with identical names, so we should
prevent this from happening-by-rename too.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 91230d713190..d7b9748e338e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1598,7 +1598,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 	struct nft_base_chain *basechain;
 	struct nft_stats *stats = NULL;
 	struct nft_chain_hook hook;
-	const struct nlattr *name;
 	struct nf_hook_ops *ops;
 	struct nft_trans *trans;
 	int err;
@@ -1646,12 +1645,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			return PTR_ERR(stats);
 	}
 
+	err = -ENOMEM;
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
 				sizeof(struct nft_trans_chain));
-	if (trans == NULL) {
-		free_percpu(stats);
-		return -ENOMEM;
-	}
+	if (trans == NULL)
+		goto err;
 
 	nft_trans_chain_stats(trans) = stats;
 	nft_trans_chain_update(trans) = true;
@@ -1661,19 +1659,37 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 	else
 		nft_trans_chain_policy(trans) = -1;
 
-	name = nla[NFTA_CHAIN_NAME];
-	if (nla[NFTA_CHAIN_HANDLE] && name) {
-		nft_trans_chain_name(trans) =
-			nla_strdup(name, GFP_KERNEL);
-		if (!nft_trans_chain_name(trans)) {
-			kfree(trans);
-			free_percpu(stats);
-			return -ENOMEM;
+	if (nla[NFTA_CHAIN_HANDLE] &&
+	    nla[NFTA_CHAIN_NAME]) {
+		struct nft_trans *tmp;
+		char *name;
+
+		err = -ENOMEM;
+		name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+		if (!name)
+			goto err;
+
+		err = -EEXIST;
+		list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
+			if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
+			    tmp->ctx.table == table &&
+			    nft_trans_chain_update(tmp) &&
+			    nft_trans_chain_name(tmp) &&
+			    strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+				kfree(name);
+				goto err;
+			}
 		}
+
+		nft_trans_chain_name(trans) = name;
 	}
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 
 	return 0;
+err:
+	free_percpu(stats);
+	kfree(trans);
+	return err;
 }
 
 static int nf_tables_newchain(struct net *net, struct sock *nlsk,
-- 
cgit v1.2.3


From 6613b6173dee098997229caf1f3b961c49da75e6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 17 Jul 2018 21:03:15 +0200
Subject: netfilter: conntrack: dccp: treat SYNC/SYNCACK as invalid if no prior
 state

When first DCCP packet is SYNC or SYNCACK, we insert a new conntrack
that has an un-initialized timeout value, i.e. such entry could be
reaped at any time.

Mark them as INVALID and only ignore SYNC/SYNCACK when connection had
an old state.

Reported-by: syzbot+6f18401420df260e37ed@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_dccp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index abe647d5b8c6..9ce6336d1e55 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -243,14 +243,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
 		 * We currently ignore Sync packets
 		 *
 		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
 		},
 		[DCCP_PKT_SYNCACK] = {
 		/*
 		 * We currently ignore SyncAck packets
 		 *
 		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
 		},
 	},
 	[CT_DCCP_ROLE_SERVER] = {
@@ -371,14 +371,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
 		 * We currently ignore Sync packets
 		 *
 		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
 		},
 		[DCCP_PKT_SYNCACK] = {
 		/*
 		 * We currently ignore SyncAck packets
 		 *
 		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
 		},
 	},
 };
-- 
cgit v1.2.3


From 2987babb6982306509380fc11b450227a844493b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:34 -0700
Subject: tcp: helpers to send special DCTCP ack

Refactor and create helpers to send the special ACK in DCTCP.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 00e5a300ddb9..ee1b0705321d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1023,8 +1023,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
-			    gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet;
@@ -1100,7 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	th->source		= inet->inet_sport;
 	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
-	th->ack_seq		= htonl(tp->rcv_nxt);
+	th->ack_seq		= htonl(rcv_nxt);
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->tcp_flags);
 
@@ -1178,6 +1178,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	return err;
 }
 
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			    gfp_t gfp_mask)
+{
+	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+				  tcp_sk(sk)->rcv_nxt);
+}
+
 /* This routine just queues the buffer for sending.
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -3571,7 +3578,7 @@ void tcp_send_delayed_ack(struct sock *sk)
 }
 
 /* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
 {
 	struct sk_buff *buff;
 
@@ -3604,7 +3611,12 @@ void tcp_send_ack(struct sock *sk)
 	skb_set_tcp_pure_ack(buff);
 
 	/* Send it off, this clears delayed acks for us. */
-	tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+
+void tcp_send_ack(struct sock *sk)
+{
+	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
 }
 EXPORT_SYMBOL_GPL(tcp_send_ack);
 
-- 
cgit v1.2.3


From 27cde44a259c380a3c09066fc4b42de7dde9b1ad Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:35 -0700
Subject: tcp: do not cancel delay-AcK on DCTCP special ACK

Currently when a DCTCP receiver delays an ACK and receive a
data packet with a different CE mark from the previous one's, it
sends two immediate ACKs acking previous and latest sequences
respectly (for ECN accounting).

Previously sending the first ACK may mark off the delayed ACK timer
(tcp_event_ack_sent). This may subsequently prevent sending the
second ACK to acknowledge the latest sequence (tcp_ack_snd_check).
The culprit is that tcp_send_ack() assumes it always acknowleges
the latest sequence, which is not true for the first special ACK.

The fix is to not make the assumption in tcp_send_ack and check the
actual ack sequence before cancelling the delayed ACK. Further it's
safer to pass the ack sequence number as a local variable into
tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid
future bugs like this.

Reported-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  1 +
 net/ipv4/tcp_dctcp.c  | 34 ++++------------------------------
 net/ipv4/tcp_output.c | 10 +++++++---
 3 files changed, 12 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3482d13d655b..a08de496d1b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -539,6 +539,7 @@ void tcp_send_fin(struct sock *sk);
 void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
 void tcp_send_ack(struct sock *sk);
 void tcp_send_delayed_ack(struct sock *sk);
 void tcp_send_loss_probe(struct sock *sk);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5869f89ca656..078328afbfe3 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -133,21 +133,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	 * ACK has not sent yet.
 	 */
 	if (!ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
-		u32 tmp_rcv_nxt;
-
-		/* Save current rcv_nxt. */
-		tmp_rcv_nxt = tp->rcv_nxt;
-
-		/* Generate previous ack with CE=0. */
-		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-		tp->rcv_nxt = ca->prior_rcv_nxt;
-
-		tcp_send_ack(sk);
-
-		/* Recover current rcv_nxt. */
-		tp->rcv_nxt = tmp_rcv_nxt;
-	}
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+		__tcp_send_ack(sk, ca->prior_rcv_nxt);
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 1;
@@ -164,21 +151,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	 * ACK has not sent yet.
 	 */
 	if (ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
-		u32 tmp_rcv_nxt;
-
-		/* Save current rcv_nxt. */
-		tmp_rcv_nxt = tp->rcv_nxt;
-
-		/* Generate previous ack with CE=1. */
-		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
-		tp->rcv_nxt = ca->prior_rcv_nxt;
-
-		tcp_send_ack(sk);
-
-		/* Recover current rcv_nxt. */
-		tp->rcv_nxt = tmp_rcv_nxt;
-	}
+	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+		__tcp_send_ack(sk, ca->prior_rcv_nxt);
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ee1b0705321d..c4172c1fb198 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 }
 
 /* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
+				      u32 rcv_nxt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 			__sock_put(sk);
 	}
+
+	if (unlikely(rcv_nxt != tp->rcv_nxt))
+		return;  /* Special ACK sent by DCTCP to reflect ECN */
 	tcp_dec_quickack_mode(sk, pkts);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
@@ -1141,7 +1145,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	icsk->icsk_af_ops->send_check(sk, skb);
 
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
-		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
 
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
@@ -3613,12 +3617,12 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
 	/* Send it off, this clears delayed acks for us. */
 	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
 }
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
 
 void tcp_send_ack(struct sock *sk)
 {
 	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
 }
-EXPORT_SYMBOL_GPL(tcp_send_ack);
 
 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.
-- 
cgit v1.2.3


From a0496ef2c23b3b180902dd185d0d63ccbc624cf8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 18 Jul 2018 13:56:36 -0700
Subject: tcp: do not delay ACK in DCTCP upon CE status change

Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change
has to be sent immediately so the sender can respond quickly:

""" When receiving packets, the CE codepoint MUST be processed as follows:

   1.  If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to
       true and send an immediate ACK.

   2.  If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE
       to false and send an immediate ACK.
"""

Previously DCTCP implementation may continue to delay the ACK. This
patch fixes that to implement the RFC by forcing an immediate ACK.

Tested with this packetdrill script provided by Larry Brakmo

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
   +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
+0.005 < [ce] . 2001:3001(1000) ack 2 win 257

+0.000 > [ect01] . 2:2(0) ack 2001
// Previously the ACK below would be delayed by 40ms
+0.000 > [ect01] E. 2:2(0) ack 3001

+0.500 < F. 9501:9501(0) ack 4 win 257

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  1 +
 net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------
 net/ipv4/tcp_input.c |  3 ++-
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a08de496d1b2..25116ec02087 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
 			struct pipe_inode_info *pipe, size_t len,
 			unsigned int flags);
 
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
 static inline void tcp_dec_quickack_mode(struct sock *sk,
 					 const unsigned int pkts)
 {
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 078328afbfe3..8b637f9f23a2 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -129,12 +129,15 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
 	struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* State has changed from CE=0 to CE=1 and delayed
-	 * ACK has not sent yet.
-	 */
-	if (!ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
-		__tcp_send_ack(sk, ca->prior_rcv_nxt);
+	if (!ca->ce_state) {
+		/* State has changed from CE=0 to CE=1, force an immediate
+		 * ACK to reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+			__tcp_send_ack(sk, ca->prior_rcv_nxt);
+		tcp_enter_quickack_mode(sk, 1);
+	}
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 1;
@@ -147,12 +150,15 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
 	struct dctcp *ca = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* State has changed from CE=1 to CE=0 and delayed
-	 * ACK has not sent yet.
-	 */
-	if (ca->ce_state &&
-	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
-		__tcp_send_ack(sk, ca->prior_rcv_nxt);
+	if (ca->ce_state) {
+		/* State has changed from CE=1 to CE=0, force an immediate
+		 * ACK to reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+			__tcp_send_ack(sk, ca->prior_rcv_nxt);
+		tcp_enter_quickack_mode(sk, 1);
+	}
 
 	ca->prior_rcv_nxt = tp->rcv_nxt;
 	ca->ce_state = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e5522c6833a..6bade06aaf72 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -215,7 +215,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
 		icsk->icsk_ack.quick = quickacks;
 }
 
-static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -223,6 +223,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 	icsk->icsk_ack.pingpong = 0;
 	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
 
 /* Send ACKs quickly, if "quick" count is not exhausted
  * and the session is not interactive.
-- 
cgit v1.2.3


From fcf4793e278edede8fcd748198d12128037e526c Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes <doronrk@fb.com>
Date: Wed, 18 Jul 2018 16:22:27 -0700
Subject: tls: check RCV_SHUTDOWN in tls_wait_data

The current code does not check sk->sk_shutdown & RCV_SHUTDOWN.
tls_sw_recvmsg may return a positive value in the case where bytes have
already been copied when the socket is shutdown. sk->sk_err has been
cleared, causing the tls_wait_data to hang forever on a subsequent
invocation. Checking sk->sk_shutdown & RCV_SHUTDOWN, as in tcp_recvmsg,
fixes this problem.

Fixes: c46234ebb4d1 ("tls: RX path for ktls")
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4618f1c31137..1f3d9789af30 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -646,6 +646,9 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
 			return NULL;
 		}
 
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			return NULL;
+
 		if (sock_flag(sk, SOCK_DONE))
 			return NULL;
 
-- 
cgit v1.2.3


From 64119e05f7b31e83e2555f6782e6cdc8f81c63f4 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 19 Jul 2018 10:27:13 +0800
Subject: net: caif: Add a missing rcu_read_unlock() in caif_flow_cb

Add a missing rcu_read_unlock in the error path

Fixes: c95567c80352 ("caif: added check for potential null return")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index e0adcd123f48..711d7156efd8 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -131,8 +131,10 @@ static void caif_flow_cb(struct sk_buff *skb)
 	caifd = caif_get(skb->dev);
 
 	WARN_ON(caifd == NULL);
-	if (caifd == NULL)
+	if (!caifd) {
+		rcu_read_unlock();
 		return;
+	}
 
 	caifd_hold(caifd);
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 24b711edfc34bc45777a3f068812b7d1ed004a5d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 19 Jul 2018 12:41:18 -0700
Subject: net/ipv6: Fix linklocal to global address with VRF

Example setup:
    host: ip -6 addr add dev eth1 2001:db8:104::4
           where eth1 is enslaved to a VRF

    switch: ip -6 ro add 2001:db8:104::4/128 dev br1
            where br1 only has an LLA

           ping6 2001:db8:104::4
           ssh   2001:db8:104::4

(NOTE: UDP works fine if the PKTINFO has the address set to the global
address and ifindex is set to the index of eth1 with a destination an
LLA).

For ICMP, icmp6_iif needs to be updated to check if skb->dev is an
L3 master. If it is then return the ifindex from rt6i_idev similar
to what is done for loopback.

For TCP, restore the original tcp_v6_iif definition which is needed in
most places and add a new tcp_v6_iif_l3_slave that considers the
l3_slave variability. This latter check is only needed for socket
lookups.

Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 5 +++++
 net/ipv6/icmp.c     | 5 +++--
 net/ipv6/tcp_ipv6.c | 6 ++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 25116ec02087..cd3ecda9386a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -840,6 +840,11 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
  * as TCP moves IP6CB into a different location in skb->cb[]
  */
 static inline int tcp_v6_iif(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->header.h6.iif;
+}
+
+static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
 {
 	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index be491bf6ab6e..ef2505aefc15 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb)
 
 	/* for local traffic to local address, skb dev is the loopback
 	 * device. Check if there is a dst attached to the skb and if so
-	 * get the real device index.
+	 * get the real device index. Same is needed for replies to a link
+	 * local address on a device enslaved to an L3 master device
 	 */
-	if (unlikely(iif == LOOPBACK_IFINDEX)) {
+	if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
 		const struct rt6_info *rt6 = skb_rt6_info(skb);
 
 		if (rt6)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7efa9fd7e109..03e6b7a2bc53 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 					   &tcp_hashinfo, NULL, 0,
 					   &ipv6h->saddr,
 					   th->source, &ipv6h->daddr,
-					   ntohs(th->source), tcp_v6_iif(skb),
+					   ntohs(th->source),
+					   tcp_v6_iif_l3_slave(skb),
 					   tcp_v6_sdif(skb));
 		if (!sk1)
 			goto out;
@@ -1609,7 +1610,8 @@ do_time_wait:
 					    skb, __tcp_hdrlen(th),
 					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
-					    ntohs(th->dest), tcp_v6_iif(skb),
+					    ntohs(th->dest),
+					    tcp_v6_iif_l3_slave(skb),
 					    sdif);
 		if (sk2) {
 			struct inet_timewait_sock *tw = inet_twsk(sk);
-- 
cgit v1.2.3


From ff907a11a0d68a749ce1a321f4505c03bf72190c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Jul 2018 16:04:38 -0700
Subject: net: skb_segment() should not return NULL

syzbot caught a NULL deref [1], caused by skb_segment()

skb_segment() has many "goto err;" that assume the @err variable
contains -ENOMEM.

A successful call to __skb_linearize() should not clear @err,
otherwise a subsequent memory allocation error could return NULL.

While we are at it, we might use -EINVAL instead of -ENOMEM when
MAX_SKB_FRAGS limit is reached.

[1]
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
CPU: 0 PID: 13285 Comm: syz-executor3 Not tainted 4.18.0-rc4+ #146
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:tcp_gso_segment+0x3dc/0x1780 net/ipv4/tcp_offload.c:106
Code: f0 ff ff 0f 87 1c fd ff ff e8 00 88 0b fb 48 8b 75 d0 48 b9 00 00 00 00 00 fc ff df 48 8d be 90 00 00 00 48 89 f8 48 c1 e8 03 <0f> b6 14 08 48 8d 86 94 00 00 00 48 89 c6 83 e0 07 48 c1 ee 03 0f
RSP: 0018:ffff88019b7fd060 EFLAGS: 00010206
RAX: 0000000000000012 RBX: 0000000000000020 RCX: dffffc0000000000
RDX: 0000000000040000 RSI: 0000000000000000 RDI: 0000000000000090
RBP: ffff88019b7fd0f0 R08: ffff88019510e0c0 R09: ffffed003b5c46d6
R10: ffffed003b5c46d6 R11: ffff8801dae236b3 R12: 0000000000000001
R13: ffff8801d6c581f4 R14: 0000000000000000 R15: ffff8801d6c58128
FS:  00007fcae64d6700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004e8664 CR3: 00000001b669b000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 tcp4_gso_segment+0x1c3/0x440 net/ipv4/tcp_offload.c:54
 inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
 inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
 skb_mac_gso_segment+0x3b5/0x740 net/core/dev.c:2792
 __skb_gso_segment+0x3c3/0x880 net/core/dev.c:2865
 skb_gso_segment include/linux/netdevice.h:4099 [inline]
 validate_xmit_skb+0x640/0xf30 net/core/dev.c:3104
 __dev_queue_xmit+0xc14/0x3910 net/core/dev.c:3561
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
 neigh_hh_output include/net/neighbour.h:473 [inline]
 neigh_output include/net/neighbour.h:481 [inline]
 ip_finish_output2+0x1063/0x1860 net/ipv4/ip_output.c:229
 ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
 NF_HOOK_COND include/linux/netfilter.h:276 [inline]
 ip_output+0x223/0x880 net/ipv4/ip_output.c:405
 dst_output include/net/dst.h:444 [inline]
 ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
 iptunnel_xmit+0x567/0x850 net/ipv4/ip_tunnel_core.c:91
 ip_tunnel_xmit+0x1598/0x3af1 net/ipv4/ip_tunnel.c:778
 ipip_tunnel_xmit+0x264/0x2c0 net/ipv4/ipip.c:308
 __netdev_start_xmit include/linux/netdevice.h:4148 [inline]
 netdev_start_xmit include/linux/netdevice.h:4157 [inline]
 xmit_one net/core/dev.c:3034 [inline]
 dev_hard_start_xmit+0x26c/0xc30 net/core/dev.c:3050
 __dev_queue_xmit+0x29ef/0x3910 net/core/dev.c:3569
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
 neigh_direct_output+0x15/0x20 net/core/neighbour.c:1403
 neigh_output include/net/neighbour.h:483 [inline]
 ip_finish_output2+0xa67/0x1860 net/ipv4/ip_output.c:229
 ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
 NF_HOOK_COND include/linux/netfilter.h:276 [inline]
 ip_output+0x223/0x880 net/ipv4/ip_output.c:405
 dst_output include/net/dst.h:444 [inline]
 ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
 ip_queue_xmit+0x9df/0x1f80 net/ipv4/ip_output.c:504
 tcp_transmit_skb+0x1bf9/0x3f10 net/ipv4/tcp_output.c:1168
 tcp_write_xmit+0x1641/0x5c20 net/ipv4/tcp_output.c:2363
 __tcp_push_pending_frames+0xb2/0x290 net/ipv4/tcp_output.c:2536
 tcp_push+0x638/0x8c0 net/ipv4/tcp.c:735
 tcp_sendmsg_locked+0x2ec5/0x3f00 net/ipv4/tcp.c:1410
 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1447
 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
 sock_sendmsg_nosec net/socket.c:641 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:651
 __sys_sendto+0x3d7/0x670 net/socket.c:1797
 __do_sys_sendto net/socket.c:1809 [inline]
 __se_sys_sendto net/socket.c:1805 [inline]
 __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1805
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x455ab9
Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fcae64d5c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00007fcae64d66d4 RCX: 0000000000455ab9
RDX: 0000000000000001 RSI: 0000000020000200 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014
R13: 00000000004c1145 R14: 00000000004d1818 R15: 0000000000000006
Modules linked in:
Dumping ftrace buffer:
   (ftrace buffer empty)

Fixes: ddff00d42043 ("net: Move skb_has_shared_frag check out of GRE code and into segmentation")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8e51f8555e11..fb35b62af272 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3720,6 +3720,7 @@ normal:
 				net_warn_ratelimited(
 					"skb_segment: too many frags: %u %u\n",
 					pos, mss);
+				err = -EINVAL;
 				goto err;
 			}
 
@@ -3753,11 +3754,10 @@ skip_fraglist:
 
 perform_csum_check:
 		if (!csum) {
-			if (skb_has_shared_frag(nskb)) {
-				err = __skb_linearize(nskb);
-				if (err)
-					goto err;
-			}
+			if (skb_has_shared_frag(nskb) &&
+			    __skb_linearize(nskb))
+				goto err;
+
 			if (!nskb->remcsum_offload)
 				nskb->ip_summed = CHECKSUM_NONE;
 			SKB_GSO_CB(nskb)->csum =
-- 
cgit v1.2.3


From 08d3ffcc0cfaba36f6b86fd568cc3bc773061fa6 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 20 Jul 2018 14:04:27 +0800
Subject: multicast: do not restore deleted record source filter mode to new
 one

There are two scenarios that we will restore deleted records. The first is
when device down and up(or unmap/remap). In this scenario the new filter
mode is same with previous one. Because we get it from in_dev->mc_list and
we do not touch it during device down and up.

The other scenario is when a new socket join a group which was just delete
and not finish sending status reports. In this scenario, we should use the
current filter mode instead of restore old one. Here are 4 cases in total.

old_socket        new_socket       before_fix       after_fix
  IN(A)             IN(A)           ALLOW(A)         ALLOW(A)
  IN(A)             EX( )           TO_IN( )         TO_EX( )
  EX( )             IN(A)           TO_EX( )         ALLOW(A)
  EX( )             EX( )           TO_EX( )         TO_EX( )

Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down)
Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down)
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 3 +--
 net/ipv6/mcast.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b3c899a630a0..28fef7d15959 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,8 +1200,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
 	spin_lock_bh(&im->lock);
 	if (pmc) {
 		im->interface = pmc->interface;
-		im->sfmode = pmc->sfmode;
-		if (pmc->sfmode == MCAST_INCLUDE) {
+		if (im->sfmode == MCAST_INCLUDE) {
 			im->tomb = pmc->tomb;
 			im->sources = pmc->sources;
 			for (psf = im->sources; psf; psf = psf->sf_next)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 2699be7202be..f60f310785fd 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -790,8 +790,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
 	spin_lock_bh(&im->mca_lock);
 	if (pmc) {
 		im->idev = pmc->idev;
-		im->mca_sfmode = pmc->mca_sfmode;
-		if (pmc->mca_sfmode == MCAST_INCLUDE) {
+		if (im->mca_sfmode == MCAST_INCLUDE) {
 			im->mca_tomb = pmc->mca_tomb;
 			im->mca_sources = pmc->mca_sources;
 			for (psf = im->mca_sources; psf; psf = psf->sf_next)
-- 
cgit v1.2.3


From 5025f7f7d506fba9b39e7fe8ca10f6f34cb9bc2d Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 20 Jul 2018 13:21:01 -0700
Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link

rtnl_configure_link sets dev->rtnl_link_state to
RTNL_LINK_INITIALIZED and unconditionally calls
__dev_notify_flags to notify user-space of dev flags.

current call sequence for rtnl_configure_link
rtnetlink_newlink
    rtnl_link_ops->newlink
    rtnl_configure_link (unconditionally notifies userspace of
                         default and new dev flags)

If a newlink handler wants to call rtnl_configure_link
early, we will end up with duplicate notifications to
user-space.

This patch fixes rtnl_configure_link to check rtnl_link_state
and call __dev_notify_flags with gchanges = 0 if already
RTNL_LINK_INITIALIZED.

Later in the series, this patch will help the following sequence
where a driver implementing newlink can call rtnl_configure_link
to initialize the link early.

makes the following call sequence work:
rtnetlink_newlink
    rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes
                                                link and notifies
                                                user-space of default
                                                dev flags)
    rtnl_configure_link (updates dev flags if requested by user ifm
                         and notifies user-space of new dev flags)

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ef61222fdef..e3f743c141b3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2759,9 +2759,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
 			return err;
 	}
 
-	dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
-
-	__dev_notify_flags(dev, old_flags, ~0U);
+	if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
+		__dev_notify_flags(dev, old_flags, 0U);
+	} else {
+		dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+		__dev_notify_flags(dev, old_flags, ~0U);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(rtnl_configure_link);
-- 
cgit v1.2.3


From e873e4b9cc7e8ce79e5c5627b32b107035bb3f5d Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Sat, 21 Jul 2018 20:56:32 -0700
Subject: ipv6: use fib6_info_hold_safe() when necessary

In the code path where only rcu read lock is held, e.g. in the route
lookup code path, it is not safe to directly call fib6_info_hold()
because the fib6_info may already have been deleted but still exists
in the rcu grace period. Holding reference to it could cause double
free and crash the kernel.

This patch adds a new function fib6_info_hold_safe() and replace
fib6_info_hold() in all necessary places.

Syzbot reported 3 crash traces because of this. One of them is:
8021q: adding VLAN 0 to HW filter on device team0
IPv6: ADDRCONF(NETDEV_CHANGE): team0: link becomes ready
dst_release: dst:(____ptrval____) refcnt:-1
dst_release: dst:(____ptrval____) refcnt:-2
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 dst_hold include/net/dst.h:239 [inline]
WARNING: CPU: 1 PID: 4845 at include/net/dst.h:239 ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
dst_release: dst:(____ptrval____) refcnt:-1
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 4845 Comm: syz-executor493 Not tainted 4.18.0-rc3+ #10
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 panic+0x238/0x4e7 kernel/panic.c:184
dst_release: dst:(____ptrval____) refcnt:-2
dst_release: dst:(____ptrval____) refcnt:-3
 __warn.cold.8+0x163/0x1ba kernel/panic.c:536
dst_release: dst:(____ptrval____) refcnt:-4
 report_bug+0x252/0x2d0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 do_error_trap+0x1fc/0x4d0 arch/x86/kernel/traps.c:296
dst_release: dst:(____ptrval____) refcnt:-5
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:316
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:dst_hold include/net/dst.h:239 [inline]
RIP: 0010:ip6_setup_cork+0xd66/0x1830 net/ipv6/ip6_output.c:1204
Code: c1 ed 03 89 9d 18 ff ff ff 48 b8 00 00 00 00 00 fc ff df 41 c6 44 05 00 f8 e9 2d 01 00 00 4c 8b a5 c8 fe ff ff e8 1a f6 e6 fa <0f> 0b e9 6a fc ff ff e8 0e f6 e6 fa 48 8b 85 d0 fe ff ff 48 8d 78
RSP: 0018:ffff8801a8fcf178 EFLAGS: 00010293
RAX: ffff8801a8eba5c0 RBX: 0000000000000000 RCX: ffffffff869511e6
RDX: 0000000000000000 RSI: ffffffff869515b6 RDI: 0000000000000005
RBP: ffff8801a8fcf2c8 R08: ffff8801a8eba5c0 R09: ffffed0035ac8338
R10: ffffed0035ac8338 R11: ffff8801ad6419c3 R12: ffff8801a8fcf720
R13: ffff8801a8fcf6a0 R14: ffff8801ad6419c0 R15: ffff8801ad641980
 ip6_make_skb+0x2c8/0x600 net/ipv6/ip6_output.c:1768
 udpv6_sendmsg+0x2c90/0x35f0 net/ipv6/udp.c:1376
 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
 sock_sendmsg_nosec net/socket.c:641 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:651
 ___sys_sendmsg+0x51d/0x930 net/socket.c:2125
 __sys_sendmmsg+0x240/0x6f0 net/socket.c:2220
 __do_sys_sendmmsg net/socket.c:2249 [inline]
 __se_sys_sendmmsg net/socket.c:2246 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2246
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446ba9
Code: e8 cc bb 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fb39a469da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00000000006dcc54 RCX: 0000000000446ba9
RDX: 00000000000000b8 RSI: 0000000020001b00 RDI: 0000000000000003
RBP: 00000000006dcc50 R08: 00007fb39a46a700 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 45c828efc7a64843
R13: e6eeb815b9d8a477 R14: 5068caf6f713c6fc R15: 0000000000000001
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: syzbot+902e2a1bcd4f7808cef5@syzkaller.appspotmail.com
Reported-by: syzbot+8ae62d67f647abeeceb9@syzkaller.appspotmail.com
Reported-by: syzbot+3f08feb14086930677d0@syzkaller.appspotmail.com
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/addrconf.c   |  3 ++-
 net/ipv6/route.c      | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 71b9043aa0e7..3d4930528db0 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -281,6 +281,11 @@ static inline void fib6_info_hold(struct fib6_info *f6i)
 	atomic_inc(&f6i->fib6_ref);
 }
 
+static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
+{
+	return atomic_inc_not_zero(&f6i->fib6_ref);
+}
+
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 91580c62bb86..f66a1cae3366 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2374,7 +2374,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 			continue;
 		if ((rt->fib6_flags & noflags) != 0)
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ce0bd17de4f..ec18b3ce8b6d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -972,10 +972,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 	rt->dst.lastuse = jiffies;
 }
 
+/* Caller must already hold reference to @from */
 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	fib6_info_hold(from);
 	rcu_assign_pointer(rt->from, from);
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
@@ -984,6 +984,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 	}
 }
 
+/* Caller must already hold reference to @ort */
 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
 	struct net_device *dev = fib6_info_nh_dev(ort);
@@ -1044,9 +1045,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rt6_info *nrt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	if (nrt)
 		ip6_rt_copy_init(nrt, rt);
+	else
+		fib6_info_release(rt);
 
 	return nrt;
 }
@@ -1178,10 +1184,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 	 *	Clone the route.
 	 */
 
+	if (!fib6_info_hold_safe(ort))
+		return NULL;
+
 	dev = ip6_rt_get_dev_rcu(ort);
 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
-	if (!rt)
+	if (!rt) {
+		fib6_info_release(ort);
 		return NULL;
+	}
 
 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
@@ -1210,12 +1221,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	struct net_device *dev;
 	struct rt6_info *pcpu_rt;
 
+	if (!fib6_info_hold_safe(rt))
+		return NULL;
+
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(rt);
 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	rcu_read_unlock();
-	if (!pcpu_rt)
+	if (!pcpu_rt) {
+		fib6_info_release(rt);
 		return NULL;
+	}
 	ip6_rt_copy_init(pcpu_rt, rt);
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
@@ -2486,7 +2502,7 @@ restart:
 
 out:
 	if (ret)
-		dst_hold(&ret->dst);
+		ip6_hold_safe(net, &ret, true);
 	else
 		ret = ip6_create_rt_rcu(rt);
 
@@ -3303,7 +3319,8 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
 				continue;
-			fib6_info_hold(rt);
+			if (!fib6_info_hold_safe(rt))
+				continue;
 			rcu_read_unlock();
 
 			/* if gateway was specified only delete the one hop */
@@ -3409,6 +3426,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 
 	rcu_read_lock();
 	from = rcu_dereference(rt->from);
+	/* This fib6_info_hold() is safe here because we hold reference to rt
+	 * and rt already holds reference to fib6_info.
+	 */
 	fib6_info_hold(from);
 	rcu_read_unlock();
 
@@ -3470,7 +3490,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 			continue;
 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
-		fib6_info_hold(rt);
+		if (!fib6_info_hold_safe(rt))
+			continue;
 		break;
 	}
 out:
@@ -3530,8 +3551,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
 			break;
 	}
-	if (rt)
-		fib6_info_hold(rt);
+	if (rt && !fib6_info_hold_safe(rt))
+		rt = NULL;
 	rcu_read_unlock();
 	return rt;
 }
@@ -3579,8 +3600,8 @@ restart:
 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
 
 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-		    (!idev || idev->cnf.accept_ra != 2)) {
-			fib6_info_hold(rt);
+		    (!idev || idev->cnf.accept_ra != 2) &&
+		    fib6_info_hold_safe(rt)) {
 			rcu_read_unlock();
 			ip6_del_rt(net, rt);
 			goto restart;
-- 
cgit v1.2.3


From 3dd1c9a1270736029ffca670e9bd0265f4120600 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 23 Jul 2018 16:50:48 +0200
Subject: ip: hash fragments consistently

The skb hash for locally generated ip[v6] fragments belonging
to the same datagram can vary in several circumstances:
* for connected UDP[v6] sockets, the first fragment get its hash
  via set_owner_w()/skb_set_hash_from_sk()
* for unconnected IPv6 UDPv6 sockets, the first fragment can get
  its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if
  auto_flowlabel is enabled

For the following frags the hash is usually computed via
skb_get_hash().
The above can cause OoO for unconnected IPv6 UDPv6 socket: in that
scenario the egress tx queue can be selected on a per packet basis
via the skb hash.
It may also fool flow-oriented schedulers to place fragments belonging
to the same datagram in different flows.

Fix the issue by copying the skb hash from the head frag into
the others at fragmentation time.

Before this commit:
perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8"
netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n &
perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1
perf script
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0

After this commit:
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0

Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit")
Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 2 ++
 net/ipv6/ip6_output.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..0e3edd25f881 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 	/* Copy the flags to each fragment. */
 	IPCB(to)->flags = IPCB(from)->flags;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a14fb4fcdf18..3168847c30d1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->dev = from->dev;
 	to->mark = from->mark;
 
+	skb_copy_hash(to, from);
+
 #ifdef CONFIG_NET_SCHED
 	to->tc_index = from->tc_index;
 #endif
-- 
cgit v1.2.3


From 72cd43ba64fc172a443410ce01645895850844c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:17 -0700
Subject: tcp: free batches of packets in tcp_prune_ofo_queue()

Juha-Matti Tilli reported that malicious peers could inject tiny
packets in out_of_order_queue, forcing very expensive calls
to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
every incoming packet. out_of_order_queue rb-tree can contain
thousands of nodes, iterating over all of them is not nice.

Before linux-4.9, we would have pruned all packets in ofo_queue
in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.

Since we plan to increase tcp_rmem[2] in the future to cope with
modern BDP, can not revert to the old behavior, without great pain.

Strategy taken in this patch is to purge ~12.5 % of the queue capacity.

Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6bade06aaf72..64e45b279431 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4942,6 +4942,7 @@ new_range:
  * 2) not add too big latencies if thousands of packets sit there.
  *    (But if application shrinks SO_RCVBUF, we could still end up
  *     freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
  *
  * Return true if queue has shrunk.
  */
@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct rb_node *node, *prev;
+	int goal;
 
 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		return false;
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+	goal = sk->sk_rcvbuf >> 3;
 	node = &tp->ooo_last_skb->rbnode;
 	do {
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
+		goal -= rb_to_skb(node)->truesize;
 		tcp_drop(sk, rb_to_skb(node));
-		sk_mem_reclaim(sk);
-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-		    !tcp_under_memory_pressure(sk))
-			break;
+		if (!prev || goal <= 0) {
+			sk_mem_reclaim(sk);
+			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+			    !tcp_under_memory_pressure(sk))
+				break;
+			goal = sk->sk_rcvbuf >> 3;
+		}
 		node = prev;
 	} while (node);
 	tp->ooo_last_skb = rb_to_skb(prev);
-- 
cgit v1.2.3


From f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:18 -0700
Subject: tcp: avoid collapses in tcp_prune_queue() if possible

Right after a TCP flow is created, receiving tiny out of order
packets allways hit the condition :

if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
	tcp_clamp_window(sk);

tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
(guarded by tcp_rmem[2])

Calling tcp_collapse_ofo_queue() in this case is not useful,
and offers a O(N^2) surface attack to malicious peers.

Better not attempt anything before full queue capacity is reached,
forcing attacker to spend lots of resource and allow us to more
easily detect the abuse.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 64e45b279431..53289911362a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk)
 	else if (tcp_under_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
-- 
cgit v1.2.3


From 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:19 -0700
Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue()

In case an attacker feeds tiny packets completely out of order,
tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
expensive copies, but not changing socket memory usage at all.

1) Do not attempt to collapse tiny skbs.
2) Add logic to exit early when too many tiny skbs are detected.

We prefer not doing aggressive collapsing (which copies packets)
for pathological flows, and revert to tcp_prune_ofo_queue() which
will be less expensive.

In the future, we might add the possibility of terminating flows
that are proven to be malicious.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53289911362a..78068b902e7b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4902,6 +4902,7 @@ end:
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
 	u32 start, end;
 
@@ -4913,6 +4914,7 @@ new_range:
 	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
+	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
 		skb = skb_rb_next(skb);
@@ -4923,11 +4925,20 @@ new_range:
 		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
-				     head, skb, start, end);
+			/* Do not attempt collapsing tiny skbs */
+			if (range_truesize != head->truesize ||
+			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+					     head, skb, start, end);
+			} else {
+				sum_tiny += range_truesize;
+				if (sum_tiny > sk->sk_rcvbuf >> 3)
+					return;
+			}
 			goto new_range;
 		}
 
+		range_truesize += skb->truesize;
 		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 			start = TCP_SKB_CB(skb)->seq;
 		if (after(TCP_SKB_CB(skb)->end_seq, end))
-- 
cgit v1.2.3


From 8541b21e781a22dce52a74fef0b9bed00404a1cd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:20 -0700
Subject: tcp: call tcp_drop() from tcp_data_queue_ofo()

In order to be able to give better diagnostics and detect
malicious traffic, we need to have better sk->sk_drops tracking.

Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78068b902e7b..b062a7692238 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4510,7 +4510,7 @@ coalesce_done:
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb);
+				tcp_drop(sk, skb);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4529,7 +4529,7 @@ coalesce_done:
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb1);
+				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
 		} else if (tcp_try_coalesce(sk, skb1,
-- 
cgit v1.2.3


From 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Jul 2018 09:28:21 -0700
Subject: tcp: add tcp_ooo_try_coalesce() helper

In case skb in out_or_order_queue is the result of
multiple skbs coalescing, we would like to get a proper gso_segs
counter tracking, so that future tcp_drop() can report an accurate
number.

I chose to not implement this tracking for skbs in receive queue,
since they are not dropped, unless socket is disconnected.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b062a7692238..3bcd30a2ba06 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+	/* In case tcp_drop() is called later, update to->gso_segs */
+	if (res) {
+		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+	}
+	return res;
+}
+
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
 	sk_drops_add(sk, skb);
@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	/* In the typical case, we are adding an skb to the end of the list.
 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 	 */
-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
-			     skb, &fragstolen)) {
+	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+				 skb, &fragstolen)) {
 coalesce_done:
 		tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
@@ -4532,8 +4549,8 @@ coalesce_done:
 				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
-		} else if (tcp_try_coalesce(sk, skb1,
-					    skb, &fragstolen)) {
+		} else if (tcp_ooo_try_coalesce(sk, skb1,
+						skb, &fragstolen)) {
 			goto coalesce_done;
 		}
 		p = &parent->rb_right;
-- 
cgit v1.2.3


From 90fd131afc565159c9e0ea742f082b337e10f8c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 23 Jul 2018 12:47:14 +0200
Subject: netfilter: nf_tables: move dumper state allocation into ->start

Shaochun Chen points out we leak dumper filter state allocations
stored in dump_control->data in case there is an error before netlink sets
cb_running (after which ->done will be called at some point).

In order to fix this, add .start functions and do the allocations
there.

->done is going to clean up, and in case error occurs before
->start invocation no cleanups need to be done anymore.

Reported-by: shaochun chen <cscnull@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 219 ++++++++++++++++++++++--------------------
 1 file changed, 115 insertions(+), 104 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d7b9748e338e..f5745e4c6513 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2271,6 +2271,39 @@ done:
 	return skb->len;
 }
 
+static int nf_tables_dump_rules_start(struct netlink_callback *cb)
+{
+	const struct nlattr * const *nla = cb->data;
+	struct nft_rule_dump_ctx *ctx = NULL;
+
+	if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
+		ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
+		if (!ctx)
+			return -ENOMEM;
+
+		if (nla[NFTA_RULE_TABLE]) {
+			ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
+							GFP_ATOMIC);
+			if (!ctx->table) {
+				kfree(ctx);
+				return -ENOMEM;
+			}
+		}
+		if (nla[NFTA_RULE_CHAIN]) {
+			ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
+						GFP_ATOMIC);
+			if (!ctx->chain) {
+				kfree(ctx->table);
+				kfree(ctx);
+				return -ENOMEM;
+			}
+		}
+	}
+
+	cb->data = ctx;
+	return 0;
+}
+
 static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 {
 	struct nft_rule_dump_ctx *ctx = cb->data;
@@ -2300,38 +2333,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
+			.start= nf_tables_dump_rules_start,
 			.dump = nf_tables_dump_rules,
 			.done = nf_tables_dump_rules_done,
 			.module = THIS_MODULE,
+			.data = (void *)nla,
 		};
 
-		if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
-			struct nft_rule_dump_ctx *ctx;
-
-			ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
-			if (!ctx)
-				return -ENOMEM;
-
-			if (nla[NFTA_RULE_TABLE]) {
-				ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
-							GFP_ATOMIC);
-				if (!ctx->table) {
-					kfree(ctx);
-					return -ENOMEM;
-				}
-			}
-			if (nla[NFTA_RULE_CHAIN]) {
-				ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
-							GFP_ATOMIC);
-				if (!ctx->chain) {
-					kfree(ctx->table);
-					kfree(ctx);
-					return -ENOMEM;
-				}
-			}
-			c.data = ctx;
-		}
-
 		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
@@ -3181,6 +3189,18 @@ done:
 	return skb->len;
 }
 
+static int nf_tables_dump_sets_start(struct netlink_callback *cb)
+{
+	struct nft_ctx *ctx_dump = NULL;
+
+	ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC);
+	if (ctx_dump == NULL)
+		return -ENOMEM;
+
+	cb->data = ctx_dump;
+	return 0;
+}
+
 static int nf_tables_dump_sets_done(struct netlink_callback *cb)
 {
 	kfree(cb->data);
@@ -3208,18 +3228,12 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
+			.start = nf_tables_dump_sets_start,
 			.dump = nf_tables_dump_sets,
 			.done = nf_tables_dump_sets_done,
+			.data = &ctx,
 			.module = THIS_MODULE,
 		};
-		struct nft_ctx *ctx_dump;
-
-		ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
-		if (ctx_dump == NULL)
-			return -ENOMEM;
-
-		*ctx_dump = ctx;
-		c.data = ctx_dump;
 
 		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
@@ -3869,6 +3883,15 @@ nla_put_failure:
 	return -ENOSPC;
 }
 
+static int nf_tables_dump_set_start(struct netlink_callback *cb)
+{
+	struct nft_set_dump_ctx *dump_ctx = cb->data;
+
+	cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC);
+
+	return cb->data ? 0 : -ENOMEM;
+}
+
 static int nf_tables_dump_set_done(struct netlink_callback *cb)
 {
 	kfree(cb->data);
@@ -4022,20 +4045,17 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
+			.start = nf_tables_dump_set_start,
 			.dump = nf_tables_dump_set,
 			.done = nf_tables_dump_set_done,
 			.module = THIS_MODULE,
 		};
-		struct nft_set_dump_ctx *dump_ctx;
-
-		dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
-		if (!dump_ctx)
-			return -ENOMEM;
-
-		dump_ctx->set = set;
-		dump_ctx->ctx = ctx;
+		struct nft_set_dump_ctx dump_ctx = {
+			.set = set,
+			.ctx = ctx,
+		};
 
-		c.data = dump_ctx;
+		c.data = &dump_ctx;
 		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
@@ -4995,38 +5015,42 @@ done:
 	return skb->len;
 }
 
-static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+static int nf_tables_dump_obj_start(struct netlink_callback *cb)
 {
-	struct nft_obj_filter *filter = cb->data;
+	const struct nlattr * const *nla = cb->data;
+	struct nft_obj_filter *filter = NULL;
 
-	if (filter) {
-		kfree(filter->table);
-		kfree(filter);
+	if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
+		filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
+		if (!filter)
+			return -ENOMEM;
+
+		if (nla[NFTA_OBJ_TABLE]) {
+			filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+			if (!filter->table) {
+				kfree(filter);
+				return -ENOMEM;
+			}
+		}
+
+		if (nla[NFTA_OBJ_TYPE])
+			filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 	}
 
+	cb->data = filter;
 	return 0;
 }
 
-static struct nft_obj_filter *
-nft_obj_filter_alloc(const struct nlattr * const nla[])
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
-	struct nft_obj_filter *filter;
-
-	filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
-	if (!filter)
-		return ERR_PTR(-ENOMEM);
+	struct nft_obj_filter *filter = cb->data;
 
-	if (nla[NFTA_OBJ_TABLE]) {
-		filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
-		if (!filter->table) {
-			kfree(filter);
-			return ERR_PTR(-ENOMEM);
-		}
+	if (filter) {
+		kfree(filter->table);
+		kfree(filter);
 	}
-	if (nla[NFTA_OBJ_TYPE])
-		filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 
-	return filter;
+	return 0;
 }
 
 /* called with rcu_read_lock held */
@@ -5047,21 +5071,13 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
+			.start = nf_tables_dump_obj_start,
 			.dump = nf_tables_dump_obj,
 			.done = nf_tables_dump_obj_done,
 			.module = THIS_MODULE,
+			.data = (void *)nla,
 		};
 
-		if (nla[NFTA_OBJ_TABLE] ||
-		    nla[NFTA_OBJ_TYPE]) {
-			struct nft_obj_filter *filter;
-
-			filter = nft_obj_filter_alloc(nla);
-			if (IS_ERR(filter))
-				return -ENOMEM;
-
-			c.data = filter;
-		}
 		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
@@ -5667,37 +5683,39 @@ done:
 	return skb->len;
 }
 
-static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
+static int nf_tables_dump_flowtable_start(struct netlink_callback *cb)
 {
-	struct nft_flowtable_filter *filter = cb->data;
+	const struct nlattr * const *nla = cb->data;
+	struct nft_flowtable_filter *filter = NULL;
 
-	if (!filter)
-		return 0;
+	if (nla[NFTA_FLOWTABLE_TABLE]) {
+		filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
+		if (!filter)
+			return -ENOMEM;
 
-	kfree(filter->table);
-	kfree(filter);
+		filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
+					   GFP_ATOMIC);
+		if (!filter->table) {
+			kfree(filter);
+			return -ENOMEM;
+		}
+	}
 
+	cb->data = filter;
 	return 0;
 }
 
-static struct nft_flowtable_filter *
-nft_flowtable_filter_alloc(const struct nlattr * const nla[])
+static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
 {
-	struct nft_flowtable_filter *filter;
+	struct nft_flowtable_filter *filter = cb->data;
 
-	filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
 	if (!filter)
-		return ERR_PTR(-ENOMEM);
+		return 0;
 
-	if (nla[NFTA_FLOWTABLE_TABLE]) {
-		filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
-					   GFP_ATOMIC);
-		if (!filter->table) {
-			kfree(filter);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	return filter;
+	kfree(filter->table);
+	kfree(filter);
+
+	return 0;
 }
 
 /* called with rcu_read_lock held */
@@ -5717,20 +5735,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
+			.start = nf_tables_dump_flowtable_start,
 			.dump = nf_tables_dump_flowtable,
 			.done = nf_tables_dump_flowtable_done,
 			.module = THIS_MODULE,
+			.data = (void *)nla,
 		};
 
-		if (nla[NFTA_FLOWTABLE_TABLE]) {
-			struct nft_flowtable_filter *filter;
-
-			filter = nft_flowtable_filter_alloc(nla);
-			if (IS_ERR(filter))
-				return -ENOMEM;
-
-			c.data = filter;
-		}
 		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
-- 
cgit v1.2.3


From 144fe2bfd236dc814eae587aea7e2af03dbdd755 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 23 Jul 2018 22:37:54 +0200
Subject: sock: fix sg page frag coalescing in sk_alloc_sg

Current sg coalescing logic in sk_alloc_sg() (latter is used by tls and
sockmap) is not quite correct in that we do fetch the previous sg entry,
however the subsequent check whether the refilled page frag from the
socket is still the same as from the last entry with prior offset and
length matching the start of the current buffer is comparing always the
first sg list entry instead of the prior one.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 9e8f65585b81..bc2d7a37297f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2277,9 +2277,9 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
 		pfrag->offset += use;
 
 		sge = sg + sg_curr - 1;
-		if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
-		    sg->offset + sg->length == orig_offset) {
-			sg->length += use;
+		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
+		    sge->offset + sge->length == orig_offset) {
+			sge->length += use;
 		} else {
 			sge = sg + sg_curr;
 			sg_unmark_end(sge);
-- 
cgit v1.2.3


From e31f6456c01c76f154e1b25cd54df97809a49edb Mon Sep 17 00:00:00 2001
From: Amar Singhal <asinghal@codeaurora.org>
Date: Fri, 20 Jul 2018 12:15:18 -0700
Subject: cfg80211: never ignore user regulatory hint

Currently user regulatory hint is ignored if all wiphys
in the system are self managed. But the hint is not ignored
if there is no wiphy in the system. This affects the global
regulatory setting. Global regulatory setting needs to be
maintained so that it can be applied to a new wiphy entering
the system. Therefore, do not ignore user regulatory setting
even if all wiphys in the system are self managed.

Signed-off-by: Amar Singhal <asinghal@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index bbe6298e4bb9..4fc66a117b7d 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2240,7 +2240,9 @@ static void wiphy_update_regulatory(struct wiphy *wiphy,
 		 * as some drivers used this to restore its orig_* reg domain.
 		 */
 		if (initiator == NL80211_REGDOM_SET_BY_CORE &&
-		    wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)
+		    wiphy->regulatory_flags & REGULATORY_CUSTOM_REG &&
+		    !(wiphy->regulatory_flags &
+		      REGULATORY_WIPHY_SELF_MANAGED))
 			reg_call_notifier(wiphy, lr);
 		return;
 	}
@@ -2787,26 +2789,6 @@ static void notify_self_managed_wiphys(struct regulatory_request *request)
 	}
 }
 
-static bool reg_only_self_managed_wiphys(void)
-{
-	struct cfg80211_registered_device *rdev;
-	struct wiphy *wiphy;
-	bool self_managed_found = false;
-
-	ASSERT_RTNL();
-
-	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
-		wiphy = &rdev->wiphy;
-		if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
-			self_managed_found = true;
-		else
-			return false;
-	}
-
-	/* make sure at least one self-managed wiphy exists */
-	return self_managed_found;
-}
-
 /*
  * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_*
  * Regulatory hints come on a first come first serve basis and we
@@ -2839,10 +2821,6 @@ static void reg_process_pending_hints(void)
 	spin_unlock(&reg_requests_lock);
 
 	notify_self_managed_wiphys(reg_request);
-	if (reg_only_self_managed_wiphys()) {
-		reg_free_request(reg_request);
-		return;
-	}
 
 	reg_process_hint(reg_request);
 
-- 
cgit v1.2.3


From 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 23 Jul 2018 19:36:48 -0400
Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull

Syzbot reported a read beyond the end of the skb head when returning
IPV6_ORIGDSTADDR:

  BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242
  CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
  Google 01/01/2011
  Call Trace:
    __dump_stack lib/dump_stack.c:77 [inline]
    dump_stack+0x185/0x1d0 lib/dump_stack.c:113
    kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125
    kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219
    kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261
    copy_to_user include/linux/uaccess.h:184 [inline]
    put_cmsg+0x5ef/0x860 net/core/scm.c:242
    ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719
    ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733
    rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521
    [..]

This logic and its ipv4 counterpart read the destination port from
the packet at skb_transport_offset(skb) + 4.

With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a
packet that stores headers exactly up to skb_transport_offset(skb) in
the head and the remainder in a frag.

Call pskb_may_pull before accessing the pointer to ensure that it lies
in skb head.

Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com
Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 7 +++++--
 net/ipv6/datagram.c    | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64c76dcf7386..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
 {
 	struct sockaddr_in sin;
 	const struct iphdr *iph = ip_hdr(skb);
-	__be16 *ports = (__be16 *)skb_transport_header(skb);
+	__be16 *ports;
+	int end;
 
-	if (skb_transport_offset(skb) + 4 > (int)skb->len)
+	end = skb_transport_offset(skb) + 4;
+	if (end > 0 && !pskb_may_pull(skb, end))
 		return;
 
 	/* All current transport protocols have the port numbers in the
 	 * first four bytes of the transport header and this function is
 	 * written with this assumption in mind.
 	 */
+	ports = (__be16 *)skb_transport_header(skb);
 
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = iph->daddr;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2ee08b6a86a4..1a1f876f8e28 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 	}
 	if (np->rxopt.bits.rxorigdstaddr) {
 		struct sockaddr_in6 sin6;
-		__be16 *ports = (__be16 *) skb_transport_header(skb);
+		__be16 *ports;
+		int end;
 
-		if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
+		end = skb_transport_offset(skb) + 4;
+		if (end <= 0 || pskb_may_pull(skb, end)) {
 			/* All current transport protocols have the port numbers in the
 			 * first four bytes of the transport header and this function is
 			 * written with this assumption in mind.
 			 */
+			ports = (__be16 *)skb_transport_header(skb);
 
 			sin6.sin6_family = AF_INET6;
 			sin6.sin6_addr = ipv6_hdr(skb)->daddr;
-- 
cgit v1.2.3